xref: /linux/net/ipv4/route.c (revision 5499b45190237ca90dd2ac86395cf464fe1f4cc7)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 
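/*
 * RT_FL_TOS() keeps only the TOS bits that matter for route lookup
 * (IPTOS_RT_MASK), together with the RTO_ONLINK flag that callers may
 * encode in the same fl4_tos field to ask for a scope-link route.
 */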
111 #define RT_FL_TOS(oldflp) \
112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113 
114 #define IP_MAX_MTU	0xFFF0
115 
116 #define RT_GC_TIMEOUT (300*HZ)
117 
118 static int ip_rt_max_size;
119 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
120 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
121 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
122 static int ip_rt_redirect_number __read_mostly	= 9;
123 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly	= HZ;
126 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
127 static int ip_rt_gc_elasticity __read_mostly	= 8;
128 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly	= 256;
131 static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
132 static int rt_chain_length_max __read_mostly	= 20;
133 
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136 
137 /*
138  *	Interface to generic destination cache.
139  */
140 
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void		 ipv4_dst_destroy(struct dst_entry *dst);
143 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
144 					 struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void		 ipv4_link_failure(struct sk_buff *skb);
147 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149 static void rt_emergency_hash_rebuild(struct net *net);
150 
151 
152 static struct dst_ops ipv4_dst_ops = {
153 	.family =		AF_INET,
154 	.protocol =		cpu_to_be16(ETH_P_IP),
155 	.gc =			rt_garbage_collect,
156 	.check =		ipv4_dst_check,
157 	.destroy =		ipv4_dst_destroy,
158 	.ifdown =		ipv4_dst_ifdown,
159 	.negative_advice =	ipv4_negative_advice,
160 	.link_failure =		ipv4_link_failure,
161 	.update_pmtu =		ip_rt_update_pmtu,
162 	.local_out =		__ip_local_out,
163 	.entries =		ATOMIC_INIT(0),
164 };
165 
166 #define ECN_OR_COST(class)	TC_PRIO_##class
167 
168 const __u8 ip_tos2prio[16] = {
169 	TC_PRIO_BESTEFFORT,
170 	ECN_OR_COST(FILLER),
171 	TC_PRIO_BESTEFFORT,
172 	ECN_OR_COST(BESTEFFORT),
173 	TC_PRIO_BULK,
174 	ECN_OR_COST(BULK),
175 	TC_PRIO_BULK,
176 	ECN_OR_COST(BULK),
177 	TC_PRIO_INTERACTIVE,
178 	ECN_OR_COST(INTERACTIVE),
179 	TC_PRIO_INTERACTIVE,
180 	ECN_OR_COST(INTERACTIVE),
181 	TC_PRIO_INTERACTIVE_BULK,
182 	ECN_OR_COST(INTERACTIVE_BULK),
183 	TC_PRIO_INTERACTIVE_BULK,
184 	ECN_OR_COST(INTERACTIVE_BULK)
185 };
186 
187 
188 /*
189  * Route cache.
190  */
191 
192 /* The locking scheme is rather straightforward:
193  *
194  * 1) Read-Copy Update protects the buckets of the central route hash.
195  * 2) Only writers remove entries, and they hold the lock
196  *    as they look at rtable reference counts.
197  * 3) Only readers acquire references to rtable entries,
198  *    they do so with atomic increments and with the
199  *    lock held.
200  */
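/* A minimal sketch of that pattern (match() here is a hypothetical
 * predicate, not a helper defined in this file):
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.dst.rt_next))
 *		if (match(rth)) {
 *			dst_use(&rth->u.dst, jiffies);
 *			break;
 *		}
 *	rcu_read_unlock_bh();
 *
 * Writers, in contrast, unlink entries only under the per-bucket
 * spinlock (rt_hash_lock_addr()) and free them with rt_free(), which
 * defers the actual release through call_rcu_bh().
 */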
201 
202 struct rt_hash_bucket {
203 	struct rtable	*chain;
204 };
205 
206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
207 	defined(CONFIG_PROVE_LOCKING)
208 /*
209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210  * The size of this table is a power of two and depends on the number of CPUs.
211  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
212  */
213 #ifdef CONFIG_LOCKDEP
214 # define RT_HASH_LOCK_SZ	256
215 #else
216 # if NR_CPUS >= 32
217 #  define RT_HASH_LOCK_SZ	4096
218 # elif NR_CPUS >= 16
219 #  define RT_HASH_LOCK_SZ	2048
220 # elif NR_CPUS >= 8
221 #  define RT_HASH_LOCK_SZ	1024
222 # elif NR_CPUS >= 4
223 #  define RT_HASH_LOCK_SZ	512
224 # else
225 #  define RT_HASH_LOCK_SZ	256
226 # endif
227 #endif
228 
229 static spinlock_t	*rt_hash_locks;
230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
231 
232 static __init void rt_hash_lock_init(void)
233 {
234 	int i;
235 
236 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
237 			GFP_KERNEL);
238 	if (!rt_hash_locks)
239 		panic("IP: failed to allocate rt_hash_locks\n");
240 
241 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
242 		spin_lock_init(&rt_hash_locks[i]);
243 }
244 #else
245 # define rt_hash_lock_addr(slot) NULL
246 
247 static inline void rt_hash_lock_init(void)
248 {
249 }
250 #endif
251 
252 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
253 static unsigned			rt_hash_mask __read_mostly;
254 static unsigned int		rt_hash_log  __read_mostly;
255 
256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257 #define RT_CACHE_STAT_INC(field) \
258 	(__raw_get_cpu_var(rt_cache_stat).field++)
259 
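/*
 * The hash mixes destination, source and interface index, seeded with
 * the per-namespace generation id, so bumping rt_genid makes existing
 * cache entries unreachable by new lookups; rt_is_expired() catches any
 * that still share a bucket.
 */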
260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 		int genid)
262 {
263 	return jhash_3words((__force u32)(__be32)(daddr),
264 			    (__force u32)(__be32)(saddr),
265 			    idx, genid)
266 		& rt_hash_mask;
267 }
268 
269 static inline int rt_genid(struct net *net)
270 {
271 	return atomic_read(&net->ipv4.rt_genid);
272 }
273 
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276 	struct seq_net_private p;
277 	int bucket;
278 	int genid;
279 };
280 
281 static struct rtable *rt_cache_get_first(struct seq_file *seq)
282 {
283 	struct rt_cache_iter_state *st = seq->private;
284 	struct rtable *r = NULL;
285 
286 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
287 		if (!rt_hash_table[st->bucket].chain)
288 			continue;
289 		rcu_read_lock_bh();
290 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
291 		while (r) {
292 			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
293 			    r->rt_genid == st->genid)
294 				return r;
295 			r = rcu_dereference_bh(r->u.dst.rt_next);
296 		}
297 		rcu_read_unlock_bh();
298 	}
299 	return r;
300 }
301 
302 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
303 					  struct rtable *r)
304 {
305 	struct rt_cache_iter_state *st = seq->private;
306 
307 	r = r->u.dst.rt_next;
308 	while (!r) {
309 		rcu_read_unlock_bh();
310 		do {
311 			if (--st->bucket < 0)
312 				return NULL;
313 		} while (!rt_hash_table[st->bucket].chain);
314 		rcu_read_lock_bh();
315 		r = rt_hash_table[st->bucket].chain;
316 	}
317 	return rcu_dereference_bh(r);
318 }
319 
320 static struct rtable *rt_cache_get_next(struct seq_file *seq,
321 					struct rtable *r)
322 {
323 	struct rt_cache_iter_state *st = seq->private;
324 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
325 		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
326 			continue;
327 		if (r->rt_genid == st->genid)
328 			break;
329 	}
330 	return r;
331 }
332 
333 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
334 {
335 	struct rtable *r = rt_cache_get_first(seq);
336 
337 	if (r)
338 		while (pos && (r = rt_cache_get_next(seq, r)))
339 			--pos;
340 	return pos ? NULL : r;
341 }
342 
343 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
344 {
345 	struct rt_cache_iter_state *st = seq->private;
346 	if (*pos)
347 		return rt_cache_get_idx(seq, *pos - 1);
348 	st->genid = rt_genid(seq_file_net(seq));
349 	return SEQ_START_TOKEN;
350 }
351 
352 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
353 {
354 	struct rtable *r;
355 
356 	if (v == SEQ_START_TOKEN)
357 		r = rt_cache_get_first(seq);
358 	else
359 		r = rt_cache_get_next(seq, v);
360 	++*pos;
361 	return r;
362 }
363 
364 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
365 {
366 	if (v && v != SEQ_START_TOKEN)
367 		rcu_read_unlock_bh();
368 }
369 
370 static int rt_cache_seq_show(struct seq_file *seq, void *v)
371 {
372 	if (v == SEQ_START_TOKEN)
373 		seq_printf(seq, "%-127s\n",
374 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
375 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
376 			   "HHUptod\tSpecDst");
377 	else {
378 		struct rtable *r = v;
379 		int len;
380 
381 		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
382 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383 			r->u.dst.dev ? r->u.dst.dev->name : "*",
384 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
385 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
386 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
387 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389 			dst_metric(&r->u.dst, RTAX_WINDOW),
390 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
392 			r->fl.fl4_tos,
393 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395 				       dev_queue_xmit) : 0,
396 			r->rt_spec_dst, &len);
397 
398 		seq_printf(seq, "%*s\n", 127 - len, "");
399 	}
400 	return 0;
401 }
402 
403 static const struct seq_operations rt_cache_seq_ops = {
404 	.start  = rt_cache_seq_start,
405 	.next   = rt_cache_seq_next,
406 	.stop   = rt_cache_seq_stop,
407 	.show   = rt_cache_seq_show,
408 };
409 
410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
411 {
412 	return seq_open_net(inode, file, &rt_cache_seq_ops,
413 			sizeof(struct rt_cache_iter_state));
414 }
415 
416 static const struct file_operations rt_cache_seq_fops = {
417 	.owner	 = THIS_MODULE,
418 	.open	 = rt_cache_seq_open,
419 	.read	 = seq_read,
420 	.llseek	 = seq_lseek,
421 	.release = seq_release_net,
422 };
423 
424 
425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
426 {
427 	int cpu;
428 
429 	if (*pos == 0)
430 		return SEQ_START_TOKEN;
431 
432 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
433 		if (!cpu_possible(cpu))
434 			continue;
435 		*pos = cpu+1;
436 		return &per_cpu(rt_cache_stat, cpu);
437 	}
438 	return NULL;
439 }
440 
441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
442 {
443 	int cpu;
444 
445 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
446 		if (!cpu_possible(cpu))
447 			continue;
448 		*pos = cpu+1;
449 		return &per_cpu(rt_cache_stat, cpu);
450 	}
451 	return NULL;
452 
453 }
454 
455 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
456 {
457 
458 }
459 
460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
461 {
462 	struct rt_cache_stat *st = v;
463 
464 	if (v == SEQ_START_TOKEN) {
465 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
466 		return 0;
467 	}
468 
469 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
470 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 		   atomic_read(&ipv4_dst_ops.entries),
472 		   st->in_hit,
473 		   st->in_slow_tot,
474 		   st->in_slow_mc,
475 		   st->in_no_route,
476 		   st->in_brd,
477 		   st->in_martian_dst,
478 		   st->in_martian_src,
479 
480 		   st->out_hit,
481 		   st->out_slow_tot,
482 		   st->out_slow_mc,
483 
484 		   st->gc_total,
485 		   st->gc_ignored,
486 		   st->gc_goal_miss,
487 		   st->gc_dst_overflow,
488 		   st->in_hlist_search,
489 		   st->out_hlist_search
490 		);
491 	return 0;
492 }
493 
494 static const struct seq_operations rt_cpu_seq_ops = {
495 	.start  = rt_cpu_seq_start,
496 	.next   = rt_cpu_seq_next,
497 	.stop   = rt_cpu_seq_stop,
498 	.show   = rt_cpu_seq_show,
499 };
500 
501 
502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
503 {
504 	return seq_open(file, &rt_cpu_seq_ops);
505 }
506 
507 static const struct file_operations rt_cpu_seq_fops = {
508 	.owner	 = THIS_MODULE,
509 	.open	 = rt_cpu_seq_open,
510 	.read	 = seq_read,
511 	.llseek	 = seq_lseek,
512 	.release = seq_release,
513 };
514 
515 #ifdef CONFIG_NET_CLS_ROUTE
516 static int rt_acct_proc_show(struct seq_file *m, void *v)
517 {
518 	struct ip_rt_acct *dst, *src;
519 	unsigned int i, j;
520 
521 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
522 	if (!dst)
523 		return -ENOMEM;
524 
525 	for_each_possible_cpu(i) {
526 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
527 		for (j = 0; j < 256; j++) {
528 			dst[j].o_bytes   += src[j].o_bytes;
529 			dst[j].o_packets += src[j].o_packets;
530 			dst[j].i_bytes   += src[j].i_bytes;
531 			dst[j].i_packets += src[j].i_packets;
532 		}
533 	}
534 
535 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
536 	kfree(dst);
537 	return 0;
538 }
539 
540 static int rt_acct_proc_open(struct inode *inode, struct file *file)
541 {
542 	return single_open(file, rt_acct_proc_show, NULL);
543 }
544 
545 static const struct file_operations rt_acct_proc_fops = {
546 	.owner		= THIS_MODULE,
547 	.open		= rt_acct_proc_open,
548 	.read		= seq_read,
549 	.llseek		= seq_lseek,
550 	.release	= single_release,
551 };
552 #endif
553 
554 static int __net_init ip_rt_do_proc_init(struct net *net)
555 {
556 	struct proc_dir_entry *pde;
557 
558 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
559 			&rt_cache_seq_fops);
560 	if (!pde)
561 		goto err1;
562 
563 	pde = proc_create("rt_cache", S_IRUGO,
564 			  net->proc_net_stat, &rt_cpu_seq_fops);
565 	if (!pde)
566 		goto err2;
567 
568 #ifdef CONFIG_NET_CLS_ROUTE
569 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
570 	if (!pde)
571 		goto err3;
572 #endif
573 	return 0;
574 
575 #ifdef CONFIG_NET_CLS_ROUTE
576 err3:
577 	remove_proc_entry("rt_cache", net->proc_net_stat);
578 #endif
579 err2:
580 	remove_proc_entry("rt_cache", net->proc_net);
581 err1:
582 	return -ENOMEM;
583 }
584 
585 static void __net_exit ip_rt_do_proc_exit(struct net *net)
586 {
587 	remove_proc_entry("rt_cache", net->proc_net_stat);
588 	remove_proc_entry("rt_cache", net->proc_net);
589 #ifdef CONFIG_NET_CLS_ROUTE
590 	remove_proc_entry("rt_acct", net->proc_net);
591 #endif
592 }
593 
594 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
595 	.init = ip_rt_do_proc_init,
596 	.exit = ip_rt_do_proc_exit,
597 };
598 
599 static int __init ip_rt_proc_init(void)
600 {
601 	return register_pernet_subsys(&ip_rt_proc_ops);
602 }
603 
604 #else
605 static inline int ip_rt_proc_init(void)
606 {
607 	return 0;
608 }
609 #endif /* CONFIG_PROC_FS */
610 
611 static inline void rt_free(struct rtable *rt)
612 {
613 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
614 }
615 
616 static inline void rt_drop(struct rtable *rt)
617 {
618 	ip_rt_put(rt);
619 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
620 }
621 
622 static inline int rt_fast_clean(struct rtable *rth)
623 {
624 	/* Kill broadcast/multicast entries very aggressively, if they
625 	   collide in the hash table with more useful entries */
626 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
627 		rth->fl.iif && rth->u.dst.rt_next;
628 }
629 
630 static inline int rt_valuable(struct rtable *rth)
631 {
632 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
633 		rth->u.dst.expires;
634 }
635 
636 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
637 {
638 	unsigned long age;
639 	int ret = 0;
640 
641 	if (atomic_read(&rth->u.dst.__refcnt))
642 		goto out;
643 
644 	ret = 1;
645 	if (rth->u.dst.expires &&
646 	    time_after_eq(jiffies, rth->u.dst.expires))
647 		goto out;
648 
649 	age = jiffies - rth->u.dst.lastuse;
650 	ret = 0;
651 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652 	    (age <= tmo2 && rt_valuable(rth)))
653 		goto out;
654 	ret = 1;
655 out:	return ret;
656 }
657 
658 /* Bits of score are:
659  * 31: very valuable
660  * 30: not quite useless
661  * 29..0: usage counter
662  */
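/* Illustrative example: an unreferenced output route last used 100
 * jiffies ago gets the complement of its age in the low 30 bits (so
 * recently used entries score higher), has bit 30 set because it is an
 * output route, and gets bit 31 only if it is redirected/notified or
 * carries an expiry.  rt_intern_hash() evicts the lowest-scoring
 * unreferenced entry once a chain grows beyond ip_rt_gc_elasticity.
 */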
663 static inline u32 rt_score(struct rtable *rt)
664 {
665 	u32 score = jiffies - rt->u.dst.lastuse;
666 
667 	score = ~score & ~(3<<30);
668 
669 	if (rt_valuable(rt))
670 		score |= (1<<31);
671 
672 	if (!rt->fl.iif ||
673 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
674 		score |= (1<<30);
675 
676 	return score;
677 }
678 
679 static inline bool rt_caching(const struct net *net)
680 {
681 	return net->ipv4.current_rt_cache_rebuild_count <=
682 		net->ipv4.sysctl_rt_cache_rebuild_count;
683 }
684 
685 static inline bool compare_hash_inputs(const struct flowi *fl1,
686 					const struct flowi *fl2)
687 {
688 	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
689 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
690 		(fl1->iif ^ fl2->iif)) == 0);
691 }
692 
693 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
694 {
695 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
696 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
697 		(fl1->mark ^ fl2->mark) |
698 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
699 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
700 		(fl1->oif ^ fl2->oif) |
701 		(fl1->iif ^ fl2->iif)) == 0;
702 }
703 
704 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
705 {
706 	return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
707 }
708 
709 static inline int rt_is_expired(struct rtable *rth)
710 {
711 	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
712 }
713 
714 /*
715  * Perform a full scan of the hash table and free all entries.
716  * Can be called by a softirq or a process.
717  * In the latter case, we want to be rescheduled if necessary.
718  */
719 static void rt_do_flush(int process_context)
720 {
721 	unsigned int i;
722 	struct rtable *rth, *next;
723 	struct rtable * tail;
724 
725 	for (i = 0; i <= rt_hash_mask; i++) {
726 		if (process_context && need_resched())
727 			cond_resched();
728 		rth = rt_hash_table[i].chain;
729 		if (!rth)
730 			continue;
731 
732 		spin_lock_bh(rt_hash_lock_addr(i));
733 #ifdef CONFIG_NET_NS
734 		{
735 		struct rtable ** prev, * p;
736 
737 		rth = rt_hash_table[i].chain;
738 
739 		/* defer releasing the head of the list until after spin_unlock */
740 		for (tail = rth; tail; tail = tail->u.dst.rt_next)
741 			if (!rt_is_expired(tail))
742 				break;
743 		if (rth != tail)
744 			rt_hash_table[i].chain = tail;
745 
746 		/* call rt_free on entries after the tail requiring flush */
747 		prev = &rt_hash_table[i].chain;
748 		for (p = *prev; p; p = next) {
749 			next = p->u.dst.rt_next;
750 			if (!rt_is_expired(p)) {
751 				prev = &p->u.dst.rt_next;
752 			} else {
753 				*prev = next;
754 				rt_free(p);
755 			}
756 		}
757 		}
758 #else
759 		rth = rt_hash_table[i].chain;
760 		rt_hash_table[i].chain = NULL;
761 		tail = NULL;
762 #endif
763 		spin_unlock_bh(rt_hash_lock_addr(i));
764 
765 		for (; rth != tail; rth = next) {
766 			next = rth->u.dst.rt_next;
767 			rt_free(rth);
768 		}
769 	}
770 }
771 
772 /*
773  * While freeing expired entries, we compute average chain length
774  * and standard deviation, using fixed-point arithmetic.
775  * This gives an estimation of rt_chain_length_max:
776  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
777  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
778  */
779 
780 #define FRACT_BITS 3
781 #define ONE (1UL << FRACT_BITS)
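/*
 * Worked example: with FRACT_BITS = 3, a chain holding 5 counted entries
 * adds 5 * ONE = 40 to "sum".  If a scan of 1024 buckets ends with
 * sum = 8192 and sum2 = 131072, then avg = 8 (1.0 in fixed point),
 * sd = int_sqrt(128 - 64) = 8, and rt_chain_length_max becomes
 * max(ip_rt_gc_elasticity, (8 + 4*8) >> 3) = max(8, 5) = 8.
 */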
782 
783 static void rt_check_expire(void)
784 {
785 	static unsigned int rover;
786 	unsigned int i = rover, goal;
787 	struct rtable *rth, *aux, **rthp;
788 	unsigned long samples = 0;
789 	unsigned long sum = 0, sum2 = 0;
790 	unsigned long delta;
791 	u64 mult;
792 
793 	delta = jiffies - expires_ljiffies;
794 	expires_ljiffies = jiffies;
795 	mult = ((u64)delta) << rt_hash_log;
796 	if (ip_rt_gc_timeout > 1)
797 		do_div(mult, ip_rt_gc_timeout);
798 	goal = (unsigned int)mult;
799 	if (goal > rt_hash_mask)
800 		goal = rt_hash_mask + 1;
801 	for (; goal > 0; goal--) {
802 		unsigned long tmo = ip_rt_gc_timeout;
803 		unsigned long length;
804 
805 		i = (i + 1) & rt_hash_mask;
806 		rthp = &rt_hash_table[i].chain;
807 
808 		if (need_resched())
809 			cond_resched();
810 
811 		samples++;
812 
813 		if (*rthp == NULL)
814 			continue;
815 		length = 0;
816 		spin_lock_bh(rt_hash_lock_addr(i));
817 		while ((rth = *rthp) != NULL) {
818 			prefetch(rth->u.dst.rt_next);
819 			if (rt_is_expired(rth)) {
820 				*rthp = rth->u.dst.rt_next;
821 				rt_free(rth);
822 				continue;
823 			}
824 			if (rth->u.dst.expires) {
825 				/* Entry is expired even if it is in use */
826 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
827 nofree:
828 					tmo >>= 1;
829 					rthp = &rth->u.dst.rt_next;
830 					/*
831 					 * We only count entries on
832 					 * a chain with equal hash inputs once
833 					 * so that entries for different QoS
834 					 * levels and other non-hash-input
835 					 * attributes don't unfairly skew
836 					 * the length computation.
837 					 */
838 					for (aux = rt_hash_table[i].chain;;) {
839 						if (aux == rth) {
840 							length += ONE;
841 							break;
842 						}
843 						if (compare_hash_inputs(&aux->fl, &rth->fl))
844 							break;
845 						aux = aux->u.dst.rt_next;
846 					}
847 					continue;
848 				}
849 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
850 				goto nofree;
851 
852 			/* Cleanup aged off entries. */
853 			*rthp = rth->u.dst.rt_next;
854 			rt_free(rth);
855 		}
856 		spin_unlock_bh(rt_hash_lock_addr(i));
857 		sum += length;
858 		sum2 += length*length;
859 	}
860 	if (samples) {
861 		unsigned long avg = sum / samples;
862 		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
863 		rt_chain_length_max = max_t(unsigned long,
864 					ip_rt_gc_elasticity,
865 					(avg + 4*sd) >> FRACT_BITS);
866 	}
867 	rover = i;
868 }
869 
870 /*
871  * rt_worker_func() is run in process context.
872  * We call rt_check_expire() to scan part of the hash table.
873  */
874 static void rt_worker_func(struct work_struct *work)
875 {
876 	rt_check_expire();
877 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
878 }
879 
880 /*
881  * Perturbation of rt_genid by a small quantity [1..256].
882  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
883  * many times (2^24) without repeating a recent rt_genid.
884  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
885  */
886 static void rt_cache_invalidate(struct net *net)
887 {
888 	unsigned char shuffle;
889 
890 	get_random_bytes(&shuffle, sizeof(shuffle));
891 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
892 }
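/*
 * Example: if rt_genid was 1000 and the random shuffle is 41, the new
 * generation is 1042.  New lookups hash with the new id, so entries
 * still tagged 1000 are no longer found and get reaped lazily through
 * rt_is_expired().
 */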
893 
894 /*
895  * delay < 0  : invalidate cache (fast : entries will be deleted later)
896  * delay >= 0 : invalidate & flush cache (can be long)
897  */
898 void rt_cache_flush(struct net *net, int delay)
899 {
900 	rt_cache_invalidate(net);
901 	if (delay >= 0)
902 		rt_do_flush(!in_softirq());
903 }
904 
905 /* Flush previously invalidated entries from the cache */
906 void rt_cache_flush_batch(void)
907 {
908 	rt_do_flush(!in_softirq());
909 }
910 
911 /*
912  * We change rt_genid and let gc do the cleanup
913  */
914 static void rt_secret_rebuild(unsigned long __net)
915 {
916 	struct net *net = (struct net *)__net;
917 	rt_cache_invalidate(net);
918 	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
919 }
920 
921 static void rt_secret_rebuild_oneshot(struct net *net)
922 {
923 	del_timer_sync(&net->ipv4.rt_secret_timer);
924 	rt_cache_invalidate(net);
925 	if (ip_rt_secret_interval) {
926 		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
927 		add_timer(&net->ipv4.rt_secret_timer);
928 	}
929 }
930 
931 static void rt_emergency_hash_rebuild(struct net *net)
932 {
933 	if (net_ratelimit()) {
934 		printk(KERN_WARNING "Route hash chain too long!\n");
935 		printk(KERN_WARNING "Adjust your secret_interval!\n");
936 	}
937 
938 	rt_secret_rebuild_oneshot(net);
939 }
940 
941 /*
942    Short description of GC goals.
943 
944    We want to build algorithm, which will keep routing cache
945    at some equilibrium point, when number of aged off entries
946    is kept approximately equal to newly generated ones.
947 
948    Current expiration strength is variable "expire".
949    We try to adjust it dynamically, so that if networking
950    is idle expires is large enough to keep enough of warm entries,
951    and when load increases it reduces to limit cache size.
952  */
953 
954 static int rt_garbage_collect(struct dst_ops *ops)
955 {
956 	static unsigned long expire = RT_GC_TIMEOUT;
957 	static unsigned long last_gc;
958 	static int rover;
959 	static int equilibrium;
960 	struct rtable *rth, **rthp;
961 	unsigned long now = jiffies;
962 	int goal;
963 
964 	/*
965 	 * Garbage collection is pretty expensive,
966 	 * do not run it too frequently.
967 	 */
968 
969 	RT_CACHE_STAT_INC(gc_total);
970 
971 	if (now - last_gc < ip_rt_gc_min_interval &&
972 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
973 		RT_CACHE_STAT_INC(gc_ignored);
974 		goto out;
975 	}
976 
977 	/* Calculate the number of entries which we want to expire now. */
978 	goal = atomic_read(&ipv4_dst_ops.entries) -
979 		(ip_rt_gc_elasticity << rt_hash_log);
980 	if (goal <= 0) {
981 		if (equilibrium < ipv4_dst_ops.gc_thresh)
982 			equilibrium = ipv4_dst_ops.gc_thresh;
983 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
984 		if (goal > 0) {
985 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
986 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
987 		}
988 	} else {
989 		/* We are in a dangerous area. Try to reduce the cache really
990 		 * aggressively.
991 		 */
992 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
993 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
994 	}
995 
996 	if (now - last_gc >= ip_rt_gc_min_interval)
997 		last_gc = now;
998 
999 	if (goal <= 0) {
1000 		equilibrium += goal;
1001 		goto work_done;
1002 	}
1003 
1004 	do {
1005 		int i, k;
1006 
1007 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1008 			unsigned long tmo = expire;
1009 
1010 			k = (k + 1) & rt_hash_mask;
1011 			rthp = &rt_hash_table[k].chain;
1012 			spin_lock_bh(rt_hash_lock_addr(k));
1013 			while ((rth = *rthp) != NULL) {
1014 				if (!rt_is_expired(rth) &&
1015 					!rt_may_expire(rth, tmo, expire)) {
1016 					tmo >>= 1;
1017 					rthp = &rth->u.dst.rt_next;
1018 					continue;
1019 				}
1020 				*rthp = rth->u.dst.rt_next;
1021 				rt_free(rth);
1022 				goal--;
1023 			}
1024 			spin_unlock_bh(rt_hash_lock_addr(k));
1025 			if (goal <= 0)
1026 				break;
1027 		}
1028 		rover = k;
1029 
1030 		if (goal <= 0)
1031 			goto work_done;
1032 
1033 		/* Goal is not achieved. We stop the process if:
1034 
1035 		   - expire has been reduced to zero (otherwise, expire is halved).
1036 		   - the table is not full.
1037 		   - we are called from interrupt context.
1038 		   - the jiffies check is just a fallback/debug loop breaker;
1039 		     we will not spin here for a long time in any case.
1040 		 */
1041 
1042 		RT_CACHE_STAT_INC(gc_goal_miss);
1043 
1044 		if (expire == 0)
1045 			break;
1046 
1047 		expire >>= 1;
1048 #if RT_CACHE_DEBUG >= 2
1049 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1050 				atomic_read(&ipv4_dst_ops.entries), goal, i);
1051 #endif
1052 
1053 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1054 			goto out;
1055 	} while (!in_softirq() && time_before_eq(jiffies, now));
1056 
1057 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1058 		goto out;
1059 	if (net_ratelimit())
1060 		printk(KERN_WARNING "dst cache overflow\n");
1061 	RT_CACHE_STAT_INC(gc_dst_overflow);
1062 	return 1;
1063 
1064 work_done:
1065 	expire += ip_rt_gc_min_interval;
1066 	if (expire > ip_rt_gc_timeout ||
1067 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1068 		expire = ip_rt_gc_timeout;
1069 #if RT_CACHE_DEBUG >= 2
1070 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1071 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
1072 #endif
1073 out:	return 0;
1074 }
1075 
1076 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1077 			  struct rtable **rp, struct sk_buff *skb)
1078 {
1079 	struct rtable	*rth, **rthp;
1080 	unsigned long	now;
1081 	struct rtable *cand, **candp;
1082 	u32 		min_score;
1083 	int		chain_length;
1084 	int attempts = !in_softirq();
1085 
1086 restart:
1087 	chain_length = 0;
1088 	min_score = ~(u32)0;
1089 	cand = NULL;
1090 	candp = NULL;
1091 	now = jiffies;
1092 
1093 	if (!rt_caching(dev_net(rt->u.dst.dev))) {
1094 		/*
1095 		 * If we're not caching, just tell the caller we
1096 		 * were successful and don't touch the route.  The
1097 		 * caller holds the sole reference to the cache entry, and
1098 		 * it will be released when the caller is done with it.
1099 		 * If we drop it here, the callers have no way to resolve routes
1100 		 * when we're not caching.  Instead, just point *rp at rt, so
1101 		 * the caller gets a single use out of the route.
1102 		 * Note that we do rt_free on this new route entry, so that
1103 		 * once its refcount hits zero, we are still able to reap it
1104 		 * (Thanks Alexey)
1105 		 * Note also that rt_free uses call_rcu.  We don't actually
1106 		 * need rcu protection here, this is just our path to get
1107 		 * on the route gc list.
1108 		 */
1109 
1110 		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1111 			int err = arp_bind_neighbour(&rt->u.dst);
1112 			if (err) {
1113 				if (net_ratelimit())
1114 					printk(KERN_WARNING
1115 					    "Neighbour table failure & not caching routes.\n");
1116 				rt_drop(rt);
1117 				return err;
1118 			}
1119 		}
1120 
1121 		rt_free(rt);
1122 		goto skip_hashing;
1123 	}
1124 
1125 	rthp = &rt_hash_table[hash].chain;
1126 
1127 	spin_lock_bh(rt_hash_lock_addr(hash));
1128 	while ((rth = *rthp) != NULL) {
1129 		if (rt_is_expired(rth)) {
1130 			*rthp = rth->u.dst.rt_next;
1131 			rt_free(rth);
1132 			continue;
1133 		}
1134 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1135 			/* Put it first */
1136 			*rthp = rth->u.dst.rt_next;
1137 			/*
1138 			 * Since lookup is lockfree, the deletion
1139 			 * must be visible to another weakly ordered CPU before
1140 			 * the insertion at the start of the hash chain.
1141 			 */
1142 			rcu_assign_pointer(rth->u.dst.rt_next,
1143 					   rt_hash_table[hash].chain);
1144 			/*
1145 			 * Since lookup is lockfree, the update writes
1146 			 * must be ordered for consistency on SMP.
1147 			 */
1148 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1149 
1150 			dst_use(&rth->u.dst, now);
1151 			spin_unlock_bh(rt_hash_lock_addr(hash));
1152 
1153 			rt_drop(rt);
1154 			if (rp)
1155 				*rp = rth;
1156 			else
1157 				skb_dst_set(skb, &rth->u.dst);
1158 			return 0;
1159 		}
1160 
1161 		if (!atomic_read(&rth->u.dst.__refcnt)) {
1162 			u32 score = rt_score(rth);
1163 
1164 			if (score <= min_score) {
1165 				cand = rth;
1166 				candp = rthp;
1167 				min_score = score;
1168 			}
1169 		}
1170 
1171 		chain_length++;
1172 
1173 		rthp = &rth->u.dst.rt_next;
1174 	}
1175 
1176 	if (cand) {
1177 		/* ip_rt_gc_elasticity used to be the average chain
1178 		 * length; when exceeded, gc becomes really aggressive.
1179 		 *
1180 		 * The second limit is less certain. At the moment it allows
1181 		 * only 2 entries per bucket. We will see.
1182 		 */
1183 		if (chain_length > ip_rt_gc_elasticity) {
1184 			*candp = cand->u.dst.rt_next;
1185 			rt_free(cand);
1186 		}
1187 	} else {
1188 		if (chain_length > rt_chain_length_max) {
1189 			struct net *net = dev_net(rt->u.dst.dev);
1190 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1191 			if (!rt_caching(dev_net(rt->u.dst.dev))) {
1192 				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1193 					rt->u.dst.dev->name, num);
1194 			}
1195 			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1196 		}
1197 	}
1198 
1199 	/* Try to bind the route to ARP only if it is an output
1200 	   route or on the unicast forwarding path.
1201 	 */
1202 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1203 		int err = arp_bind_neighbour(&rt->u.dst);
1204 		if (err) {
1205 			spin_unlock_bh(rt_hash_lock_addr(hash));
1206 
1207 			if (err != -ENOBUFS) {
1208 				rt_drop(rt);
1209 				return err;
1210 			}
1211 
1212 			/* Neighbour tables are full and nothing
1213 			   can be released. Try to shrink the route cache;
1214 			   it most likely holds some neighbour records.
1215 			 */
1216 			if (attempts-- > 0) {
1217 				int saved_elasticity = ip_rt_gc_elasticity;
1218 				int saved_int = ip_rt_gc_min_interval;
1219 				ip_rt_gc_elasticity	= 1;
1220 				ip_rt_gc_min_interval	= 0;
1221 				rt_garbage_collect(&ipv4_dst_ops);
1222 				ip_rt_gc_min_interval	= saved_int;
1223 				ip_rt_gc_elasticity	= saved_elasticity;
1224 				goto restart;
1225 			}
1226 
1227 			if (net_ratelimit())
1228 				printk(KERN_WARNING "Neighbour table overflow.\n");
1229 			rt_drop(rt);
1230 			return -ENOBUFS;
1231 		}
1232 	}
1233 
1234 	rt->u.dst.rt_next = rt_hash_table[hash].chain;
1235 
1236 #if RT_CACHE_DEBUG >= 2
1237 	if (rt->u.dst.rt_next) {
1238 		struct rtable *trt;
1239 		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1240 		       hash, &rt->rt_dst);
1241 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1242 			printk(" . %pI4", &trt->rt_dst);
1243 		printk("\n");
1244 	}
1245 #endif
1246 	/*
1247 	 * Since lookup is lockfree, we must make sure
1248 	 * previous writes to rt are committed to memory
1249 	 * before making rt visible to other CPUs.
1250 	 */
1251 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1252 
1253 	spin_unlock_bh(rt_hash_lock_addr(hash));
1254 
1255 skip_hashing:
1256 	if (rp)
1257 		*rp = rt;
1258 	else
1259 		skb_dst_set(skb, &rt->u.dst);
1260 	return 0;
1261 }
1262 
1263 void rt_bind_peer(struct rtable *rt, int create)
1264 {
1265 	static DEFINE_SPINLOCK(rt_peer_lock);
1266 	struct inet_peer *peer;
1267 
1268 	peer = inet_getpeer(rt->rt_dst, create);
1269 
1270 	spin_lock_bh(&rt_peer_lock);
1271 	if (rt->peer == NULL) {
1272 		rt->peer = peer;
1273 		peer = NULL;
1274 	}
1275 	spin_unlock_bh(&rt_peer_lock);
1276 	if (peer)
1277 		inet_putpeer(peer);
1278 }
1279 
1280 /*
1281  * Peer allocation may fail only in serious out-of-memory conditions.  However
1282  * we can still generate some output.
1283  * Random ID selection looks a bit dangerous because we have no chance of
1284  * selecting an ID that is unique over a reasonable period of time.
1285  * But a broken packet identifier may be better than no packet at all.
1286  */
1287 static void ip_select_fb_ident(struct iphdr *iph)
1288 {
1289 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1290 	static u32 ip_fallback_id;
1291 	u32 salt;
1292 
1293 	spin_lock_bh(&ip_fb_id_lock);
1294 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1295 	iph->id = htons(salt & 0xFFFF);
1296 	ip_fallback_id = salt;
1297 	spin_unlock_bh(&ip_fb_id_lock);
1298 }
1299 
1300 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1301 {
1302 	struct rtable *rt = (struct rtable *) dst;
1303 
1304 	if (rt) {
1305 		if (rt->peer == NULL)
1306 			rt_bind_peer(rt, 1);
1307 
1308 		/* If peer is attached to destination, it is never detached,
1309 		   so we need not grab a lock to dereference it.
1310 		 */
1311 		if (rt->peer) {
1312 			iph->id = htons(inet_getid(rt->peer, more));
1313 			return;
1314 		}
1315 	} else
1316 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1317 		       __builtin_return_address(0));
1318 
1319 	ip_select_fb_ident(iph);
1320 }
1321 
1322 static void rt_del(unsigned hash, struct rtable *rt)
1323 {
1324 	struct rtable **rthp, *aux;
1325 
1326 	rthp = &rt_hash_table[hash].chain;
1327 	spin_lock_bh(rt_hash_lock_addr(hash));
1328 	ip_rt_put(rt);
1329 	while ((aux = *rthp) != NULL) {
1330 		if (aux == rt || rt_is_expired(aux)) {
1331 			*rthp = aux->u.dst.rt_next;
1332 			rt_free(aux);
1333 			continue;
1334 		}
1335 		rthp = &aux->u.dst.rt_next;
1336 	}
1337 	spin_unlock_bh(rt_hash_lock_addr(hash));
1338 }
1339 
1340 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1341 		    __be32 saddr, struct net_device *dev)
1342 {
1343 	int i, k;
1344 	struct in_device *in_dev = in_dev_get(dev);
1345 	struct rtable *rth, **rthp;
1346 	__be32  skeys[2] = { saddr, 0 };
1347 	int  ikeys[2] = { dev->ifindex, 0 };
1348 	struct netevent_redirect netevent;
1349 	struct net *net;
1350 
1351 	if (!in_dev)
1352 		return;
1353 
1354 	net = dev_net(dev);
1355 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1356 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1357 	    ipv4_is_zeronet(new_gw))
1358 		goto reject_redirect;
1359 
1360 	if (!rt_caching(net))
1361 		goto reject_redirect;
1362 
1363 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1364 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1365 			goto reject_redirect;
1366 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1367 			goto reject_redirect;
1368 	} else {
1369 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1370 			goto reject_redirect;
1371 	}
1372 
1373 	for (i = 0; i < 2; i++) {
1374 		for (k = 0; k < 2; k++) {
1375 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1376 						rt_genid(net));
1377 
1378 			rthp=&rt_hash_table[hash].chain;
1379 
1380 			rcu_read_lock();
1381 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1382 				struct rtable *rt;
1383 
1384 				if (rth->fl.fl4_dst != daddr ||
1385 				    rth->fl.fl4_src != skeys[i] ||
1386 				    rth->fl.oif != ikeys[k] ||
1387 				    rth->fl.iif != 0 ||
1388 				    rt_is_expired(rth) ||
1389 				    !net_eq(dev_net(rth->u.dst.dev), net)) {
1390 					rthp = &rth->u.dst.rt_next;
1391 					continue;
1392 				}
1393 
1394 				if (rth->rt_dst != daddr ||
1395 				    rth->rt_src != saddr ||
1396 				    rth->u.dst.error ||
1397 				    rth->rt_gateway != old_gw ||
1398 				    rth->u.dst.dev != dev)
1399 					break;
1400 
1401 				dst_hold(&rth->u.dst);
1402 				rcu_read_unlock();
1403 
1404 				rt = dst_alloc(&ipv4_dst_ops);
1405 				if (rt == NULL) {
1406 					ip_rt_put(rth);
1407 					in_dev_put(in_dev);
1408 					return;
1409 				}
1410 
1411 				/* Copy all the information. */
1412 				*rt = *rth;
1413 				rt->u.dst.__use		= 1;
1414 				atomic_set(&rt->u.dst.__refcnt, 1);
1415 				rt->u.dst.child		= NULL;
1416 				if (rt->u.dst.dev)
1417 					dev_hold(rt->u.dst.dev);
1418 				if (rt->idev)
1419 					in_dev_hold(rt->idev);
1420 				rt->u.dst.obsolete	= 0;
1421 				rt->u.dst.lastuse	= jiffies;
1422 				rt->u.dst.path		= &rt->u.dst;
1423 				rt->u.dst.neighbour	= NULL;
1424 				rt->u.dst.hh		= NULL;
1425 #ifdef CONFIG_XFRM
1426 				rt->u.dst.xfrm		= NULL;
1427 #endif
1428 				rt->rt_genid		= rt_genid(net);
1429 				rt->rt_flags		|= RTCF_REDIRECTED;
1430 
1431 				/* Gateway is different ... */
1432 				rt->rt_gateway		= new_gw;
1433 
1434 				/* Redirect received -> path was valid */
1435 				dst_confirm(&rth->u.dst);
1436 
1437 				if (rt->peer)
1438 					atomic_inc(&rt->peer->refcnt);
1439 
1440 				if (arp_bind_neighbour(&rt->u.dst) ||
1441 				    !(rt->u.dst.neighbour->nud_state &
1442 					    NUD_VALID)) {
1443 					if (rt->u.dst.neighbour)
1444 						neigh_event_send(rt->u.dst.neighbour, NULL);
1445 					ip_rt_put(rth);
1446 					rt_drop(rt);
1447 					goto do_next;
1448 				}
1449 
1450 				netevent.old = &rth->u.dst;
1451 				netevent.new = &rt->u.dst;
1452 				call_netevent_notifiers(NETEVENT_REDIRECT,
1453 							&netevent);
1454 
1455 				rt_del(hash, rth);
1456 				if (!rt_intern_hash(hash, rt, &rt, NULL))
1457 					ip_rt_put(rt);
1458 				goto do_next;
1459 			}
1460 			rcu_read_unlock();
1461 		do_next:
1462 			;
1463 		}
1464 	}
1465 	in_dev_put(in_dev);
1466 	return;
1467 
1468 reject_redirect:
1469 #ifdef CONFIG_IP_ROUTE_VERBOSE
1470 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1471 		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1472 			"  Advised path = %pI4 -> %pI4\n",
1473 		       &old_gw, dev->name, &new_gw,
1474 		       &saddr, &daddr);
1475 #endif
1476 	in_dev_put(in_dev);
1477 }
1478 
1479 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1480 {
1481 	struct rtable *rt = (struct rtable *)dst;
1482 	struct dst_entry *ret = dst;
1483 
1484 	if (rt) {
1485 		if (dst->obsolete) {
1486 			ip_rt_put(rt);
1487 			ret = NULL;
1488 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1489 			   rt->u.dst.expires) {
1490 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1491 						rt->fl.oif,
1492 						rt_genid(dev_net(dst->dev)));
1493 #if RT_CACHE_DEBUG >= 1
1494 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1495 				&rt->rt_dst, rt->fl.fl4_tos);
1496 #endif
1497 			rt_del(hash, rt);
1498 			ret = NULL;
1499 		}
1500 	}
1501 	return ret;
1502 }
1503 
1504 /*
1505  * Algorithm:
1506  *	1. The first ip_rt_redirect_number redirects are sent
1507  *	   with exponential backoff, then we stop sending them altogether,
1508  *	   assuming that the host ignores our redirects.
1509  *	2. If we did not see packets requiring redirects
1510  *	   during ip_rt_redirect_silence, we assume that the host
1511  *	   forgot the redirected route and start sending redirects again.
1512  *
1513  * This algorithm is much cheaper and more intelligent than dumb load limiting
1514  * in icmp.c.
1515  *
1516  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1517  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1518  */
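/*
 * Worked example with the defaults above (ip_rt_redirect_load = HZ/50,
 * ip_rt_redirect_number = 9): the first redirect goes out immediately,
 * and each further redirect is held back for (HZ/50) << tokens jiffies
 * after the previous one, i.e. roughly 40ms, 80ms, ..., ~5.1s before
 * the ninth.  After nine redirects we stay silent until
 * ip_rt_redirect_silence ((HZ/50) << 10, about 20s) elapses without
 * another packet that would need a redirect, which resets the tokens.
 */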
1519 
1520 void ip_rt_send_redirect(struct sk_buff *skb)
1521 {
1522 	struct rtable *rt = skb_rtable(skb);
1523 	struct in_device *in_dev;
1524 	int log_martians;
1525 
1526 	rcu_read_lock();
1527 	in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1528 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1529 		rcu_read_unlock();
1530 		return;
1531 	}
1532 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1533 	rcu_read_unlock();
1534 
1535 	/* No redirected packets during ip_rt_redirect_silence;
1536 	 * reset the algorithm.
1537 	 */
1538 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1539 		rt->u.dst.rate_tokens = 0;
1540 
1541 	/* Too many ignored redirects; do not send anything,
1542 	 * just set u.dst.rate_last to the last seen redirected packet.
1543 	 */
1544 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1545 		rt->u.dst.rate_last = jiffies;
1546 		return;
1547 	}
1548 
1549 	/* Check for load limit; set rate_last to the latest sent
1550 	 * redirect.
1551 	 */
1552 	if (rt->u.dst.rate_tokens == 0 ||
1553 	    time_after(jiffies,
1554 		       (rt->u.dst.rate_last +
1555 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1556 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1557 		rt->u.dst.rate_last = jiffies;
1558 		++rt->u.dst.rate_tokens;
1559 #ifdef CONFIG_IP_ROUTE_VERBOSE
1560 		if (log_martians &&
1561 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1562 		    net_ratelimit())
1563 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1564 				&rt->rt_src, rt->rt_iif,
1565 				&rt->rt_dst, &rt->rt_gateway);
1566 #endif
1567 	}
1568 }
1569 
1570 static int ip_error(struct sk_buff *skb)
1571 {
1572 	struct rtable *rt = skb_rtable(skb);
1573 	unsigned long now;
1574 	int code;
1575 
1576 	switch (rt->u.dst.error) {
1577 		case EINVAL:
1578 		default:
1579 			goto out;
1580 		case EHOSTUNREACH:
1581 			code = ICMP_HOST_UNREACH;
1582 			break;
1583 		case ENETUNREACH:
1584 			code = ICMP_NET_UNREACH;
1585 			IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1586 					IPSTATS_MIB_INNOROUTES);
1587 			break;
1588 		case EACCES:
1589 			code = ICMP_PKT_FILTERED;
1590 			break;
1591 	}
1592 
1593 	now = jiffies;
1594 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1595 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1596 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1597 	rt->u.dst.rate_last = now;
1598 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1599 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1600 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1601 	}
1602 
1603 out:	kfree_skb(skb);
1604 	return 0;
1605 }
1606 
1607 /*
1608  *	The last two values are not from the RFC but
1609  *	are needed for AMPRnet AX.25 paths.
1610  */
1611 
1612 static const unsigned short mtu_plateau[] =
1613 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1614 
1615 static inline unsigned short guess_mtu(unsigned short old_mtu)
1616 {
1617 	int i;
1618 
1619 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1620 		if (old_mtu > mtu_plateau[i])
1621 			return mtu_plateau[i];
1622 	return 68;
1623 }
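/*
 * Example: an ICMP "fragmentation needed" quoting a 1500-byte datagram
 * but carrying no next-hop MTU makes ip_rt_frag_needed() fall back to
 * guess_mtu(1500), which picks the next lower plateau, 1492.  Likewise
 * guess_mtu(576) returns 296, and anything at or below 128 bottoms out
 * at the minimum of 68.
 */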
1624 
1625 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1626 				 unsigned short new_mtu,
1627 				 struct net_device *dev)
1628 {
1629 	int i, k;
1630 	unsigned short old_mtu = ntohs(iph->tot_len);
1631 	struct rtable *rth;
1632 	int  ikeys[2] = { dev->ifindex, 0 };
1633 	__be32  skeys[2] = { iph->saddr, 0, };
1634 	__be32  daddr = iph->daddr;
1635 	unsigned short est_mtu = 0;
1636 
1637 	for (k = 0; k < 2; k++) {
1638 		for (i = 0; i < 2; i++) {
1639 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1640 						rt_genid(net));
1641 
1642 			rcu_read_lock();
1643 			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1644 			     rth = rcu_dereference(rth->u.dst.rt_next)) {
1645 				unsigned short mtu = new_mtu;
1646 
1647 				if (rth->fl.fl4_dst != daddr ||
1648 				    rth->fl.fl4_src != skeys[i] ||
1649 				    rth->rt_dst != daddr ||
1650 				    rth->rt_src != iph->saddr ||
1651 				    rth->fl.oif != ikeys[k] ||
1652 				    rth->fl.iif != 0 ||
1653 				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1654 				    !net_eq(dev_net(rth->u.dst.dev), net) ||
1655 				    rt_is_expired(rth))
1656 					continue;
1657 
1658 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1659 
1660 					/* BSD 4.2 compatibility hack :-( */
1661 					if (mtu == 0 &&
1662 					    old_mtu >= dst_mtu(&rth->u.dst) &&
1663 					    old_mtu >= 68 + (iph->ihl << 2))
1664 						old_mtu -= iph->ihl << 2;
1665 
1666 					mtu = guess_mtu(old_mtu);
1667 				}
1668 				if (mtu <= dst_mtu(&rth->u.dst)) {
1669 					if (mtu < dst_mtu(&rth->u.dst)) {
1670 						dst_confirm(&rth->u.dst);
1671 						if (mtu < ip_rt_min_pmtu) {
1672 							mtu = ip_rt_min_pmtu;
1673 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1674 								(1 << RTAX_MTU);
1675 						}
1676 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1677 						dst_set_expires(&rth->u.dst,
1678 							ip_rt_mtu_expires);
1679 					}
1680 					est_mtu = mtu;
1681 				}
1682 			}
1683 			rcu_read_unlock();
1684 		}
1685 	}
1686 	return est_mtu ? : new_mtu;
1687 }
1688 
1689 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1690 {
1691 	if (dst_mtu(dst) > mtu && mtu >= 68 &&
1692 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1693 		if (mtu < ip_rt_min_pmtu) {
1694 			mtu = ip_rt_min_pmtu;
1695 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1696 		}
1697 		dst->metrics[RTAX_MTU-1] = mtu;
1698 		dst_set_expires(dst, ip_rt_mtu_expires);
1699 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1700 	}
1701 }
1702 
1703 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1704 {
1705 	return NULL;
1706 }
1707 
1708 static void ipv4_dst_destroy(struct dst_entry *dst)
1709 {
1710 	struct rtable *rt = (struct rtable *) dst;
1711 	struct inet_peer *peer = rt->peer;
1712 	struct in_device *idev = rt->idev;
1713 
1714 	if (peer) {
1715 		rt->peer = NULL;
1716 		inet_putpeer(peer);
1717 	}
1718 
1719 	if (idev) {
1720 		rt->idev = NULL;
1721 		in_dev_put(idev);
1722 	}
1723 }
1724 
1725 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1726 			    int how)
1727 {
1728 	struct rtable *rt = (struct rtable *) dst;
1729 	struct in_device *idev = rt->idev;
1730 	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1731 		struct in_device *loopback_idev =
1732 			in_dev_get(dev_net(dev)->loopback_dev);
1733 		if (loopback_idev) {
1734 			rt->idev = loopback_idev;
1735 			in_dev_put(idev);
1736 		}
1737 	}
1738 }
1739 
1740 static void ipv4_link_failure(struct sk_buff *skb)
1741 {
1742 	struct rtable *rt;
1743 
1744 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1745 
1746 	rt = skb_rtable(skb);
1747 	if (rt)
1748 		dst_set_expires(&rt->u.dst, 0);
1749 }
1750 
1751 static int ip_rt_bug(struct sk_buff *skb)
1752 {
1753 	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1754 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1755 		skb->dev ? skb->dev->name : "?");
1756 	kfree_skb(skb);
1757 	return 0;
1758 }
1759 
1760 /*
1761    We do not cache the source address of the outgoing interface,
1762    because it is used only by the IP RR, TS and SRR options,
1763    so it is out of the fast path.
1764 
1765    BTW remember: "addr" is allowed to be unaligned
1766    in IP options!
1767  */
1768 
1769 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1770 {
1771 	__be32 src;
1772 	struct fib_result res;
1773 
1774 	if (rt->fl.iif == 0)
1775 		src = rt->rt_src;
1776 	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1777 		src = FIB_RES_PREFSRC(res);
1778 		fib_res_put(&res);
1779 	} else
1780 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1781 					RT_SCOPE_UNIVERSE);
1782 	memcpy(addr, &src, 4);
1783 }
1784 
1785 #ifdef CONFIG_NET_CLS_ROUTE
1786 static void set_class_tag(struct rtable *rt, u32 tag)
1787 {
1788 	if (!(rt->u.dst.tclassid & 0xFFFF))
1789 		rt->u.dst.tclassid |= tag & 0xFFFF;
1790 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1791 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1792 }
1793 #endif
1794 
1795 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1796 {
1797 	struct fib_info *fi = res->fi;
1798 
1799 	if (fi) {
1800 		if (FIB_RES_GW(*res) &&
1801 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1802 			rt->rt_gateway = FIB_RES_GW(*res);
1803 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1804 		       sizeof(rt->u.dst.metrics));
1805 		if (fi->fib_mtu == 0) {
1806 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1807 			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1808 			    rt->rt_gateway != rt->rt_dst &&
1809 			    rt->u.dst.dev->mtu > 576)
1810 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1811 		}
1812 #ifdef CONFIG_NET_CLS_ROUTE
1813 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1814 #endif
1815 	} else
1816 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1817 
1818 	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1819 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1820 	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1821 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1822 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1823 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1824 				       ip_rt_min_advmss);
1825 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1826 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1827 
1828 #ifdef CONFIG_NET_CLS_ROUTE
1829 #ifdef CONFIG_IP_MULTIPLE_TABLES
1830 	set_class_tag(rt, fib_rules_tclass(res));
1831 #endif
1832 	set_class_tag(rt, itag);
1833 #endif
1834 	rt->rt_type = res->type;
1835 }
1836 
1837 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1838 				u8 tos, struct net_device *dev, int our)
1839 {
1840 	unsigned hash;
1841 	struct rtable *rth;
1842 	__be32 spec_dst;
1843 	struct in_device *in_dev = in_dev_get(dev);
1844 	u32 itag = 0;
1845 
1846 	/* Primary sanity checks. */
1847 
1848 	if (in_dev == NULL)
1849 		return -EINVAL;
1850 
1851 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1852 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1853 		goto e_inval;
1854 
1855 	if (ipv4_is_zeronet(saddr)) {
1856 		if (!ipv4_is_local_multicast(daddr))
1857 			goto e_inval;
1858 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1859 	} else if (fib_validate_source(saddr, 0, tos, 0,
1860 					dev, &spec_dst, &itag, 0) < 0)
1861 		goto e_inval;
1862 
1863 	rth = dst_alloc(&ipv4_dst_ops);
1864 	if (!rth)
1865 		goto e_nobufs;
1866 
1867 	rth->u.dst.output= ip_rt_bug;
1868 
1869 	atomic_set(&rth->u.dst.__refcnt, 1);
1870 	rth->u.dst.flags= DST_HOST;
1871 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1872 		rth->u.dst.flags |= DST_NOPOLICY;
1873 	rth->fl.fl4_dst	= daddr;
1874 	rth->rt_dst	= daddr;
1875 	rth->fl.fl4_tos	= tos;
1876 	rth->fl.mark    = skb->mark;
1877 	rth->fl.fl4_src	= saddr;
1878 	rth->rt_src	= saddr;
1879 #ifdef CONFIG_NET_CLS_ROUTE
1880 	rth->u.dst.tclassid = itag;
1881 #endif
1882 	rth->rt_iif	=
1883 	rth->fl.iif	= dev->ifindex;
1884 	rth->u.dst.dev	= init_net.loopback_dev;
1885 	dev_hold(rth->u.dst.dev);
1886 	rth->idev	= in_dev_get(rth->u.dst.dev);
1887 	rth->fl.oif	= 0;
1888 	rth->rt_gateway	= daddr;
1889 	rth->rt_spec_dst= spec_dst;
1890 	rth->rt_genid	= rt_genid(dev_net(dev));
1891 	rth->rt_flags	= RTCF_MULTICAST;
1892 	rth->rt_type	= RTN_MULTICAST;
1893 	if (our) {
1894 		rth->u.dst.input= ip_local_deliver;
1895 		rth->rt_flags |= RTCF_LOCAL;
1896 	}
1897 
1898 #ifdef CONFIG_IP_MROUTE
1899 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1900 		rth->u.dst.input = ip_mr_input;
1901 #endif
1902 	RT_CACHE_STAT_INC(in_slow_mc);
1903 
1904 	in_dev_put(in_dev);
1905 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1906 	return rt_intern_hash(hash, rth, NULL, skb);
1907 
1908 e_nobufs:
1909 	in_dev_put(in_dev);
1910 	return -ENOBUFS;
1911 
1912 e_inval:
1913 	in_dev_put(in_dev);
1914 	return -EINVAL;
1915 }
1916 
1917 
1918 static void ip_handle_martian_source(struct net_device *dev,
1919 				     struct in_device *in_dev,
1920 				     struct sk_buff *skb,
1921 				     __be32 daddr,
1922 				     __be32 saddr)
1923 {
1924 	RT_CACHE_STAT_INC(in_martian_src);
1925 #ifdef CONFIG_IP_ROUTE_VERBOSE
1926 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1927 		/*
1928 		 *	RFC 1812 recommendation: if the source is martian,
1929 		 *	the only hint is the MAC header.
1930 		 */
1931 		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1932 			&daddr, &saddr, dev->name);
1933 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1934 			int i;
1935 			const unsigned char *p = skb_mac_header(skb);
1936 			printk(KERN_WARNING "ll header: ");
1937 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1938 				printk("%02x", *p);
1939 				if (i < (dev->hard_header_len - 1))
1940 					printk(":");
1941 			}
1942 			printk("\n");
1943 		}
1944 	}
1945 #endif
1946 }
1947 
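/*
 * __mkroute_input() creates the cache entry for a forwarded packet: it
 * revalidates the source against the FIB, decides whether an ICMP redirect
 * should be sent (RTCF_DOREDIRECT), and allocates an rtable whose handlers
 * are ip_forward() on input and ip_output() on output.
 */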
1948 static int __mkroute_input(struct sk_buff *skb,
1949 			   struct fib_result *res,
1950 			   struct in_device *in_dev,
1951 			   __be32 daddr, __be32 saddr, u32 tos,
1952 			   struct rtable **result)
1953 {
1954 
1955 	struct rtable *rth;
1956 	int err;
1957 	struct in_device *out_dev;
1958 	unsigned flags = 0;
1959 	__be32 spec_dst;
1960 	u32 itag;
1961 
1962 	/* get a working reference to the output device */
1963 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1964 	if (out_dev == NULL) {
1965 		if (net_ratelimit())
1966 			printk(KERN_CRIT "Bug in ip_route_input" \
1967 			       "_slow(). Please, report\n");
1968 		return -EINVAL;
1969 	}
1970 
1971 
1972 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1973 				  in_dev->dev, &spec_dst, &itag, skb->mark);
1974 	if (err < 0) {
1975 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1976 					 saddr);
1977 
1978 		err = -EINVAL;
1979 		goto cleanup;
1980 	}
1981 
1982 	if (err)
1983 		flags |= RTCF_DIRECTSRC;
1984 
1985 	if (out_dev == in_dev && err &&
1986 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1987 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1988 		flags |= RTCF_DOREDIRECT;
1989 
1990 	if (skb->protocol != htons(ETH_P_IP)) {
1991 		/* Not IP (i.e. ARP). Do not create a route if it is
1992 		 * invalid for proxy ARP. DNAT routes are always valid.
1993 		 */
1994 		if (out_dev == in_dev) {
1995 			err = -EINVAL;
1996 			goto cleanup;
1997 		}
1998 	}
1999 
2000 
2001 	rth = dst_alloc(&ipv4_dst_ops);
2002 	if (!rth) {
2003 		err = -ENOBUFS;
2004 		goto cleanup;
2005 	}
2006 
2007 	atomic_set(&rth->u.dst.__refcnt, 1);
2008 	rth->u.dst.flags= DST_HOST;
2009 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2010 		rth->u.dst.flags |= DST_NOPOLICY;
2011 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2012 		rth->u.dst.flags |= DST_NOXFRM;
2013 	rth->fl.fl4_dst	= daddr;
2014 	rth->rt_dst	= daddr;
2015 	rth->fl.fl4_tos	= tos;
2016 	rth->fl.mark    = skb->mark;
2017 	rth->fl.fl4_src	= saddr;
2018 	rth->rt_src	= saddr;
2019 	rth->rt_gateway	= daddr;
2020 	rth->rt_iif 	=
2021 		rth->fl.iif	= in_dev->dev->ifindex;
2022 	rth->u.dst.dev	= (out_dev)->dev;
2023 	dev_hold(rth->u.dst.dev);
2024 	rth->idev	= in_dev_get(rth->u.dst.dev);
2025 	rth->fl.oif 	= 0;
2026 	rth->rt_spec_dst= spec_dst;
2027 
2028 	rth->u.dst.input = ip_forward;
2029 	rth->u.dst.output = ip_output;
2030 	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2031 
2032 	rt_set_nexthop(rth, res, itag);
2033 
2034 	rth->rt_flags = flags;
2035 
2036 	*result = rth;
2037 	err = 0;
2038  cleanup:
2039 	/* release the working reference to the output device */
2040 	in_dev_put(out_dev);
2041 	return err;
2042 }
2043 
2044 static int ip_mkroute_input(struct sk_buff *skb,
2045 			    struct fib_result *res,
2046 			    const struct flowi *fl,
2047 			    struct in_device *in_dev,
2048 			    __be32 daddr, __be32 saddr, u32 tos)
2049 {
2050 	struct rtable* rth = NULL;
2051 	int err;
2052 	unsigned hash;
2053 
2054 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2055 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2056 		fib_select_multipath(fl, res);
2057 #endif
2058 
2059 	/* create a routing cache entry */
2060 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2061 	if (err)
2062 		return err;
2063 
2064 	/* put it into the cache */
2065 	hash = rt_hash(daddr, saddr, fl->iif,
2066 		       rt_genid(dev_net(rth->u.dst.dev)));
2067 	return rt_intern_hash(hash, rth, NULL, skb);
2068 }
2069 
2070 /*
2071  *	NOTE. We drop all packets that have local source
2072  *	addresses, because every properly looped-back packet
2073  *	must have the correct destination already attached by the output routine.
2074  *
2075  *	This approach solves two big problems:
2076  *	1. Non-simplex devices are handled properly.
2077  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2078  */
2079 
2080 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2081 			       u8 tos, struct net_device *dev)
2082 {
2083 	struct fib_result res;
2084 	struct in_device *in_dev = in_dev_get(dev);
2085 	struct flowi fl = { .nl_u = { .ip4_u =
2086 				      { .daddr = daddr,
2087 					.saddr = saddr,
2088 					.tos = tos,
2089 					.scope = RT_SCOPE_UNIVERSE,
2090 				      } },
2091 			    .mark = skb->mark,
2092 			    .iif = dev->ifindex };
2093 	unsigned	flags = 0;
2094 	u32		itag = 0;
2095 	struct rtable * rth;
2096 	unsigned	hash;
2097 	__be32		spec_dst;
2098 	int		err = -EINVAL;
2099 	int		free_res = 0;
2100 	struct net    * net = dev_net(dev);
2101 
2102 	/* IP on this device is disabled. */
2103 
2104 	if (!in_dev)
2105 		goto out;
2106 
2107 	/* Check for the weirdest martians, which cannot be detected
2108 	   by fib_lookup.
2109 	 */
2110 
2111 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2112 	    ipv4_is_loopback(saddr))
2113 		goto martian_source;
2114 
2115 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2116 		goto brd_input;
2117 
2118 	/* Accept zero addresses only for limited broadcast;
2119 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2120 	 */
2121 	if (ipv4_is_zeronet(saddr))
2122 		goto martian_source;
2123 
2124 	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2125 	    ipv4_is_loopback(daddr))
2126 		goto martian_destination;
2127 
2128 	/*
2129 	 *	Now we are ready to route the packet.
2130 	 */
2131 	if ((err = fib_lookup(net, &fl, &res)) != 0) {
2132 		if (!IN_DEV_FORWARD(in_dev))
2133 			goto e_hostunreach;
2134 		goto no_route;
2135 	}
2136 	free_res = 1;
2137 
2138 	RT_CACHE_STAT_INC(in_slow_tot);
2139 
2140 	if (res.type == RTN_BROADCAST)
2141 		goto brd_input;
2142 
2143 	if (res.type == RTN_LOCAL) {
2144 		int result;
2145 		result = fib_validate_source(saddr, daddr, tos,
2146 					     net->loopback_dev->ifindex,
2147 					     dev, &spec_dst, &itag, skb->mark);
2148 		if (result < 0)
2149 			goto martian_source;
2150 		if (result)
2151 			flags |= RTCF_DIRECTSRC;
2152 		spec_dst = daddr;
2153 		goto local_input;
2154 	}
2155 
2156 	if (!IN_DEV_FORWARD(in_dev))
2157 		goto e_hostunreach;
2158 	if (res.type != RTN_UNICAST)
2159 		goto martian_destination;
2160 
2161 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2162 done:
2163 	in_dev_put(in_dev);
2164 	if (free_res)
2165 		fib_res_put(&res);
2166 out:	return err;
2167 
2168 brd_input:
2169 	if (skb->protocol != htons(ETH_P_IP))
2170 		goto e_inval;
2171 
2172 	if (ipv4_is_zeronet(saddr))
2173 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2174 	else {
2175 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2176 					  &itag, skb->mark);
2177 		if (err < 0)
2178 			goto martian_source;
2179 		if (err)
2180 			flags |= RTCF_DIRECTSRC;
2181 	}
2182 	flags |= RTCF_BROADCAST;
2183 	res.type = RTN_BROADCAST;
2184 	RT_CACHE_STAT_INC(in_brd);
2185 
2186 local_input:
2187 	rth = dst_alloc(&ipv4_dst_ops);
2188 	if (!rth)
2189 		goto e_nobufs;
2190 
2191 	rth->u.dst.output= ip_rt_bug;
2192 	rth->rt_genid = rt_genid(net);
2193 
2194 	atomic_set(&rth->u.dst.__refcnt, 1);
2195 	rth->u.dst.flags= DST_HOST;
2196 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2197 		rth->u.dst.flags |= DST_NOPOLICY;
2198 	rth->fl.fl4_dst	= daddr;
2199 	rth->rt_dst	= daddr;
2200 	rth->fl.fl4_tos	= tos;
2201 	rth->fl.mark    = skb->mark;
2202 	rth->fl.fl4_src	= saddr;
2203 	rth->rt_src	= saddr;
2204 #ifdef CONFIG_NET_CLS_ROUTE
2205 	rth->u.dst.tclassid = itag;
2206 #endif
2207 	rth->rt_iif	=
2208 	rth->fl.iif	= dev->ifindex;
2209 	rth->u.dst.dev	= net->loopback_dev;
2210 	dev_hold(rth->u.dst.dev);
2211 	rth->idev	= in_dev_get(rth->u.dst.dev);
2212 	rth->rt_gateway	= daddr;
2213 	rth->rt_spec_dst= spec_dst;
2214 	rth->u.dst.input= ip_local_deliver;
2215 	rth->rt_flags 	= flags|RTCF_LOCAL;
2216 	if (res.type == RTN_UNREACHABLE) {
2217 		rth->u.dst.input= ip_error;
2218 		rth->u.dst.error= -err;
2219 		rth->rt_flags 	&= ~RTCF_LOCAL;
2220 	}
2221 	rth->rt_type	= res.type;
2222 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2223 	err = rt_intern_hash(hash, rth, NULL, skb);
2224 	goto done;
2225 
2226 no_route:
2227 	RT_CACHE_STAT_INC(in_no_route);
2228 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2229 	res.type = RTN_UNREACHABLE;
2230 	if (err == -ESRCH)
2231 		err = -ENETUNREACH;
2232 	goto local_input;
2233 
2234 	/*
2235 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2236 	 */
2237 martian_destination:
2238 	RT_CACHE_STAT_INC(in_martian_dst);
2239 #ifdef CONFIG_IP_ROUTE_VERBOSE
2240 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2241 		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2242 			&daddr, &saddr, dev->name);
2243 #endif
2244 
2245 e_hostunreach:
2246 	err = -EHOSTUNREACH;
2247 	goto done;
2248 
2249 e_inval:
2250 	err = -EINVAL;
2251 	goto done;
2252 
2253 e_nobufs:
2254 	err = -ENOBUFS;
2255 	goto done;
2256 
2257 martian_source:
2258 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2259 	goto e_inval;
2260 }
2261 
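/*
 * ip_route_input() is the fast path for incoming packets: it hashes on
 * (daddr, saddr, iif, genid) and walks the chain matching tos, mark and
 * namespace.  On a miss, multicast destinations are checked against the
 * local group list and handed to ip_route_input_mc(); everything else
 * falls back to ip_route_input_slow().
 */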
2262 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2263 		   u8 tos, struct net_device *dev)
2264 {
2265 	struct rtable * rth;
2266 	unsigned	hash;
2267 	int iif = dev->ifindex;
2268 	struct net *net;
2269 
2270 	net = dev_net(dev);
2271 
2272 	if (!rt_caching(net))
2273 		goto skip_cache;
2274 
2275 	tos &= IPTOS_RT_MASK;
2276 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2277 
2278 	rcu_read_lock();
2279 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2280 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
2281 		if (((rth->fl.fl4_dst ^ daddr) |
2282 		     (rth->fl.fl4_src ^ saddr) |
2283 		     (rth->fl.iif ^ iif) |
2284 		     rth->fl.oif |
2285 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
2286 		    rth->fl.mark == skb->mark &&
2287 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2288 		    !rt_is_expired(rth)) {
2289 			dst_use(&rth->u.dst, jiffies);
2290 			RT_CACHE_STAT_INC(in_hit);
2291 			rcu_read_unlock();
2292 			skb_dst_set(skb, &rth->u.dst);
2293 			return 0;
2294 		}
2295 		RT_CACHE_STAT_INC(in_hlist_search);
2296 	}
2297 	rcu_read_unlock();
2298 
2299 skip_cache:
2300 	/* Multicast recognition logic has moved from the route cache to here.
2301 	   The problem was that too many Ethernet cards have broken/missing
2302 	   hardware multicast filters :-( As a result, a host on a multicast
2303 	   network acquires a lot of useless route cache entries, e.g. for
2304 	   SDR messages from all over the world. Now we try to get rid of them.
2305 	   Really, provided the software IP multicast filter is organized
2306 	   reasonably (at least, hashed), it does not result in a slowdown
2307 	   compared with route cache reject entries.
2308 	   Note that multicast routers are not affected, because a
2309 	   route cache entry is created for them eventually.
2310 	 */
2311 	if (ipv4_is_multicast(daddr)) {
2312 		struct in_device *in_dev;
2313 
2314 		rcu_read_lock();
2315 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2316 			int our = ip_check_mc(in_dev, daddr, saddr,
2317 				ip_hdr(skb)->protocol);
2318 			if (our
2319 #ifdef CONFIG_IP_MROUTE
2320 				||
2321 			    (!ipv4_is_local_multicast(daddr) &&
2322 			     IN_DEV_MFORWARD(in_dev))
2323 #endif
2324 			   ) {
2325 				rcu_read_unlock();
2326 				return ip_route_input_mc(skb, daddr, saddr,
2327 							 tos, dev, our);
2328 			}
2329 		}
2330 		rcu_read_unlock();
2331 		return -EINVAL;
2332 	}
2333 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2334 }
2335 
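/*
 * __mkroute_output() allocates and fills the cache entry for an output route
 * resolved by ip_route_output_slow(): it classifies broadcast and multicast
 * destinations, sets the RTCF_* flags, wires up ip_output() (or
 * ip_mc_output()/ip_local_deliver() where appropriate) and copies the
 * nexthop data via rt_set_nexthop().
 */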
2336 static int __mkroute_output(struct rtable **result,
2337 			    struct fib_result *res,
2338 			    const struct flowi *fl,
2339 			    const struct flowi *oldflp,
2340 			    struct net_device *dev_out,
2341 			    unsigned flags)
2342 {
2343 	struct rtable *rth;
2344 	struct in_device *in_dev;
2345 	u32 tos = RT_FL_TOS(oldflp);
2346 	int err = 0;
2347 
2348 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2349 		return -EINVAL;
2350 
2351 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2352 		res->type = RTN_BROADCAST;
2353 	else if (ipv4_is_multicast(fl->fl4_dst))
2354 		res->type = RTN_MULTICAST;
2355 	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2356 		return -EINVAL;
2357 
2358 	if (dev_out->flags & IFF_LOOPBACK)
2359 		flags |= RTCF_LOCAL;
2360 
2361 	/* get a working reference to the inet device */
2362 	in_dev = in_dev_get(dev_out);
2363 	if (!in_dev)
2364 		return -EINVAL;
2365 
2366 	if (res->type == RTN_BROADCAST) {
2367 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2368 		if (res->fi) {
2369 			fib_info_put(res->fi);
2370 			res->fi = NULL;
2371 		}
2372 	} else if (res->type == RTN_MULTICAST) {
2373 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2374 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2375 				 oldflp->proto))
2376 			flags &= ~RTCF_LOCAL;
2377 		/* If a multicast route does not exist, use the
2378 		   default one, but do not gateway in this case.
2379 		   Yes, it is a hack.
2380 		 */
2381 		if (res->fi && res->prefixlen < 4) {
2382 			fib_info_put(res->fi);
2383 			res->fi = NULL;
2384 		}
2385 	}
2386 
2387 
2388 	rth = dst_alloc(&ipv4_dst_ops);
2389 	if (!rth) {
2390 		err = -ENOBUFS;
2391 		goto cleanup;
2392 	}
2393 
2394 	atomic_set(&rth->u.dst.__refcnt, 1);
2395 	rth->u.dst.flags= DST_HOST;
2396 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2397 		rth->u.dst.flags |= DST_NOXFRM;
2398 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2399 		rth->u.dst.flags |= DST_NOPOLICY;
2400 
2401 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2402 	rth->fl.fl4_tos	= tos;
2403 	rth->fl.fl4_src	= oldflp->fl4_src;
2404 	rth->fl.oif	= oldflp->oif;
2405 	rth->fl.mark    = oldflp->mark;
2406 	rth->rt_dst	= fl->fl4_dst;
2407 	rth->rt_src	= fl->fl4_src;
2408 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2409 	/* get references to the devices that are to be held by the routing
2410 	   cache entry */
2411 	rth->u.dst.dev	= dev_out;
2412 	dev_hold(dev_out);
2413 	rth->idev	= in_dev_get(dev_out);
2414 	rth->rt_gateway = fl->fl4_dst;
2415 	rth->rt_spec_dst= fl->fl4_src;
2416 
2417 	rth->u.dst.output=ip_output;
2418 	rth->rt_genid = rt_genid(dev_net(dev_out));
2419 
2420 	RT_CACHE_STAT_INC(out_slow_tot);
2421 
2422 	if (flags & RTCF_LOCAL) {
2423 		rth->u.dst.input = ip_local_deliver;
2424 		rth->rt_spec_dst = fl->fl4_dst;
2425 	}
2426 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2427 		rth->rt_spec_dst = fl->fl4_src;
2428 		if (flags & RTCF_LOCAL &&
2429 		    !(dev_out->flags & IFF_LOOPBACK)) {
2430 			rth->u.dst.output = ip_mc_output;
2431 			RT_CACHE_STAT_INC(out_slow_mc);
2432 		}
2433 #ifdef CONFIG_IP_MROUTE
2434 		if (res->type == RTN_MULTICAST) {
2435 			if (IN_DEV_MFORWARD(in_dev) &&
2436 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2437 				rth->u.dst.input = ip_mr_input;
2438 				rth->u.dst.output = ip_mc_output;
2439 			}
2440 		}
2441 #endif
2442 	}
2443 
2444 	rt_set_nexthop(rth, res, 0);
2445 
2446 	rth->rt_flags = flags;
2447 
2448 	*result = rth;
2449  cleanup:
2450 	/* release the working reference to the inet device */
2451 	in_dev_put(in_dev);
2452 
2453 	return err;
2454 }
2455 
2456 static int ip_mkroute_output(struct rtable **rp,
2457 			     struct fib_result *res,
2458 			     const struct flowi *fl,
2459 			     const struct flowi *oldflp,
2460 			     struct net_device *dev_out,
2461 			     unsigned flags)
2462 {
2463 	struct rtable *rth = NULL;
2464 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2465 	unsigned hash;
2466 	if (err == 0) {
2467 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2468 			       rt_genid(dev_net(dev_out)));
2469 		err = rt_intern_hash(hash, rth, rp, NULL);
2470 	}
2471 
2472 	return err;
2473 }
2474 
2475 /*
2476  * Major route resolver routine.
2477  */
2478 
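/*
 * ip_route_output_slow() sanity-checks the flow key, picks the output device
 * (from the requested oif, the source address, or the FIB lookup), fills in
 * any missing source address, and then hands off to ip_mkroute_output() to
 * build and hash the cache entry.
 */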
2479 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2480 				const struct flowi *oldflp)
2481 {
2482 	u32 tos	= RT_FL_TOS(oldflp);
2483 	struct flowi fl = { .nl_u = { .ip4_u =
2484 				      { .daddr = oldflp->fl4_dst,
2485 					.saddr = oldflp->fl4_src,
2486 					.tos = tos & IPTOS_RT_MASK,
2487 					.scope = ((tos & RTO_ONLINK) ?
2488 						  RT_SCOPE_LINK :
2489 						  RT_SCOPE_UNIVERSE),
2490 				      } },
2491 			    .mark = oldflp->mark,
2492 			    .iif = net->loopback_dev->ifindex,
2493 			    .oif = oldflp->oif };
2494 	struct fib_result res;
2495 	unsigned flags = 0;
2496 	struct net_device *dev_out = NULL;
2497 	int free_res = 0;
2498 	int err;
2499 
2500 
2501 	res.fi		= NULL;
2502 #ifdef CONFIG_IP_MULTIPLE_TABLES
2503 	res.r		= NULL;
2504 #endif
2505 
2506 	if (oldflp->fl4_src) {
2507 		err = -EINVAL;
2508 		if (ipv4_is_multicast(oldflp->fl4_src) ||
2509 		    ipv4_is_lbcast(oldflp->fl4_src) ||
2510 		    ipv4_is_zeronet(oldflp->fl4_src))
2511 			goto out;
2512 
2513 		/* I removed the check for oif == dev_out->oif here.
2514 		   It was wrong for two reasons:
2515 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2516 		      is assigned to multiple interfaces.
2517 		   2. Moreover, we are allowed to send packets with the saddr
2518 		      of another iface. --ANK
2519 		 */
2520 
2521 		if (oldflp->oif == 0 &&
2522 		    (ipv4_is_multicast(oldflp->fl4_dst) ||
2523 		     oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2524 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2525 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2526 			if (dev_out == NULL)
2527 				goto out;
2528 
2529 			/* Special hack: the user can direct multicasts
2530 			   and limited broadcast via the necessary interface
2531 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2532 			   This hack is not just for fun, it allows
2533 			   vic, vat and friends to work.
2534 			   They bind a socket to loopback, set the ttl to zero
2535 			   and expect that it will work.
2536 			   From the viewpoint of the routing cache they are broken,
2537 			   because we are not allowed to build a multicast path
2538 			   with a loopback source addr (look, the routing cache
2539 			   cannot know that the ttl is zero, so the packet
2540 			   will not leave this host and the route is valid).
2541 			   Luckily, this hack is a good workaround.
2542 			 */
2543 
2544 			fl.oif = dev_out->ifindex;
2545 			goto make_route;
2546 		}
2547 
2548 		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2549 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2550 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2551 			if (dev_out == NULL)
2552 				goto out;
2553 			dev_put(dev_out);
2554 			dev_out = NULL;
2555 		}
2556 	}
2557 
2558 
2559 	if (oldflp->oif) {
2560 		dev_out = dev_get_by_index(net, oldflp->oif);
2561 		err = -ENODEV;
2562 		if (dev_out == NULL)
2563 			goto out;
2564 
2565 		/* RACE: Check return value of inet_select_addr instead. */
2566 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2567 			dev_put(dev_out);
2568 			goto out;	/* Wrong error code */
2569 		}
2570 
2571 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2572 		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2573 			if (!fl.fl4_src)
2574 				fl.fl4_src = inet_select_addr(dev_out, 0,
2575 							      RT_SCOPE_LINK);
2576 			goto make_route;
2577 		}
2578 		if (!fl.fl4_src) {
2579 			if (ipv4_is_multicast(oldflp->fl4_dst))
2580 				fl.fl4_src = inet_select_addr(dev_out, 0,
2581 							      fl.fl4_scope);
2582 			else if (!oldflp->fl4_dst)
2583 				fl.fl4_src = inet_select_addr(dev_out, 0,
2584 							      RT_SCOPE_HOST);
2585 		}
2586 	}
2587 
2588 	if (!fl.fl4_dst) {
2589 		fl.fl4_dst = fl.fl4_src;
2590 		if (!fl.fl4_dst)
2591 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2592 		if (dev_out)
2593 			dev_put(dev_out);
2594 		dev_out = net->loopback_dev;
2595 		dev_hold(dev_out);
2596 		fl.oif = net->loopback_dev->ifindex;
2597 		res.type = RTN_LOCAL;
2598 		flags |= RTCF_LOCAL;
2599 		goto make_route;
2600 	}
2601 
2602 	if (fib_lookup(net, &fl, &res)) {
2603 		res.fi = NULL;
2604 		if (oldflp->oif) {
2605 			/* Apparently, the routing tables are wrong. Assume
2606 			   that the destination is on-link.
2607 
2608 			   WHY? DW.
2609 			   Because we are allowed to send to an iface
2610 			   even if it has NO routes and NO assigned
2611 			   addresses. When oif is specified, the routing
2612 			   tables are looked up with only one purpose:
2613 			   to catch whether the destination is gatewayed rather
2614 			   than direct. Moreover, if MSG_DONTROUTE is set,
2615 			   we send the packet, ignoring both the routing tables
2616 			   and the ifaddr state. --ANK
2617 
2618 
2619 			   We could do this even if oif is unknown
2620 			   (as IPv6 likely does), but we do not.
2621 			 */
2622 
2623 			if (fl.fl4_src == 0)
2624 				fl.fl4_src = inet_select_addr(dev_out, 0,
2625 							      RT_SCOPE_LINK);
2626 			res.type = RTN_UNICAST;
2627 			goto make_route;
2628 		}
2629 		if (dev_out)
2630 			dev_put(dev_out);
2631 		err = -ENETUNREACH;
2632 		goto out;
2633 	}
2634 	free_res = 1;
2635 
2636 	if (res.type == RTN_LOCAL) {
2637 		if (!fl.fl4_src)
2638 			fl.fl4_src = fl.fl4_dst;
2639 		if (dev_out)
2640 			dev_put(dev_out);
2641 		dev_out = net->loopback_dev;
2642 		dev_hold(dev_out);
2643 		fl.oif = dev_out->ifindex;
2644 		if (res.fi)
2645 			fib_info_put(res.fi);
2646 		res.fi = NULL;
2647 		flags |= RTCF_LOCAL;
2648 		goto make_route;
2649 	}
2650 
2651 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2652 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2653 		fib_select_multipath(&fl, &res);
2654 	else
2655 #endif
2656 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2657 		fib_select_default(net, &fl, &res);
2658 
2659 	if (!fl.fl4_src)
2660 		fl.fl4_src = FIB_RES_PREFSRC(res);
2661 
2662 	if (dev_out)
2663 		dev_put(dev_out);
2664 	dev_out = FIB_RES_DEV(res);
2665 	dev_hold(dev_out);
2666 	fl.oif = dev_out->ifindex;
2667 
2668 
2669 make_route:
2670 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2671 
2672 
2673 	if (free_res)
2674 		fib_res_put(&res);
2675 	if (dev_out)
2676 		dev_put(dev_out);
2677 out:	return err;
2678 }
2679 
2680 int __ip_route_output_key(struct net *net, struct rtable **rp,
2681 			  const struct flowi *flp)
2682 {
2683 	unsigned hash;
2684 	struct rtable *rth;
2685 
2686 	if (!rt_caching(net))
2687 		goto slow_output;
2688 
2689 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2690 
2691 	rcu_read_lock_bh();
2692 	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2693 		rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
2694 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2695 		    rth->fl.fl4_src == flp->fl4_src &&
2696 		    rth->fl.iif == 0 &&
2697 		    rth->fl.oif == flp->oif &&
2698 		    rth->fl.mark == flp->mark &&
2699 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2700 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2701 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2702 		    !rt_is_expired(rth)) {
2703 			dst_use(&rth->u.dst, jiffies);
2704 			RT_CACHE_STAT_INC(out_hit);
2705 			rcu_read_unlock_bh();
2706 			*rp = rth;
2707 			return 0;
2708 		}
2709 		RT_CACHE_STAT_INC(out_hlist_search);
2710 	}
2711 	rcu_read_unlock_bh();
2712 
2713 slow_output:
2714 	return ip_route_output_slow(net, rp, flp);
2715 }
2716 
2717 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2718 
2719 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2720 {
2721 }
2722 
2723 static struct dst_ops ipv4_dst_blackhole_ops = {
2724 	.family			=	AF_INET,
2725 	.protocol		=	cpu_to_be16(ETH_P_IP),
2726 	.destroy		=	ipv4_dst_destroy,
2727 	.check			=	ipv4_dst_check,
2728 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2729 	.entries		=	ATOMIC_INIT(0),
2730 };
2731 
2732 
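/*
 * ipv4_dst_blackhole() replaces *rp with a copy of the route whose input and
 * output handlers simply discard packets (dst_discard).
 * ip_route_output_flow() uses it when __xfrm_lookup() returns -EREMOTE, so
 * the caller still gets a usable dst entry instead of an error.
 */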
2733 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2734 {
2735 	struct rtable *ort = *rp;
2736 	struct rtable *rt = (struct rtable *)
2737 		dst_alloc(&ipv4_dst_blackhole_ops);
2738 
2739 	if (rt) {
2740 		struct dst_entry *new = &rt->u.dst;
2741 
2742 		atomic_set(&new->__refcnt, 1);
2743 		new->__use = 1;
2744 		new->input = dst_discard;
2745 		new->output = dst_discard;
2746 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2747 
2748 		new->dev = ort->u.dst.dev;
2749 		if (new->dev)
2750 			dev_hold(new->dev);
2751 
2752 		rt->fl = ort->fl;
2753 
2754 		rt->idev = ort->idev;
2755 		if (rt->idev)
2756 			in_dev_hold(rt->idev);
2757 		rt->rt_genid = rt_genid(net);
2758 		rt->rt_flags = ort->rt_flags;
2759 		rt->rt_type = ort->rt_type;
2760 		rt->rt_dst = ort->rt_dst;
2761 		rt->rt_src = ort->rt_src;
2762 		rt->rt_iif = ort->rt_iif;
2763 		rt->rt_gateway = ort->rt_gateway;
2764 		rt->rt_spec_dst = ort->rt_spec_dst;
2765 		rt->peer = ort->peer;
2766 		if (rt->peer)
2767 			atomic_inc(&rt->peer->refcnt);
2768 
2769 		dst_free(new);
2770 	}
2771 
2772 	dst_release(&(*rp)->u.dst);
2773 	*rp = rt;
2774 	return (rt ? 0 : -ENOMEM);
2775 }
2776 
2777 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2778 			 struct sock *sk, int flags)
2779 {
2780 	int err;
2781 
2782 	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2783 		return err;
2784 
2785 	if (flp->proto) {
2786 		if (!flp->fl4_src)
2787 			flp->fl4_src = (*rp)->rt_src;
2788 		if (!flp->fl4_dst)
2789 			flp->fl4_dst = (*rp)->rt_dst;
2790 		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2791 				    flags ? XFRM_LOOKUP_WAIT : 0);
2792 		if (err == -EREMOTE)
2793 			err = ipv4_dst_blackhole(net, rp, flp);
2794 
2795 		return err;
2796 	}
2797 
2798 	return 0;
2799 }
2800 
2801 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2802 
2803 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2804 {
2805 	return ip_route_output_flow(net, rp, flp, NULL, 0);
2806 }
2807 
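/*
 * rt_fill_info() dumps one cached route as an RTM_NEWROUTE netlink message:
 * the rtmsg header plus RTA_DST/RTA_SRC/RTA_OIF/RTA_GATEWAY/RTA_PREFSRC
 * attributes, the metrics, and the cache info (expiry, error, peer id and
 * timestamps).  Multicast input routes are resolved through ipmr_get_route()
 * when multicast forwarding is enabled.
 */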
2808 static int rt_fill_info(struct net *net,
2809 			struct sk_buff *skb, u32 pid, u32 seq, int event,
2810 			int nowait, unsigned int flags)
2811 {
2812 	struct rtable *rt = skb_rtable(skb);
2813 	struct rtmsg *r;
2814 	struct nlmsghdr *nlh;
2815 	long expires;
2816 	u32 id = 0, ts = 0, tsage = 0, error;
2817 
2818 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2819 	if (nlh == NULL)
2820 		return -EMSGSIZE;
2821 
2822 	r = nlmsg_data(nlh);
2823 	r->rtm_family	 = AF_INET;
2824 	r->rtm_dst_len	= 32;
2825 	r->rtm_src_len	= 0;
2826 	r->rtm_tos	= rt->fl.fl4_tos;
2827 	r->rtm_table	= RT_TABLE_MAIN;
2828 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2829 	r->rtm_type	= rt->rt_type;
2830 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2831 	r->rtm_protocol = RTPROT_UNSPEC;
2832 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2833 	if (rt->rt_flags & RTCF_NOTIFY)
2834 		r->rtm_flags |= RTM_F_NOTIFY;
2835 
2836 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2837 
2838 	if (rt->fl.fl4_src) {
2839 		r->rtm_src_len = 32;
2840 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2841 	}
2842 	if (rt->u.dst.dev)
2843 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2844 #ifdef CONFIG_NET_CLS_ROUTE
2845 	if (rt->u.dst.tclassid)
2846 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2847 #endif
2848 	if (rt->fl.iif)
2849 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2850 	else if (rt->rt_src != rt->fl.fl4_src)
2851 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2852 
2853 	if (rt->rt_dst != rt->rt_gateway)
2854 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2855 
2856 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2857 		goto nla_put_failure;
2858 
2859 	error = rt->u.dst.error;
2860 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2861 	if (rt->peer) {
2862 		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2863 		if (rt->peer->tcp_ts_stamp) {
2864 			ts = rt->peer->tcp_ts;
2865 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2866 		}
2867 	}
2868 
2869 	if (rt->fl.iif) {
2870 #ifdef CONFIG_IP_MROUTE
2871 		__be32 dst = rt->rt_dst;
2872 
2873 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2874 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2875 			int err = ipmr_get_route(net, skb, r, nowait);
2876 			if (err <= 0) {
2877 				if (!nowait) {
2878 					if (err == 0)
2879 						return 0;
2880 					goto nla_put_failure;
2881 				} else {
2882 					if (err == -EMSGSIZE)
2883 						goto nla_put_failure;
2884 					error = err;
2885 				}
2886 			}
2887 		} else
2888 #endif
2889 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2890 	}
2891 
2892 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2893 			       expires, error) < 0)
2894 		goto nla_put_failure;
2895 
2896 	return nlmsg_end(skb, nlh);
2897 
2898 nla_put_failure:
2899 	nlmsg_cancel(skb, nlh);
2900 	return -EMSGSIZE;
2901 }
2902 
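/*
 * inet_rtm_getroute() services RTM_GETROUTE requests: it builds a dummy skb,
 * resolves the route with ip_route_input() when RTA_IIF is given or
 * ip_route_output_key() otherwise, and sends the result back to the
 * requester via rt_fill_info() and rtnl_unicast().
 */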
2903 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2904 {
2905 	struct net *net = sock_net(in_skb->sk);
2906 	struct rtmsg *rtm;
2907 	struct nlattr *tb[RTA_MAX+1];
2908 	struct rtable *rt = NULL;
2909 	__be32 dst = 0;
2910 	__be32 src = 0;
2911 	u32 iif;
2912 	int err;
2913 	struct sk_buff *skb;
2914 
2915 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2916 	if (err < 0)
2917 		goto errout;
2918 
2919 	rtm = nlmsg_data(nlh);
2920 
2921 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2922 	if (skb == NULL) {
2923 		err = -ENOBUFS;
2924 		goto errout;
2925 	}
2926 
2927 	/* Reserve room for dummy headers; this skb can pass
2928 	   through a good chunk of the routing engine.
2929 	 */
2930 	skb_reset_mac_header(skb);
2931 	skb_reset_network_header(skb);
2932 
2933 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2934 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2935 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2936 
2937 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2938 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2939 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2940 
2941 	if (iif) {
2942 		struct net_device *dev;
2943 
2944 		dev = __dev_get_by_index(net, iif);
2945 		if (dev == NULL) {
2946 			err = -ENODEV;
2947 			goto errout_free;
2948 		}
2949 
2950 		skb->protocol	= htons(ETH_P_IP);
2951 		skb->dev	= dev;
2952 		local_bh_disable();
2953 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2954 		local_bh_enable();
2955 
2956 		rt = skb_rtable(skb);
2957 		if (err == 0 && rt->u.dst.error)
2958 			err = -rt->u.dst.error;
2959 	} else {
2960 		struct flowi fl = {
2961 			.nl_u = {
2962 				.ip4_u = {
2963 					.daddr = dst,
2964 					.saddr = src,
2965 					.tos = rtm->rtm_tos,
2966 				},
2967 			},
2968 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2969 		};
2970 		err = ip_route_output_key(net, &rt, &fl);
2971 	}
2972 
2973 	if (err)
2974 		goto errout_free;
2975 
2976 	skb_dst_set(skb, &rt->u.dst);
2977 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2978 		rt->rt_flags |= RTCF_NOTIFY;
2979 
2980 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2981 			   RTM_NEWROUTE, 0, 0);
2982 	if (err <= 0)
2983 		goto errout_free;
2984 
2985 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2986 errout:
2987 	return err;
2988 
2989 errout_free:
2990 	kfree_skb(skb);
2991 	goto errout;
2992 }
2993 
2994 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2995 {
2996 	struct rtable *rt;
2997 	int h, s_h;
2998 	int idx, s_idx;
2999 	struct net *net;
3000 
3001 	net = sock_net(skb->sk);
3002 
3003 	s_h = cb->args[0];
3004 	if (s_h < 0)
3005 		s_h = 0;
3006 	s_idx = idx = cb->args[1];
3007 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3008 		if (!rt_hash_table[h].chain)
3009 			continue;
3010 		rcu_read_lock_bh();
3011 		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3012 		     rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
3013 			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3014 				continue;
3015 			if (rt_is_expired(rt))
3016 				continue;
3017 			skb_dst_set(skb, dst_clone(&rt->u.dst));
3018 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3019 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3020 					 1, NLM_F_MULTI) <= 0) {
3021 				skb_dst_drop(skb);
3022 				rcu_read_unlock_bh();
3023 				goto done;
3024 			}
3025 			skb_dst_drop(skb);
3026 		}
3027 		rcu_read_unlock_bh();
3028 	}
3029 
3030 done:
3031 	cb->args[0] = h;
3032 	cb->args[1] = idx;
3033 	return skb->len;
3034 }
3035 
3036 void ip_rt_multicast_event(struct in_device *in_dev)
3037 {
3038 	rt_cache_flush(dev_net(in_dev->dev), 0);
3039 }
3040 
3041 #ifdef CONFIG_SYSCTL
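/*
 * Handler for the net.ipv4.route.flush sysctl: writing a delay value
 * (e.g. "echo -1 > /proc/sys/net/ipv4/route/flush") flushes the route cache
 * of the corresponding namespace via rt_cache_flush().  The sysctl is
 * write-only; reads return -EINVAL.
 */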
3042 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3043 					void __user *buffer,
3044 					size_t *lenp, loff_t *ppos)
3045 {
3046 	if (write) {
3047 		int flush_delay;
3048 		ctl_table ctl;
3049 		struct net *net;
3050 
3051 		memcpy(&ctl, __ctl, sizeof(ctl));
3052 		ctl.data = &flush_delay;
3053 		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3054 
3055 		net = (struct net *)__ctl->extra1;
3056 		rt_cache_flush(net, flush_delay);
3057 		return 0;
3058 	}
3059 
3060 	return -EINVAL;
3061 }
3062 
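/*
 * rt_secret_reschedule() is called when ip_rt_secret_interval changes: it
 * stops every namespace's secret timer and, if the new interval is non-zero,
 * re-arms it with the remaining time adjusted by the delta.
 */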
3063 static void rt_secret_reschedule(int old)
3064 {
3065 	struct net *net;
3066 	int new = ip_rt_secret_interval;
3067 	int diff = new - old;
3068 
3069 	if (!diff)
3070 		return;
3071 
3072 	rtnl_lock();
3073 	for_each_net(net) {
3074 		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3075 
3076 		if (!new)
3077 			continue;
3078 
3079 		if (deleted) {
3080 			long time = net->ipv4.rt_secret_timer.expires - jiffies;
3081 
3082 			if (time <= 0 || (time += diff) <= 0)
3083 				time = 0;
3084 
3085 			net->ipv4.rt_secret_timer.expires = time;
3086 		} else
3087 			net->ipv4.rt_secret_timer.expires = new;
3088 
3089 		net->ipv4.rt_secret_timer.expires += jiffies;
3090 		add_timer(&net->ipv4.rt_secret_timer);
3091 	}
3092 	rtnl_unlock();
3093 }
3094 
3095 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3096 					  void __user *buffer, size_t *lenp,
3097 					  loff_t *ppos)
3098 {
3099 	int old = ip_rt_secret_interval;
3100 	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
3101 
3102 	rt_secret_reschedule(old);
3103 
3104 	return ret;
3105 }
3106 
3107 static ctl_table ipv4_route_table[] = {
3108 	{
3109 		.procname	= "gc_thresh",
3110 		.data		= &ipv4_dst_ops.gc_thresh,
3111 		.maxlen		= sizeof(int),
3112 		.mode		= 0644,
3113 		.proc_handler	= proc_dointvec,
3114 	},
3115 	{
3116 		.procname	= "max_size",
3117 		.data		= &ip_rt_max_size,
3118 		.maxlen		= sizeof(int),
3119 		.mode		= 0644,
3120 		.proc_handler	= proc_dointvec,
3121 	},
3122 	{
3123 		/*  Deprecated. Use gc_min_interval_ms */
3124 
3125 		.procname	= "gc_min_interval",
3126 		.data		= &ip_rt_gc_min_interval,
3127 		.maxlen		= sizeof(int),
3128 		.mode		= 0644,
3129 		.proc_handler	= proc_dointvec_jiffies,
3130 	},
3131 	{
3132 		.procname	= "gc_min_interval_ms",
3133 		.data		= &ip_rt_gc_min_interval,
3134 		.maxlen		= sizeof(int),
3135 		.mode		= 0644,
3136 		.proc_handler	= proc_dointvec_ms_jiffies,
3137 	},
3138 	{
3139 		.procname	= "gc_timeout",
3140 		.data		= &ip_rt_gc_timeout,
3141 		.maxlen		= sizeof(int),
3142 		.mode		= 0644,
3143 		.proc_handler	= proc_dointvec_jiffies,
3144 	},
3145 	{
3146 		.procname	= "gc_interval",
3147 		.data		= &ip_rt_gc_interval,
3148 		.maxlen		= sizeof(int),
3149 		.mode		= 0644,
3150 		.proc_handler	= proc_dointvec_jiffies,
3151 	},
3152 	{
3153 		.procname	= "redirect_load",
3154 		.data		= &ip_rt_redirect_load,
3155 		.maxlen		= sizeof(int),
3156 		.mode		= 0644,
3157 		.proc_handler	= proc_dointvec,
3158 	},
3159 	{
3160 		.procname	= "redirect_number",
3161 		.data		= &ip_rt_redirect_number,
3162 		.maxlen		= sizeof(int),
3163 		.mode		= 0644,
3164 		.proc_handler	= proc_dointvec,
3165 	},
3166 	{
3167 		.procname	= "redirect_silence",
3168 		.data		= &ip_rt_redirect_silence,
3169 		.maxlen		= sizeof(int),
3170 		.mode		= 0644,
3171 		.proc_handler	= proc_dointvec,
3172 	},
3173 	{
3174 		.procname	= "error_cost",
3175 		.data		= &ip_rt_error_cost,
3176 		.maxlen		= sizeof(int),
3177 		.mode		= 0644,
3178 		.proc_handler	= proc_dointvec,
3179 	},
3180 	{
3181 		.procname	= "error_burst",
3182 		.data		= &ip_rt_error_burst,
3183 		.maxlen		= sizeof(int),
3184 		.mode		= 0644,
3185 		.proc_handler	= proc_dointvec,
3186 	},
3187 	{
3188 		.procname	= "gc_elasticity",
3189 		.data		= &ip_rt_gc_elasticity,
3190 		.maxlen		= sizeof(int),
3191 		.mode		= 0644,
3192 		.proc_handler	= proc_dointvec,
3193 	},
3194 	{
3195 		.procname	= "mtu_expires",
3196 		.data		= &ip_rt_mtu_expires,
3197 		.maxlen		= sizeof(int),
3198 		.mode		= 0644,
3199 		.proc_handler	= proc_dointvec_jiffies,
3200 	},
3201 	{
3202 		.procname	= "min_pmtu",
3203 		.data		= &ip_rt_min_pmtu,
3204 		.maxlen		= sizeof(int),
3205 		.mode		= 0644,
3206 		.proc_handler	= proc_dointvec,
3207 	},
3208 	{
3209 		.procname	= "min_adv_mss",
3210 		.data		= &ip_rt_min_advmss,
3211 		.maxlen		= sizeof(int),
3212 		.mode		= 0644,
3213 		.proc_handler	= proc_dointvec,
3214 	},
3215 	{
3216 		.procname	= "secret_interval",
3217 		.data		= &ip_rt_secret_interval,
3218 		.maxlen		= sizeof(int),
3219 		.mode		= 0644,
3220 		.proc_handler	= ipv4_sysctl_rt_secret_interval,
3221 	},
3222 	{ }
3223 };
3224 
3225 static struct ctl_table empty[1];
3226 
3227 static struct ctl_table ipv4_skeleton[] =
3228 {
3229 	{ .procname = "route",
3230 	  .mode = 0555, .child = ipv4_route_table},
3231 	{ .procname = "neigh",
3232 	  .mode = 0555, .child = empty},
3233 	{ }
3234 };
3235 
3236 static __net_initdata struct ctl_path ipv4_path[] = {
3237 	{ .procname = "net", },
3238 	{ .procname = "ipv4", },
3239 	{ },
3240 };
3241 
3242 static struct ctl_table ipv4_route_flush_table[] = {
3243 	{
3244 		.procname	= "flush",
3245 		.maxlen		= sizeof(int),
3246 		.mode		= 0200,
3247 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3248 	},
3249 	{ },
3250 };
3251 
3252 static __net_initdata struct ctl_path ipv4_route_path[] = {
3253 	{ .procname = "net", },
3254 	{ .procname = "ipv4", },
3255 	{ .procname = "route", },
3256 	{ },
3257 };
3258 
3259 static __net_init int sysctl_route_net_init(struct net *net)
3260 {
3261 	struct ctl_table *tbl;
3262 
3263 	tbl = ipv4_route_flush_table;
3264 	if (!net_eq(net, &init_net)) {
3265 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3266 		if (tbl == NULL)
3267 			goto err_dup;
3268 	}
3269 	tbl[0].extra1 = net;
3270 
3271 	net->ipv4.route_hdr =
3272 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3273 	if (net->ipv4.route_hdr == NULL)
3274 		goto err_reg;
3275 	return 0;
3276 
3277 err_reg:
3278 	if (tbl != ipv4_route_flush_table)
3279 		kfree(tbl);
3280 err_dup:
3281 	return -ENOMEM;
3282 }
3283 
3284 static __net_exit void sysctl_route_net_exit(struct net *net)
3285 {
3286 	struct ctl_table *tbl;
3287 
3288 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3289 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3290 	BUG_ON(tbl == ipv4_route_flush_table);
3291 	kfree(tbl);
3292 }
3293 
3294 static __net_initdata struct pernet_operations sysctl_route_ops = {
3295 	.init = sysctl_route_net_init,
3296 	.exit = sysctl_route_net_exit,
3297 };
3298 #endif
3299 
3300 
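/*
 * rt_secret_timer_init() seeds the per-namespace generation id and sets up
 * the deferrable timer that drives rt_secret_rebuild(), arming it only when
 * ip_rt_secret_interval is non-zero.
 */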
3301 static __net_init int rt_secret_timer_init(struct net *net)
3302 {
3303 	atomic_set(&net->ipv4.rt_genid,
3304 			(int) ((num_physpages ^ (num_physpages>>8)) ^
3305 			(jiffies ^ (jiffies >> 7))));
3306 
3307 	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3308 	net->ipv4.rt_secret_timer.data = (unsigned long)net;
3309 	init_timer_deferrable(&net->ipv4.rt_secret_timer);
3310 
3311 	if (ip_rt_secret_interval) {
3312 		net->ipv4.rt_secret_timer.expires =
3313 			jiffies + net_random() % ip_rt_secret_interval +
3314 			ip_rt_secret_interval;
3315 		add_timer(&net->ipv4.rt_secret_timer);
3316 	}
3317 	return 0;
3318 }
3319 
3320 static __net_exit void rt_secret_timer_exit(struct net *net)
3321 {
3322 	del_timer_sync(&net->ipv4.rt_secret_timer);
3323 }
3324 
3325 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3326 	.init = rt_secret_timer_init,
3327 	.exit = rt_secret_timer_exit,
3328 };
3329 
3330 
3331 #ifdef CONFIG_NET_CLS_ROUTE
3332 struct ip_rt_acct *ip_rt_acct __read_mostly;
3333 #endif /* CONFIG_NET_CLS_ROUTE */
3334 
3335 static __initdata unsigned long rhash_entries;
3336 static int __init set_rhash_entries(char *str)
3337 {
3338 	if (!str)
3339 		return 0;
3340 	rhash_entries = simple_strtoul(str, &str, 0);
3341 	return 1;
3342 }
3343 __setup("rhash_entries=", set_rhash_entries);
3344 
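/*
 * ip_rt_init() sets up the routing cache at boot: the per-cpu accounting
 * area (CONFIG_NET_CLS_ROUTE), the dst slab cache, the route hash table
 * (sized by "rhash_entries=" or available memory), the garbage-collection
 * thresholds, the periodic cache worker, the per-namespace secret timer and
 * sysctls, and the RTM_GETROUTE handler.
 */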
3345 int __init ip_rt_init(void)
3346 {
3347 	int rc = 0;
3348 
3349 #ifdef CONFIG_NET_CLS_ROUTE
3350 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3351 	if (!ip_rt_acct)
3352 		panic("IP: failed to allocate ip_rt_acct\n");
3353 #endif
3354 
3355 	ipv4_dst_ops.kmem_cachep =
3356 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3357 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3358 
3359 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3360 
3361 	rt_hash_table = (struct rt_hash_bucket *)
3362 		alloc_large_system_hash("IP route cache",
3363 					sizeof(struct rt_hash_bucket),
3364 					rhash_entries,
3365 					(totalram_pages >= 128 * 1024) ?
3366 					15 : 17,
3367 					0,
3368 					&rt_hash_log,
3369 					&rt_hash_mask,
3370 					rhash_entries ? 0 : 512 * 1024);
3371 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3372 	rt_hash_lock_init();
3373 
3374 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3375 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3376 
3377 	devinet_init();
3378 	ip_fib_init();
3379 
3380 	/* All the timers started at system startup tend
3381 	   to synchronize. Perturb them a bit.
3382 	 */
3383 	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3384 	expires_ljiffies = jiffies;
3385 	schedule_delayed_work(&expires_work,
3386 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3387 
3388 	if (register_pernet_subsys(&rt_secret_timer_ops))
3389 		printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3390 
3391 	if (ip_rt_proc_init())
3392 		printk(KERN_ERR "Unable to create route proc files\n");
3393 #ifdef CONFIG_XFRM
3394 	xfrm_init();
3395 	xfrm4_init(ip_rt_max_size);
3396 #endif
3397 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3398 
3399 #ifdef CONFIG_SYSCTL
3400 	register_pernet_subsys(&sysctl_route_ops);
3401 #endif
3402 	return rc;
3403 }
3404 
3405 #ifdef CONFIG_SYSCTL
3406 /*
3407  * We really need to sanitize the damn ipv4 init order, then all
3408  * this nonsense will go away.
3409  */
3410 void __init ip_static_sysctl_init(void)
3411 {
3412 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3413 }
3414 #endif
3415 
3416 EXPORT_SYMBOL(__ip_select_ident);
3417 EXPORT_SYMBOL(ip_route_input);
3418 EXPORT_SYMBOL(ip_route_output_key);
3419