xref: /linux/net/ipv4/route.c (revision e27ecdd94d81e5bc3d1f68591701db5adb342f0d)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD;
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 
111 #define RT_FL_TOS(oldflp) \
112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113 
114 #define IP_MAX_MTU	0xFFF0
115 
116 #define RT_GC_TIMEOUT (300*HZ)
117 
118 static int ip_rt_max_size;
119 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
120 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
121 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
122 static int ip_rt_redirect_number __read_mostly	= 9;
123 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly	= HZ;
126 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
127 static int ip_rt_gc_elasticity __read_mostly	= 8;
128 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly	= 256;
131 static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
132 static int rt_chain_length_max __read_mostly	= 20;
133 
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136 
137 /*
138  *	Interface to generic destination cache.
139  */
140 
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void		 ipv4_dst_destroy(struct dst_entry *dst);
143 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
144 					 struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void		 ipv4_link_failure(struct sk_buff *skb);
147 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149 static void rt_emergency_hash_rebuild(struct net *net);
150 
151 
152 static struct dst_ops ipv4_dst_ops = {
153 	.family =		AF_INET,
154 	.protocol =		cpu_to_be16(ETH_P_IP),
155 	.gc =			rt_garbage_collect,
156 	.check =		ipv4_dst_check,
157 	.destroy =		ipv4_dst_destroy,
158 	.ifdown =		ipv4_dst_ifdown,
159 	.negative_advice =	ipv4_negative_advice,
160 	.link_failure =		ipv4_link_failure,
161 	.update_pmtu =		ip_rt_update_pmtu,
162 	.local_out =		__ip_local_out,
163 	.entries =		ATOMIC_INIT(0),
164 };
165 
166 #define ECN_OR_COST(class)	TC_PRIO_##class
167 
168 const __u8 ip_tos2prio[16] = {
169 	TC_PRIO_BESTEFFORT,
170 	ECN_OR_COST(FILLER),
171 	TC_PRIO_BESTEFFORT,
172 	ECN_OR_COST(BESTEFFORT),
173 	TC_PRIO_BULK,
174 	ECN_OR_COST(BULK),
175 	TC_PRIO_BULK,
176 	ECN_OR_COST(BULK),
177 	TC_PRIO_INTERACTIVE,
178 	ECN_OR_COST(INTERACTIVE),
179 	TC_PRIO_INTERACTIVE,
180 	ECN_OR_COST(INTERACTIVE),
181 	TC_PRIO_INTERACTIVE_BULK,
182 	ECN_OR_COST(INTERACTIVE_BULK),
183 	TC_PRIO_INTERACTIVE_BULK,
184 	ECN_OR_COST(INTERACTIVE_BULK)
185 };
186 
187 
188 /*
189  * Route cache.
190  */
191 
192 /* The locking scheme is rather straightforward:
193  *
194  * 1) Read-Copy Update protects the buckets of the central route hash.
195  * 2) Only writers remove entries, and they hold the lock
196  *    as they look at rtable reference counts.
197  * 3) Only readers acquire references to rtable entries,
198  *    they do so with atomic increments and with the
199  *    RCU lock held.
200  */
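
/*
 * Concretely, in this file the readers are the cache lookup paths and the
 * /proc iterators below: they walk a bucket under rcu_read_lock_bh() using
 * rcu_dereference().  The writers (rt_intern_hash, rt_del, rt_check_expire,
 * rt_do_flush, rt_garbage_collect) take the per-bucket spinlock from
 * rt_hash_lock_addr(), unlink entries, and defer the actual free to an RCU
 * grace period via rt_free() -> call_rcu_bh().
 */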
201 
202 struct rt_hash_bucket {
203 	struct rtable	*chain;
204 };
205 
206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
207 	defined(CONFIG_PROVE_LOCKING)
208 /*
209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210  * The size of this table is a power of two and depends on the number of CPUs.
211  * (With lockdep we have a quite big spinlock_t, so keep the size down there.)
212  */
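
/*
 * Example: with lockdep disabled and NR_CPUS = 16 the table below holds
 * 2048 spinlocks, and rt_hash_lock_addr(slot) simply masks the bucket
 * index with RT_HASH_LOCK_SZ - 1, so many hash buckets share one lock.
 */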
213 #ifdef CONFIG_LOCKDEP
214 # define RT_HASH_LOCK_SZ	256
215 #else
216 # if NR_CPUS >= 32
217 #  define RT_HASH_LOCK_SZ	4096
218 # elif NR_CPUS >= 16
219 #  define RT_HASH_LOCK_SZ	2048
220 # elif NR_CPUS >= 8
221 #  define RT_HASH_LOCK_SZ	1024
222 # elif NR_CPUS >= 4
223 #  define RT_HASH_LOCK_SZ	512
224 # else
225 #  define RT_HASH_LOCK_SZ	256
226 # endif
227 #endif
228 
229 static spinlock_t	*rt_hash_locks;
230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
231 
232 static __init void rt_hash_lock_init(void)
233 {
234 	int i;
235 
236 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
237 			GFP_KERNEL);
238 	if (!rt_hash_locks)
239 		panic("IP: failed to allocate rt_hash_locks\n");
240 
241 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
242 		spin_lock_init(&rt_hash_locks[i]);
243 }
244 #else
245 # define rt_hash_lock_addr(slot) NULL
246 
247 static inline void rt_hash_lock_init(void)
248 {
249 }
250 #endif
251 
252 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
253 static unsigned			rt_hash_mask __read_mostly;
254 static unsigned int		rt_hash_log  __read_mostly;
255 
256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257 #define RT_CACHE_STAT_INC(field) \
258 	(__raw_get_cpu_var(rt_cache_stat).field++)
259 
260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 		int genid)
262 {
263 	return jhash_3words((__force u32)(__be32)(daddr),
264 			    (__force u32)(__be32)(saddr),
265 			    idx, genid)
266 		& rt_hash_mask;
267 }
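
/*
 * Note that the per-namespace genid is folded into the hash as the jhash
 * initial value, so bumping rt_genid in rt_cache_invalidate() both sends
 * new lookups to different buckets and lets rt_is_expired() lazily drop
 * the stale entries still chained in the old ones.
 */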
268 
269 static inline int rt_genid(struct net *net)
270 {
271 	return atomic_read(&net->ipv4.rt_genid);
272 }
273 
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276 	struct seq_net_private p;
277 	int bucket;
278 	int genid;
279 };
280 
281 static struct rtable *rt_cache_get_first(struct seq_file *seq)
282 {
283 	struct rt_cache_iter_state *st = seq->private;
284 	struct rtable *r = NULL;
285 
286 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
287 		if (!rt_hash_table[st->bucket].chain)
288 			continue;
289 		rcu_read_lock_bh();
290 		r = rcu_dereference(rt_hash_table[st->bucket].chain);
291 		while (r) {
292 			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
293 			    r->rt_genid == st->genid)
294 				return r;
295 			r = rcu_dereference(r->u.dst.rt_next);
296 		}
297 		rcu_read_unlock_bh();
298 	}
299 	return r;
300 }
301 
302 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
303 					  struct rtable *r)
304 {
305 	struct rt_cache_iter_state *st = seq->private;
306 
307 	r = r->u.dst.rt_next;
308 	while (!r) {
309 		rcu_read_unlock_bh();
310 		do {
311 			if (--st->bucket < 0)
312 				return NULL;
313 		} while (!rt_hash_table[st->bucket].chain);
314 		rcu_read_lock_bh();
315 		r = rt_hash_table[st->bucket].chain;
316 	}
317 	return rcu_dereference(r);
318 }
319 
320 static struct rtable *rt_cache_get_next(struct seq_file *seq,
321 					struct rtable *r)
322 {
323 	struct rt_cache_iter_state *st = seq->private;
324 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
325 		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
326 			continue;
327 		if (r->rt_genid == st->genid)
328 			break;
329 	}
330 	return r;
331 }
332 
333 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
334 {
335 	struct rtable *r = rt_cache_get_first(seq);
336 
337 	if (r)
338 		while (pos && (r = rt_cache_get_next(seq, r)))
339 			--pos;
340 	return pos ? NULL : r;
341 }
342 
343 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
344 {
345 	struct rt_cache_iter_state *st = seq->private;
346 	if (*pos)
347 		return rt_cache_get_idx(seq, *pos - 1);
348 	st->genid = rt_genid(seq_file_net(seq));
349 	return SEQ_START_TOKEN;
350 }
351 
352 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
353 {
354 	struct rtable *r;
355 
356 	if (v == SEQ_START_TOKEN)
357 		r = rt_cache_get_first(seq);
358 	else
359 		r = rt_cache_get_next(seq, v);
360 	++*pos;
361 	return r;
362 }
363 
364 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
365 {
366 	if (v && v != SEQ_START_TOKEN)
367 		rcu_read_unlock_bh();
368 }
369 
370 static int rt_cache_seq_show(struct seq_file *seq, void *v)
371 {
372 	if (v == SEQ_START_TOKEN)
373 		seq_printf(seq, "%-127s\n",
374 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
375 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
376 			   "HHUptod\tSpecDst");
377 	else {
378 		struct rtable *r = v;
379 		int len;
380 
381 		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
382 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383 			r->u.dst.dev ? r->u.dst.dev->name : "*",
384 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
385 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
386 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
387 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389 			dst_metric(&r->u.dst, RTAX_WINDOW),
390 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
392 			r->fl.fl4_tos,
393 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395 				       dev_queue_xmit) : 0,
396 			r->rt_spec_dst, &len);
397 
398 		seq_printf(seq, "%*s\n", 127 - len, "");
399 	}
400 	return 0;
401 }
402 
403 static const struct seq_operations rt_cache_seq_ops = {
404 	.start  = rt_cache_seq_start,
405 	.next   = rt_cache_seq_next,
406 	.stop   = rt_cache_seq_stop,
407 	.show   = rt_cache_seq_show,
408 };
409 
410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
411 {
412 	return seq_open_net(inode, file, &rt_cache_seq_ops,
413 			sizeof(struct rt_cache_iter_state));
414 }
415 
416 static const struct file_operations rt_cache_seq_fops = {
417 	.owner	 = THIS_MODULE,
418 	.open	 = rt_cache_seq_open,
419 	.read	 = seq_read,
420 	.llseek	 = seq_lseek,
421 	.release = seq_release_net,
422 };
423 
424 
425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
426 {
427 	int cpu;
428 
429 	if (*pos == 0)
430 		return SEQ_START_TOKEN;
431 
432 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
433 		if (!cpu_possible(cpu))
434 			continue;
435 		*pos = cpu+1;
436 		return &per_cpu(rt_cache_stat, cpu);
437 	}
438 	return NULL;
439 }
440 
441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
442 {
443 	int cpu;
444 
445 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
446 		if (!cpu_possible(cpu))
447 			continue;
448 		*pos = cpu+1;
449 		return &per_cpu(rt_cache_stat, cpu);
450 	}
451 	return NULL;
452 
453 }
454 
455 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
456 {
457 
458 }
459 
460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
461 {
462 	struct rt_cache_stat *st = v;
463 
464 	if (v == SEQ_START_TOKEN) {
465 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
466 		return 0;
467 	}
468 
469 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
470 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 		   atomic_read(&ipv4_dst_ops.entries),
472 		   st->in_hit,
473 		   st->in_slow_tot,
474 		   st->in_slow_mc,
475 		   st->in_no_route,
476 		   st->in_brd,
477 		   st->in_martian_dst,
478 		   st->in_martian_src,
479 
480 		   st->out_hit,
481 		   st->out_slow_tot,
482 		   st->out_slow_mc,
483 
484 		   st->gc_total,
485 		   st->gc_ignored,
486 		   st->gc_goal_miss,
487 		   st->gc_dst_overflow,
488 		   st->in_hlist_search,
489 		   st->out_hlist_search
490 		);
491 	return 0;
492 }
493 
494 static const struct seq_operations rt_cpu_seq_ops = {
495 	.start  = rt_cpu_seq_start,
496 	.next   = rt_cpu_seq_next,
497 	.stop   = rt_cpu_seq_stop,
498 	.show   = rt_cpu_seq_show,
499 };
500 
501 
502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
503 {
504 	return seq_open(file, &rt_cpu_seq_ops);
505 }
506 
507 static const struct file_operations rt_cpu_seq_fops = {
508 	.owner	 = THIS_MODULE,
509 	.open	 = rt_cpu_seq_open,
510 	.read	 = seq_read,
511 	.llseek	 = seq_lseek,
512 	.release = seq_release,
513 };
514 
515 #ifdef CONFIG_NET_CLS_ROUTE
516 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
517 			   int length, int *eof, void *data)
518 {
519 	unsigned int i;
520 
521 	if ((offset & 3) || (length & 3))
522 		return -EIO;
523 
524 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
525 		*eof = 1;
526 		return 0;
527 	}
528 
529 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
530 		length = sizeof(struct ip_rt_acct) * 256 - offset;
531 		*eof = 1;
532 	}
533 
534 	offset /= sizeof(u32);
535 
536 	if (length > 0) {
537 		u32 *dst = (u32 *) buffer;
538 
539 		*start = buffer;
540 		memset(dst, 0, length);
541 
542 		for_each_possible_cpu(i) {
543 			unsigned int j;
544 			u32 *src;
545 
546 			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
547 			for (j = 0; j < length/4; j++)
548 				dst[j] += src[j];
549 		}
550 	}
551 	return length;
552 }
553 #endif
554 
555 static int __net_init ip_rt_do_proc_init(struct net *net)
556 {
557 	struct proc_dir_entry *pde;
558 
559 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
560 			&rt_cache_seq_fops);
561 	if (!pde)
562 		goto err1;
563 
564 	pde = proc_create("rt_cache", S_IRUGO,
565 			  net->proc_net_stat, &rt_cpu_seq_fops);
566 	if (!pde)
567 		goto err2;
568 
569 #ifdef CONFIG_NET_CLS_ROUTE
570 	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
571 			ip_rt_acct_read, NULL);
572 	if (!pde)
573 		goto err3;
574 #endif
575 	return 0;
576 
577 #ifdef CONFIG_NET_CLS_ROUTE
578 err3:
579 	remove_proc_entry("rt_cache", net->proc_net_stat);
580 #endif
581 err2:
582 	remove_proc_entry("rt_cache", net->proc_net);
583 err1:
584 	return -ENOMEM;
585 }
586 
587 static void __net_exit ip_rt_do_proc_exit(struct net *net)
588 {
589 	remove_proc_entry("rt_cache", net->proc_net_stat);
590 	remove_proc_entry("rt_cache", net->proc_net);
591 	remove_proc_entry("rt_acct", net->proc_net);
592 }
593 
594 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
595 	.init = ip_rt_do_proc_init,
596 	.exit = ip_rt_do_proc_exit,
597 };
598 
599 static int __init ip_rt_proc_init(void)
600 {
601 	return register_pernet_subsys(&ip_rt_proc_ops);
602 }
603 
604 #else
605 static inline int ip_rt_proc_init(void)
606 {
607 	return 0;
608 }
609 #endif /* CONFIG_PROC_FS */
610 
611 static inline void rt_free(struct rtable *rt)
612 {
613 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
614 }
615 
616 static inline void rt_drop(struct rtable *rt)
617 {
618 	ip_rt_put(rt);
619 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
620 }
621 
622 static inline int rt_fast_clean(struct rtable *rth)
623 {
624 	/* Kill broadcast/multicast entries very aggressively if they
625 	   collide in the hash table with more useful entries */
626 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
627 		rth->fl.iif && rth->u.dst.rt_next;
628 }
629 
630 static inline int rt_valuable(struct rtable *rth)
631 {
632 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
633 		rth->u.dst.expires;
634 }
635 
636 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
637 {
638 	unsigned long age;
639 	int ret = 0;
640 
641 	if (atomic_read(&rth->u.dst.__refcnt))
642 		goto out;
643 
644 	ret = 1;
645 	if (rth->u.dst.expires &&
646 	    time_after_eq(jiffies, rth->u.dst.expires))
647 		goto out;
648 
649 	age = jiffies - rth->u.dst.lastuse;
650 	ret = 0;
651 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652 	    (age <= tmo2 && rt_valuable(rth)))
653 		goto out;
654 	ret = 1;
655 out:	return ret;
656 }
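
/*
 * In short: an entry still holding a reference is never expirable here,
 * and one past its hard dst.expires deadline always is.  Otherwise tmo1
 * is the grace period for ordinary entries (colliding broadcast/multicast
 * entries, per rt_fast_clean(), get none), while tmo2 is the typically
 * longer grace period granted to "valuable" entries (see rt_valuable()).
 */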
657 
658 /* Bits of score are:
659  * 31: very valuable
660  * 30: not quite useless
661  * 29..0: usage counter
662  */
663 static inline u32 rt_score(struct rtable *rt)
664 {
665 	u32 score = jiffies - rt->u.dst.lastuse;
666 
667 	score = ~score & ~(3<<30);
668 
669 	if (rt_valuable(rt))
670 		score |= (1<<31);
671 
672 	if (!rt->fl.iif ||
673 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
674 		score |= (1<<30);
675 
676 	return score;
677 }
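
/*
 * Example: a freshly used output route carrying a redirect scores close
 * to 0xffffffff (both top bits set, small age), while an idle input
 * broadcast entry gets neither top bit and only a small 30-bit freshness
 * value, making it the preferred (lowest-score) eviction candidate in
 * rt_intern_hash().
 */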
678 
679 static inline bool rt_caching(const struct net *net)
680 {
681 	return net->ipv4.current_rt_cache_rebuild_count <=
682 		net->ipv4.sysctl_rt_cache_rebuild_count;
683 }
684 
685 static inline bool compare_hash_inputs(const struct flowi *fl1,
686 					const struct flowi *fl2)
687 {
688 	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
689 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
690 		(fl1->iif ^ fl2->iif)) == 0);
691 }
692 
693 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
694 {
695 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
696 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
697 		(fl1->mark ^ fl2->mark) |
698 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
699 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
700 		(fl1->oif ^ fl2->oif) |
701 		(fl1->iif ^ fl2->iif)) == 0;
702 }
703 
704 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
705 {
706 	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
707 }
708 
709 static inline int rt_is_expired(struct rtable *rth)
710 {
711 	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
712 }
713 
714 /*
715  * Perform a full scan of the hash table and free all entries.
716  * Can be called from a softirq or from process context.
717  * In the latter case, we want to reschedule if necessary.
718  */
719 static void rt_do_flush(int process_context)
720 {
721 	unsigned int i;
722 	struct rtable *rth, *next;
723 	struct rtable * tail;
724 
725 	for (i = 0; i <= rt_hash_mask; i++) {
726 		if (process_context && need_resched())
727 			cond_resched();
728 		rth = rt_hash_table[i].chain;
729 		if (!rth)
730 			continue;
731 
732 		spin_lock_bh(rt_hash_lock_addr(i));
733 #ifdef CONFIG_NET_NS
734 		{
735 		struct rtable ** prev, * p;
736 
737 		rth = rt_hash_table[i].chain;
738 
739 		/* defer releasing the head of the list until after spin_unlock */
740 		for (tail = rth; tail; tail = tail->u.dst.rt_next)
741 			if (!rt_is_expired(tail))
742 				break;
743 		if (rth != tail)
744 			rt_hash_table[i].chain = tail;
745 
746 		/* call rt_free on entries after the tail requiring flush */
747 		prev = &rt_hash_table[i].chain;
748 		for (p = *prev; p; p = next) {
749 			next = p->u.dst.rt_next;
750 			if (!rt_is_expired(p)) {
751 				prev = &p->u.dst.rt_next;
752 			} else {
753 				*prev = next;
754 				rt_free(p);
755 			}
756 		}
757 		}
758 #else
759 		rth = rt_hash_table[i].chain;
760 		rt_hash_table[i].chain = NULL;
761 		tail = NULL;
762 #endif
763 		spin_unlock_bh(rt_hash_lock_addr(i));
764 
765 		for (; rth != tail; rth = next) {
766 			next = rth->u.dst.rt_next;
767 			rt_free(rth);
768 		}
769 	}
770 }
771 
772 /*
773  * While freeing expired entries, we compute average chain length
774  * and standard deviation, using fixed-point arithmetic.
775  * This gives an estimate of rt_chain_length_max:
776  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
777  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
778  */
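
/*
 * Worked example: with FRACT_BITS = 3, ONE = 8.  A bucket holding two
 * distinct, still-valid flows contributes length = 16 to "sum" and 256 to
 * "sum2".  If the scan ends with avg = 12 and sd = 4 (i.e. 1.5 and 0.5
 * entries), then (avg + 4*sd) >> FRACT_BITS = 28 >> 3 = 3, and
 * rt_chain_length_max becomes max(ip_rt_gc_elasticity, 3) = 8 with the
 * default elasticity of 8.
 */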
779 
780 #define FRACT_BITS 3
781 #define ONE (1UL << FRACT_BITS)
782 
783 static void rt_check_expire(void)
784 {
785 	static unsigned int rover;
786 	unsigned int i = rover, goal;
787 	struct rtable *rth, *aux, **rthp;
788 	unsigned long samples = 0;
789 	unsigned long sum = 0, sum2 = 0;
790 	unsigned long delta;
791 	u64 mult;
792 
793 	delta = jiffies - expires_ljiffies;
794 	expires_ljiffies = jiffies;
795 	mult = ((u64)delta) << rt_hash_log;
796 	if (ip_rt_gc_timeout > 1)
797 		do_div(mult, ip_rt_gc_timeout);
798 	goal = (unsigned int)mult;
799 	if (goal > rt_hash_mask)
800 		goal = rt_hash_mask + 1;
801 	for (; goal > 0; goal--) {
802 		unsigned long tmo = ip_rt_gc_timeout;
803 		unsigned long length;
804 
805 		i = (i + 1) & rt_hash_mask;
806 		rthp = &rt_hash_table[i].chain;
807 
808 		if (need_resched())
809 			cond_resched();
810 
811 		samples++;
812 
813 		if (*rthp == NULL)
814 			continue;
815 		length = 0;
816 		spin_lock_bh(rt_hash_lock_addr(i));
817 		while ((rth = *rthp) != NULL) {
818 			prefetch(rth->u.dst.rt_next);
819 			if (rt_is_expired(rth)) {
820 				*rthp = rth->u.dst.rt_next;
821 				rt_free(rth);
822 				continue;
823 			}
824 			if (rth->u.dst.expires) {
825 				/* Entry is expired even if it is in use */
826 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
827 nofree:
828 					tmo >>= 1;
829 					rthp = &rth->u.dst.rt_next;
830 					/*
831 					 * We only count entries on
832 					 * a chain with equal hash inputs once,
833 					 * so that entries for different QoS
834 					 * levels and other non-hash-input
835 					 * attributes don't unfairly skew
836 					 * the length computation.
837 					 */
838 					for (aux = rt_hash_table[i].chain;;) {
839 						if (aux == rth) {
840 							length += ONE;
841 							break;
842 						}
843 						if (compare_hash_inputs(&aux->fl, &rth->fl))
844 							break;
845 						aux = aux->u.dst.rt_next;
846 					}
847 					continue;
848 				}
849 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
850 				goto nofree;
851 
852 			/* Cleanup aged off entries. */
853 			*rthp = rth->u.dst.rt_next;
854 			rt_free(rth);
855 		}
856 		spin_unlock_bh(rt_hash_lock_addr(i));
857 		sum += length;
858 		sum2 += length*length;
859 	}
860 	if (samples) {
861 		unsigned long avg = sum / samples;
862 		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
863 		rt_chain_length_max = max_t(unsigned long,
864 					ip_rt_gc_elasticity,
865 					(avg + 4*sd) >> FRACT_BITS);
866 	}
867 	rover = i;
868 }
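
/*
 * The "goal" above scales the number of buckets scanned with the time
 * elapsed since the previous run: delta jiffies cover roughly
 * (delta << rt_hash_log) / ip_rt_gc_timeout buckets, so the whole table
 * is walked about once every ip_rt_gc_timeout (300 seconds by default),
 * independently of how often the work item actually gets scheduled.
 */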
869 
870 /*
871  * rt_worker_func() is run in process context.
872  * we call rt_check_expire() to scan part of the hash table
873  */
874 static void rt_worker_func(struct work_struct *work)
875 {
876 	rt_check_expire();
877 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
878 }
879 
880 /*
881  * Perturbation of rt_genid by a small quantity [1..256].
882  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
883  * many times (2^24) without reusing a recent rt_genid value.
884  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
885  */
886 static void rt_cache_invalidate(struct net *net)
887 {
888 	unsigned char shuffle;
889 
890 	get_random_bytes(&shuffle, sizeof(shuffle));
891 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
892 }
893 
894 /*
895  * delay < 0  : invalidate cache (fast : entries will be deleted later)
896  * delay >= 0 : invalidate & flush cache (can be long)
897  */
898 void rt_cache_flush(struct net *net, int delay)
899 {
900 	rt_cache_invalidate(net);
901 	if (delay >= 0)
902 		rt_do_flush(!in_softirq());
903 }
904 
905 /*
906  * We change rt_genid and let gc do the cleanup
907  */
908 static void rt_secret_rebuild(unsigned long __net)
909 {
910 	struct net *net = (struct net *)__net;
911 	rt_cache_invalidate(net);
912 	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
913 }
914 
915 static void rt_secret_rebuild_oneshot(struct net *net)
916 {
917 	del_timer_sync(&net->ipv4.rt_secret_timer);
918 	rt_cache_invalidate(net);
919 	if (ip_rt_secret_interval) {
920 		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
921 		add_timer(&net->ipv4.rt_secret_timer);
922 	}
923 }
924 
925 static void rt_emergency_hash_rebuild(struct net *net)
926 {
927 	if (net_ratelimit()) {
928 		printk(KERN_WARNING "Route hash chain too long!\n");
929 		printk(KERN_WARNING "Adjust your secret_interval!\n");
930 	}
931 
932 	rt_secret_rebuild_oneshot(net);
933 }
934 
935 /*
936    Short description of GC goals.
937 
938    We want an algorithm that keeps the routing cache at an
939    equilibrium point, where the number of aged-off entries stays
940    approximately equal to the number of newly generated ones.
941 
942    The current expiration strength is the variable "expire".
943    We try to adjust it dynamically, so that when the network is
944    idle, expire is large enough to keep plenty of warm entries,
945    and when load increases it shrinks to limit the cache size.
946  */
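
/*
 * In rt_garbage_collect() below, "expire" starts at RT_GC_TIMEOUT and
 * adapts: each pass that misses its goal halves it (more aggressive
 * eviction on the next pass), while a successful pass adds
 * ip_rt_gc_min_interval back and resets it to ip_rt_gc_timeout once it
 * overshoots or the cache falls below gc_thresh again.
 */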
947 
948 static int rt_garbage_collect(struct dst_ops *ops)
949 {
950 	static unsigned long expire = RT_GC_TIMEOUT;
951 	static unsigned long last_gc;
952 	static int rover;
953 	static int equilibrium;
954 	struct rtable *rth, **rthp;
955 	unsigned long now = jiffies;
956 	int goal;
957 
958 	/*
959 	 * Garbage collection is pretty expensive,
960 	 * do not make it too frequently.
961 	 */
962 
963 	RT_CACHE_STAT_INC(gc_total);
964 
965 	if (now - last_gc < ip_rt_gc_min_interval &&
966 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
967 		RT_CACHE_STAT_INC(gc_ignored);
968 		goto out;
969 	}
970 
971 	/* Calculate the number of entries we want to expire now. */
972 	goal = atomic_read(&ipv4_dst_ops.entries) -
973 		(ip_rt_gc_elasticity << rt_hash_log);
974 	if (goal <= 0) {
975 		if (equilibrium < ipv4_dst_ops.gc_thresh)
976 			equilibrium = ipv4_dst_ops.gc_thresh;
977 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
978 		if (goal > 0) {
979 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
980 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
981 		}
982 	} else {
983 		/* We are in a dangerous area. Try to reduce the cache really
984 		 * aggressively.
985 		 */
986 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
987 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
988 	}
989 
990 	if (now - last_gc >= ip_rt_gc_min_interval)
991 		last_gc = now;
992 
993 	if (goal <= 0) {
994 		equilibrium += goal;
995 		goto work_done;
996 	}
997 
998 	do {
999 		int i, k;
1000 
1001 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1002 			unsigned long tmo = expire;
1003 
1004 			k = (k + 1) & rt_hash_mask;
1005 			rthp = &rt_hash_table[k].chain;
1006 			spin_lock_bh(rt_hash_lock_addr(k));
1007 			while ((rth = *rthp) != NULL) {
1008 				if (!rt_is_expired(rth) &&
1009 					!rt_may_expire(rth, tmo, expire)) {
1010 					tmo >>= 1;
1011 					rthp = &rth->u.dst.rt_next;
1012 					continue;
1013 				}
1014 				*rthp = rth->u.dst.rt_next;
1015 				rt_free(rth);
1016 				goal--;
1017 			}
1018 			spin_unlock_bh(rt_hash_lock_addr(k));
1019 			if (goal <= 0)
1020 				break;
1021 		}
1022 		rover = k;
1023 
1024 		if (goal <= 0)
1025 			goto work_done;
1026 
1027 		/* The goal was not achieved. We stop the process if:
1028 
1029 		   - expire has been reduced to zero (otherwise expire is halved);
1030 		   - the table is not full;
1031 		   - we are called from interrupt context.
1032 		   The jiffies check is just a fallback/debug loop breaker;
1033 		     we will not spin here for a long time in any case.
1034 		 */
1035 
1036 		RT_CACHE_STAT_INC(gc_goal_miss);
1037 
1038 		if (expire == 0)
1039 			break;
1040 
1041 		expire >>= 1;
1042 #if RT_CACHE_DEBUG >= 2
1043 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1044 				atomic_read(&ipv4_dst_ops.entries), goal, i);
1045 #endif
1046 
1047 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1048 			goto out;
1049 	} while (!in_softirq() && time_before_eq(jiffies, now));
1050 
1051 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1052 		goto out;
1053 	if (net_ratelimit())
1054 		printk(KERN_WARNING "dst cache overflow\n");
1055 	RT_CACHE_STAT_INC(gc_dst_overflow);
1056 	return 1;
1057 
1058 work_done:
1059 	expire += ip_rt_gc_min_interval;
1060 	if (expire > ip_rt_gc_timeout ||
1061 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1062 		expire = ip_rt_gc_timeout;
1063 #if RT_CACHE_DEBUG >= 2
1064 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1065 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
1066 #endif
1067 out:	return 0;
1068 }
1069 
1070 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1071 			  struct rtable **rp, struct sk_buff *skb)
1072 {
1073 	struct rtable	*rth, **rthp;
1074 	unsigned long	now;
1075 	struct rtable *cand, **candp;
1076 	u32 		min_score;
1077 	int		chain_length;
1078 	int attempts = !in_softirq();
1079 
1080 restart:
1081 	chain_length = 0;
1082 	min_score = ~(u32)0;
1083 	cand = NULL;
1084 	candp = NULL;
1085 	now = jiffies;
1086 
1087 	if (!rt_caching(dev_net(rt->u.dst.dev))) {
1088 		rt_drop(rt);
1089 		return 0;
1090 	}
1091 
1092 	rthp = &rt_hash_table[hash].chain;
1093 
1094 	spin_lock_bh(rt_hash_lock_addr(hash));
1095 	while ((rth = *rthp) != NULL) {
1096 		if (rt_is_expired(rth)) {
1097 			*rthp = rth->u.dst.rt_next;
1098 			rt_free(rth);
1099 			continue;
1100 		}
1101 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1102 			/* Put it first */
1103 			*rthp = rth->u.dst.rt_next;
1104 			/*
1105 			 * Since lookup is lockfree, the deletion
1106 			 * must be visible to another weakly ordered CPU before
1107 			 * the insertion at the start of the hash chain.
1108 			 */
1109 			rcu_assign_pointer(rth->u.dst.rt_next,
1110 					   rt_hash_table[hash].chain);
1111 			/*
1112 			 * Since lookup is lockfree, the update writes
1113 			 * must be ordered for consistency on SMP.
1114 			 */
1115 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1116 
1117 			dst_use(&rth->u.dst, now);
1118 			spin_unlock_bh(rt_hash_lock_addr(hash));
1119 
1120 			rt_drop(rt);
1121 			if (rp)
1122 				*rp = rth;
1123 			else
1124 				skb_dst_set(skb, &rth->u.dst);
1125 			return 0;
1126 		}
1127 
1128 		if (!atomic_read(&rth->u.dst.__refcnt)) {
1129 			u32 score = rt_score(rth);
1130 
1131 			if (score <= min_score) {
1132 				cand = rth;
1133 				candp = rthp;
1134 				min_score = score;
1135 			}
1136 		}
1137 
1138 		chain_length++;
1139 
1140 		rthp = &rth->u.dst.rt_next;
1141 	}
1142 
1143 	if (cand) {
1144 		/* ip_rt_gc_elasticity used to be the average chain length;
1145 		 * when it is exceeded, gc becomes really aggressive.
1146 		 *
1147 		 * The second limit is less certain. At the moment it allows
1148 		 * only 2 entries per bucket. We will see.
1149 		 */
1150 		if (chain_length > ip_rt_gc_elasticity) {
1151 			*candp = cand->u.dst.rt_next;
1152 			rt_free(cand);
1153 		}
1154 	} else {
1155 		if (chain_length > rt_chain_length_max) {
1156 			struct net *net = dev_net(rt->u.dst.dev);
1157 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1158 			if (!rt_caching(dev_net(rt->u.dst.dev))) {
1159 				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1160 					rt->u.dst.dev->name, num);
1161 			}
1162 			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1163 		}
1164 	}
1165 
1166 	/* Try to bind the route to an ARP neighbour only if it is an
1167 	   output route or a unicast forwarding path.
1168 	 */
1169 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1170 		int err = arp_bind_neighbour(&rt->u.dst);
1171 		if (err) {
1172 			spin_unlock_bh(rt_hash_lock_addr(hash));
1173 
1174 			if (err != -ENOBUFS) {
1175 				rt_drop(rt);
1176 				return err;
1177 			}
1178 
1179 			/* Neighbour tables are full and nothing
1180 			   can be released. Try to shrink the route cache;
1181 			   it most likely holds some neighbour records.
1182 			 */
1183 			if (attempts-- > 0) {
1184 				int saved_elasticity = ip_rt_gc_elasticity;
1185 				int saved_int = ip_rt_gc_min_interval;
1186 				ip_rt_gc_elasticity	= 1;
1187 				ip_rt_gc_min_interval	= 0;
1188 				rt_garbage_collect(&ipv4_dst_ops);
1189 				ip_rt_gc_min_interval	= saved_int;
1190 				ip_rt_gc_elasticity	= saved_elasticity;
1191 				goto restart;
1192 			}
1193 
1194 			if (net_ratelimit())
1195 				printk(KERN_WARNING "Neighbour table overflow.\n");
1196 			rt_drop(rt);
1197 			return -ENOBUFS;
1198 		}
1199 	}
1200 
1201 	rt->u.dst.rt_next = rt_hash_table[hash].chain;
1202 
1203 #if RT_CACHE_DEBUG >= 2
1204 	if (rt->u.dst.rt_next) {
1205 		struct rtable *trt;
1206 		printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst);
1207 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1208 			printk(" . %pI4", &trt->rt_dst);
1209 		printk("\n");
1210 	}
1211 #endif
1212 	/*
1213 	 * Since lookup is lockfree, we must make sure
1214 	 * previous writes to rt are committed to memory
1215 	 * before making rt visible to other CPUs.
1216 	 */
1217 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1218 
1219 	spin_unlock_bh(rt_hash_lock_addr(hash));
1220 	if (rp)
1221 		*rp = rt;
1222 	else
1223 		skb_dst_set(skb, &rt->u.dst);
1224 	return 0;
1225 }
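
/*
 * Note the -ENOBUFS handling above: when arp_bind_neighbour() returns
 * -ENOBUFS and we are in process context, we temporarily drop
 * ip_rt_gc_elasticity and ip_rt_gc_min_interval to force an immediate,
 * aggressive garbage collection, then retry the insertion once
 * ("attempts" starts at 1 in that case).
 */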
1226 
1227 void rt_bind_peer(struct rtable *rt, int create)
1228 {
1229 	static DEFINE_SPINLOCK(rt_peer_lock);
1230 	struct inet_peer *peer;
1231 
1232 	peer = inet_getpeer(rt->rt_dst, create);
1233 
1234 	spin_lock_bh(&rt_peer_lock);
1235 	if (rt->peer == NULL) {
1236 		rt->peer = peer;
1237 		peer = NULL;
1238 	}
1239 	spin_unlock_bh(&rt_peer_lock);
1240 	if (peer)
1241 		inet_putpeer(peer);
1242 }
1243 
1244 /*
1245  * Peer allocation may fail only in serious out-of-memory conditions.  However
1246  * we can still generate some output.
1247  * Random ID selection looks a bit dangerous because we have no chance of
1248  * selecting an ID that is unique within a reasonable period of time.
1249  * But a broken packet identifier may be better than no packet at all.
1250  */
1251 static void ip_select_fb_ident(struct iphdr *iph)
1252 {
1253 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1254 	static u32 ip_fallback_id;
1255 	u32 salt;
1256 
1257 	spin_lock_bh(&ip_fb_id_lock);
1258 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1259 	iph->id = htons(salt & 0xFFFF);
1260 	ip_fallback_id = salt;
1261 	spin_unlock_bh(&ip_fb_id_lock);
1262 }
1263 
1264 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1265 {
1266 	struct rtable *rt = (struct rtable *) dst;
1267 
1268 	if (rt) {
1269 		if (rt->peer == NULL)
1270 			rt_bind_peer(rt, 1);
1271 
1272 		/* If peer is attached to destination, it is never detached,
1273 		   so we do not need to grab a lock to dereference it.
1274 		 */
1275 		if (rt->peer) {
1276 			iph->id = htons(inet_getid(rt->peer, more));
1277 			return;
1278 		}
1279 	} else
1280 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1281 		       __builtin_return_address(0));
1282 
1283 	ip_select_fb_ident(iph);
1284 }
1285 
1286 static void rt_del(unsigned hash, struct rtable *rt)
1287 {
1288 	struct rtable **rthp, *aux;
1289 
1290 	rthp = &rt_hash_table[hash].chain;
1291 	spin_lock_bh(rt_hash_lock_addr(hash));
1292 	ip_rt_put(rt);
1293 	while ((aux = *rthp) != NULL) {
1294 		if (aux == rt || rt_is_expired(aux)) {
1295 			*rthp = aux->u.dst.rt_next;
1296 			rt_free(aux);
1297 			continue;
1298 		}
1299 		rthp = &aux->u.dst.rt_next;
1300 	}
1301 	spin_unlock_bh(rt_hash_lock_addr(hash));
1302 }
1303 
1304 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1305 		    __be32 saddr, struct net_device *dev)
1306 {
1307 	int i, k;
1308 	struct in_device *in_dev = in_dev_get(dev);
1309 	struct rtable *rth, **rthp;
1310 	__be32  skeys[2] = { saddr, 0 };
1311 	int  ikeys[2] = { dev->ifindex, 0 };
1312 	struct netevent_redirect netevent;
1313 	struct net *net;
1314 
1315 	if (!in_dev)
1316 		return;
1317 
1318 	net = dev_net(dev);
1319 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1320 	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1321 	    || ipv4_is_zeronet(new_gw))
1322 		goto reject_redirect;
1323 
1324 	if (!rt_caching(net))
1325 		goto reject_redirect;
1326 
1327 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1328 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1329 			goto reject_redirect;
1330 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1331 			goto reject_redirect;
1332 	} else {
1333 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1334 			goto reject_redirect;
1335 	}
1336 
1337 	for (i = 0; i < 2; i++) {
1338 		for (k = 0; k < 2; k++) {
1339 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1340 						rt_genid(net));
1341 
1342 			rthp=&rt_hash_table[hash].chain;
1343 
1344 			rcu_read_lock();
1345 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1346 				struct rtable *rt;
1347 
1348 				if (rth->fl.fl4_dst != daddr ||
1349 				    rth->fl.fl4_src != skeys[i] ||
1350 				    rth->fl.oif != ikeys[k] ||
1351 				    rth->fl.iif != 0 ||
1352 				    rt_is_expired(rth) ||
1353 				    !net_eq(dev_net(rth->u.dst.dev), net)) {
1354 					rthp = &rth->u.dst.rt_next;
1355 					continue;
1356 				}
1357 
1358 				if (rth->rt_dst != daddr ||
1359 				    rth->rt_src != saddr ||
1360 				    rth->u.dst.error ||
1361 				    rth->rt_gateway != old_gw ||
1362 				    rth->u.dst.dev != dev)
1363 					break;
1364 
1365 				dst_hold(&rth->u.dst);
1366 				rcu_read_unlock();
1367 
1368 				rt = dst_alloc(&ipv4_dst_ops);
1369 				if (rt == NULL) {
1370 					ip_rt_put(rth);
1371 					in_dev_put(in_dev);
1372 					return;
1373 				}
1374 
1375 				/* Copy all the information. */
1376 				*rt = *rth;
1377 				rt->u.dst.__use		= 1;
1378 				atomic_set(&rt->u.dst.__refcnt, 1);
1379 				rt->u.dst.child		= NULL;
1380 				if (rt->u.dst.dev)
1381 					dev_hold(rt->u.dst.dev);
1382 				if (rt->idev)
1383 					in_dev_hold(rt->idev);
1384 				rt->u.dst.obsolete	= 0;
1385 				rt->u.dst.lastuse	= jiffies;
1386 				rt->u.dst.path		= &rt->u.dst;
1387 				rt->u.dst.neighbour	= NULL;
1388 				rt->u.dst.hh		= NULL;
1389 #ifdef CONFIG_XFRM
1390 				rt->u.dst.xfrm		= NULL;
1391 #endif
1392 				rt->rt_genid		= rt_genid(net);
1393 				rt->rt_flags		|= RTCF_REDIRECTED;
1394 
1395 				/* Gateway is different ... */
1396 				rt->rt_gateway		= new_gw;
1397 
1398 				/* Redirect received -> path was valid */
1399 				dst_confirm(&rth->u.dst);
1400 
1401 				if (rt->peer)
1402 					atomic_inc(&rt->peer->refcnt);
1403 
1404 				if (arp_bind_neighbour(&rt->u.dst) ||
1405 				    !(rt->u.dst.neighbour->nud_state &
1406 					    NUD_VALID)) {
1407 					if (rt->u.dst.neighbour)
1408 						neigh_event_send(rt->u.dst.neighbour, NULL);
1409 					ip_rt_put(rth);
1410 					rt_drop(rt);
1411 					goto do_next;
1412 				}
1413 
1414 				netevent.old = &rth->u.dst;
1415 				netevent.new = &rt->u.dst;
1416 				call_netevent_notifiers(NETEVENT_REDIRECT,
1417 							&netevent);
1418 
1419 				rt_del(hash, rth);
1420 				if (!rt_intern_hash(hash, rt, &rt, NULL))
1421 					ip_rt_put(rt);
1422 				goto do_next;
1423 			}
1424 			rcu_read_unlock();
1425 		do_next:
1426 			;
1427 		}
1428 	}
1429 	in_dev_put(in_dev);
1430 	return;
1431 
1432 reject_redirect:
1433 #ifdef CONFIG_IP_ROUTE_VERBOSE
1434 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1435 		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1436 			"  Advised path = %pI4 -> %pI4\n",
1437 		       &old_gw, dev->name, &new_gw,
1438 		       &saddr, &daddr);
1439 #endif
1440 	in_dev_put(in_dev);
1441 }
1442 
1443 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1444 {
1445 	struct rtable *rt = (struct rtable *)dst;
1446 	struct dst_entry *ret = dst;
1447 
1448 	if (rt) {
1449 		if (dst->obsolete) {
1450 			ip_rt_put(rt);
1451 			ret = NULL;
1452 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1453 			   rt->u.dst.expires) {
1454 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1455 						rt->fl.oif,
1456 						rt_genid(dev_net(dst->dev)));
1457 #if RT_CACHE_DEBUG >= 1
1458 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1459 				&rt->rt_dst, rt->fl.fl4_tos);
1460 #endif
1461 			rt_del(hash, rt);
1462 			ret = NULL;
1463 		}
1464 	}
1465 	return ret;
1466 }
1467 
1468 /*
1469  * Algorithm:
1470  *	1. The first ip_rt_redirect_number redirects are sent
1471  *	   with exponential backoff, then we stop sending them at all,
1472  *	   assuming that the host ignores our redirects.
1473  *	2. If we did not see packets requiring redirects
1474  *	   during ip_rt_redirect_silence, we assume that the host
1475  *	   forgot the redirected route, and we start sending redirects again.
1476  *
1477  * This algorithm is much cheaper and more intelligent than dumb load limiting
1478  * in icmp.c.
1479  *
1480  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1481  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1482  */
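
/*
 * With the defaults above: up to ip_rt_redirect_number (9) redirects are
 * sent, and the gap required before the next one doubles each time
 * (ip_rt_redirect_load << rate_tokens, i.e. 40 ms, 80 ms, ... up to about
 * 5 s); the state resets after ip_rt_redirect_silence = (HZ/50) << 10,
 * roughly 20 s without a packet that would need a redirect.
 */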
1483 
1484 void ip_rt_send_redirect(struct sk_buff *skb)
1485 {
1486 	struct rtable *rt = skb_rtable(skb);
1487 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1488 
1489 	if (!in_dev)
1490 		return;
1491 
1492 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1493 		goto out;
1494 
1495 	/* No redirected packets during ip_rt_redirect_silence;
1496 	 * reset the algorithm.
1497 	 */
1498 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1499 		rt->u.dst.rate_tokens = 0;
1500 
1501 	/* Too many ignored redirects; do not send anything.
1502 	 * Just set u.dst.rate_last to the last seen redirected packet.
1503 	 */
1504 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1505 		rt->u.dst.rate_last = jiffies;
1506 		goto out;
1507 	}
1508 
1509 	/* Check for load limit; set rate_last to the latest sent
1510 	 * redirect.
1511 	 */
1512 	if (rt->u.dst.rate_tokens == 0 ||
1513 	    time_after(jiffies,
1514 		       (rt->u.dst.rate_last +
1515 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1516 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1517 		rt->u.dst.rate_last = jiffies;
1518 		++rt->u.dst.rate_tokens;
1519 #ifdef CONFIG_IP_ROUTE_VERBOSE
1520 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1521 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1522 		    net_ratelimit())
1523 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1524 				&rt->rt_src, rt->rt_iif,
1525 				&rt->rt_dst, &rt->rt_gateway);
1526 #endif
1527 	}
1528 out:
1529 	in_dev_put(in_dev);
1530 }
1531 
1532 static int ip_error(struct sk_buff *skb)
1533 {
1534 	struct rtable *rt = skb_rtable(skb);
1535 	unsigned long now;
1536 	int code;
1537 
1538 	switch (rt->u.dst.error) {
1539 		case EINVAL:
1540 		default:
1541 			goto out;
1542 		case EHOSTUNREACH:
1543 			code = ICMP_HOST_UNREACH;
1544 			break;
1545 		case ENETUNREACH:
1546 			code = ICMP_NET_UNREACH;
1547 			IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1548 					IPSTATS_MIB_INNOROUTES);
1549 			break;
1550 		case EACCES:
1551 			code = ICMP_PKT_FILTERED;
1552 			break;
1553 	}
1554 
1555 	now = jiffies;
1556 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1557 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1558 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1559 	rt->u.dst.rate_last = now;
1560 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1561 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1562 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1563 	}
1564 
1565 out:	kfree_skb(skb);
1566 	return 0;
1567 }
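
/*
 * The rate limiting above is a token bucket kept in jiffies: tokens
 * refill at one per jiffy since rate_last, are capped at ip_rt_error_burst
 * (5 * HZ), and each ICMP_DEST_UNREACH costs ip_rt_error_cost (HZ) tokens,
 * i.e. roughly one error per second per cached route, with bursts of up
 * to five.
 */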
1568 
1569 /*
1570  *	The last two values are not from the RFC but
1571  *	are needed for AMPRnet AX.25 paths.
1572  */
1573 
1574 static const unsigned short mtu_plateau[] =
1575 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1576 
1577 static inline unsigned short guess_mtu(unsigned short old_mtu)
1578 {
1579 	int i;
1580 
1581 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1582 		if (old_mtu > mtu_plateau[i])
1583 			return mtu_plateau[i];
1584 	return 68;
1585 }
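
/*
 * guess_mtu() implements the RFC 1191 plateau search: when the router did
 * not report a usable next-hop MTU we fall back to the next plateau below
 * the original packet size, bottoming out at 68, the minimum IPv4 MTU.
 */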
1586 
1587 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1588 				 unsigned short new_mtu,
1589 				 struct net_device *dev)
1590 {
1591 	int i, k;
1592 	unsigned short old_mtu = ntohs(iph->tot_len);
1593 	struct rtable *rth;
1594 	int  ikeys[2] = { dev->ifindex, 0 };
1595 	__be32  skeys[2] = { iph->saddr, 0, };
1596 	__be32  daddr = iph->daddr;
1597 	unsigned short est_mtu = 0;
1598 
1599 	if (ipv4_config.no_pmtu_disc)
1600 		return 0;
1601 
1602 	for (k = 0; k < 2; k++) {
1603 		for (i = 0; i < 2; i++) {
1604 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1605 						rt_genid(net));
1606 
1607 			rcu_read_lock();
1608 			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1609 			     rth = rcu_dereference(rth->u.dst.rt_next)) {
1610 				unsigned short mtu = new_mtu;
1611 
1612 				if (rth->fl.fl4_dst != daddr ||
1613 				    rth->fl.fl4_src != skeys[i] ||
1614 				    rth->rt_dst != daddr ||
1615 				    rth->rt_src != iph->saddr ||
1616 				    rth->fl.oif != ikeys[k] ||
1617 				    rth->fl.iif != 0 ||
1618 				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1619 				    !net_eq(dev_net(rth->u.dst.dev), net) ||
1620 				    rt_is_expired(rth))
1621 					continue;
1622 
1623 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1624 
1625 					/* BSD 4.2 compatibility hack :-( */
1626 					if (mtu == 0 &&
1627 					    old_mtu >= dst_mtu(&rth->u.dst) &&
1628 					    old_mtu >= 68 + (iph->ihl << 2))
1629 						old_mtu -= iph->ihl << 2;
1630 
1631 					mtu = guess_mtu(old_mtu);
1632 				}
1633 				if (mtu <= dst_mtu(&rth->u.dst)) {
1634 					if (mtu < dst_mtu(&rth->u.dst)) {
1635 						dst_confirm(&rth->u.dst);
1636 						if (mtu < ip_rt_min_pmtu) {
1637 							mtu = ip_rt_min_pmtu;
1638 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1639 								(1 << RTAX_MTU);
1640 						}
1641 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1642 						dst_set_expires(&rth->u.dst,
1643 							ip_rt_mtu_expires);
1644 					}
1645 					est_mtu = mtu;
1646 				}
1647 			}
1648 			rcu_read_unlock();
1649 		}
1650 	}
1651 	return est_mtu ? : new_mtu;
1652 }
1653 
1654 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1655 {
1656 	if (dst_mtu(dst) > mtu && mtu >= 68 &&
1657 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1658 		if (mtu < ip_rt_min_pmtu) {
1659 			mtu = ip_rt_min_pmtu;
1660 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1661 		}
1662 		dst->metrics[RTAX_MTU-1] = mtu;
1663 		dst_set_expires(dst, ip_rt_mtu_expires);
1664 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1665 	}
1666 }
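
/*
 * Learned path MTUs are clamped to ip_rt_min_pmtu (552 = 512 + 20 + 20 by
 * default); anything lower is raised to that value and the MTU metric is
 * locked, so later ICMP "frag needed" messages are ignored for this entry
 * (see the dst_metric_locked() check in ip_rt_frag_needed()).  The learned
 * value then expires after ip_rt_mtu_expires (10 minutes by default).
 */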
1667 
1668 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1669 {
1670 	return NULL;
1671 }
1672 
1673 static void ipv4_dst_destroy(struct dst_entry *dst)
1674 {
1675 	struct rtable *rt = (struct rtable *) dst;
1676 	struct inet_peer *peer = rt->peer;
1677 	struct in_device *idev = rt->idev;
1678 
1679 	if (peer) {
1680 		rt->peer = NULL;
1681 		inet_putpeer(peer);
1682 	}
1683 
1684 	if (idev) {
1685 		rt->idev = NULL;
1686 		in_dev_put(idev);
1687 	}
1688 }
1689 
1690 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1691 			    int how)
1692 {
1693 	struct rtable *rt = (struct rtable *) dst;
1694 	struct in_device *idev = rt->idev;
1695 	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1696 		struct in_device *loopback_idev =
1697 			in_dev_get(dev_net(dev)->loopback_dev);
1698 		if (loopback_idev) {
1699 			rt->idev = loopback_idev;
1700 			in_dev_put(idev);
1701 		}
1702 	}
1703 }
1704 
1705 static void ipv4_link_failure(struct sk_buff *skb)
1706 {
1707 	struct rtable *rt;
1708 
1709 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1710 
1711 	rt = skb_rtable(skb);
1712 	if (rt)
1713 		dst_set_expires(&rt->u.dst, 0);
1714 }
1715 
1716 static int ip_rt_bug(struct sk_buff *skb)
1717 {
1718 	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1719 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1720 		skb->dev ? skb->dev->name : "?");
1721 	kfree_skb(skb);
1722 	return 0;
1723 }
1724 
1725 /*
1726    We do not cache the source address of the outgoing interface,
1727    because it is used only by the IP RR, TS and SRR options,
1728    so it is out of the fast path.
1729 
1730    BTW remember: "addr" may be unaligned
1731    in IP options!
1732  */
1733 
1734 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1735 {
1736 	__be32 src;
1737 	struct fib_result res;
1738 
1739 	if (rt->fl.iif == 0)
1740 		src = rt->rt_src;
1741 	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1742 		src = FIB_RES_PREFSRC(res);
1743 		fib_res_put(&res);
1744 	} else
1745 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1746 					RT_SCOPE_UNIVERSE);
1747 	memcpy(addr, &src, 4);
1748 }
1749 
1750 #ifdef CONFIG_NET_CLS_ROUTE
1751 static void set_class_tag(struct rtable *rt, u32 tag)
1752 {
1753 	if (!(rt->u.dst.tclassid & 0xFFFF))
1754 		rt->u.dst.tclassid |= tag & 0xFFFF;
1755 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1756 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1757 }
1758 #endif
1759 
1760 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1761 {
1762 	struct fib_info *fi = res->fi;
1763 
1764 	if (fi) {
1765 		if (FIB_RES_GW(*res) &&
1766 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1767 			rt->rt_gateway = FIB_RES_GW(*res);
1768 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1769 		       sizeof(rt->u.dst.metrics));
1770 		if (fi->fib_mtu == 0) {
1771 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1772 			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1773 			    rt->rt_gateway != rt->rt_dst &&
1774 			    rt->u.dst.dev->mtu > 576)
1775 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1776 		}
1777 #ifdef CONFIG_NET_CLS_ROUTE
1778 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1779 #endif
1780 	} else
1781 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1782 
1783 	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1784 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1785 	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1786 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1787 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1788 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1789 				       ip_rt_min_advmss);
1790 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1791 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1792 
1793 #ifdef CONFIG_NET_CLS_ROUTE
1794 #ifdef CONFIG_IP_MULTIPLE_TABLES
1795 	set_class_tag(rt, fib_rules_tclass(res));
1796 #endif
1797 	set_class_tag(rt, itag);
1798 #endif
1799 	rt->rt_type = res->type;
1800 }
1801 
1802 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1803 				u8 tos, struct net_device *dev, int our)
1804 {
1805 	unsigned hash;
1806 	struct rtable *rth;
1807 	__be32 spec_dst;
1808 	struct in_device *in_dev = in_dev_get(dev);
1809 	u32 itag = 0;
1810 
1811 	/* Primary sanity checks. */
1812 
1813 	if (in_dev == NULL)
1814 		return -EINVAL;
1815 
1816 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1817 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1818 		goto e_inval;
1819 
1820 	if (ipv4_is_zeronet(saddr)) {
1821 		if (!ipv4_is_local_multicast(daddr))
1822 			goto e_inval;
1823 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1824 	} else if (fib_validate_source(saddr, 0, tos, 0,
1825 					dev, &spec_dst, &itag) < 0)
1826 		goto e_inval;
1827 
1828 	rth = dst_alloc(&ipv4_dst_ops);
1829 	if (!rth)
1830 		goto e_nobufs;
1831 
1832 	rth->u.dst.output= ip_rt_bug;
1833 
1834 	atomic_set(&rth->u.dst.__refcnt, 1);
1835 	rth->u.dst.flags= DST_HOST;
1836 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1837 		rth->u.dst.flags |= DST_NOPOLICY;
1838 	rth->fl.fl4_dst	= daddr;
1839 	rth->rt_dst	= daddr;
1840 	rth->fl.fl4_tos	= tos;
1841 	rth->fl.mark    = skb->mark;
1842 	rth->fl.fl4_src	= saddr;
1843 	rth->rt_src	= saddr;
1844 #ifdef CONFIG_NET_CLS_ROUTE
1845 	rth->u.dst.tclassid = itag;
1846 #endif
1847 	rth->rt_iif	=
1848 	rth->fl.iif	= dev->ifindex;
1849 	rth->u.dst.dev	= init_net.loopback_dev;
1850 	dev_hold(rth->u.dst.dev);
1851 	rth->idev	= in_dev_get(rth->u.dst.dev);
1852 	rth->fl.oif	= 0;
1853 	rth->rt_gateway	= daddr;
1854 	rth->rt_spec_dst= spec_dst;
1855 	rth->rt_genid	= rt_genid(dev_net(dev));
1856 	rth->rt_flags	= RTCF_MULTICAST;
1857 	rth->rt_type	= RTN_MULTICAST;
1858 	if (our) {
1859 		rth->u.dst.input= ip_local_deliver;
1860 		rth->rt_flags |= RTCF_LOCAL;
1861 	}
1862 
1863 #ifdef CONFIG_IP_MROUTE
1864 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1865 		rth->u.dst.input = ip_mr_input;
1866 #endif
1867 	RT_CACHE_STAT_INC(in_slow_mc);
1868 
1869 	in_dev_put(in_dev);
1870 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1871 	return rt_intern_hash(hash, rth, NULL, skb);
1872 
1873 e_nobufs:
1874 	in_dev_put(in_dev);
1875 	return -ENOBUFS;
1876 
1877 e_inval:
1878 	in_dev_put(in_dev);
1879 	return -EINVAL;
1880 }
1881 
1882 
1883 static void ip_handle_martian_source(struct net_device *dev,
1884 				     struct in_device *in_dev,
1885 				     struct sk_buff *skb,
1886 				     __be32 daddr,
1887 				     __be32 saddr)
1888 {
1889 	RT_CACHE_STAT_INC(in_martian_src);
1890 #ifdef CONFIG_IP_ROUTE_VERBOSE
1891 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1892 		/*
1893 		 *	Per the RFC1812 recommendation: if the source is martian,
1894 		 *	the only hint is the MAC header.
1895 		 */
1896 		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1897 			&daddr, &saddr, dev->name);
1898 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1899 			int i;
1900 			const unsigned char *p = skb_mac_header(skb);
1901 			printk(KERN_WARNING "ll header: ");
1902 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1903 				printk("%02x", *p);
1904 				if (i < (dev->hard_header_len - 1))
1905 					printk(":");
1906 			}
1907 			printk("\n");
1908 		}
1909 	}
1910 #endif
1911 }
1912 
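/*
 * Build a forwarding route cache entry for the input path: validate the
 * source address, work out RTCF_DIRECTSRC/RTCF_DOREDIRECT, and wire the
 * entry up to ip_forward()/ip_output().  The new entry is returned via
 * *result; the caller inserts it into the route cache.
 */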
1913 static int __mkroute_input(struct sk_buff *skb,
1914 			   struct fib_result *res,
1915 			   struct in_device *in_dev,
1916 			   __be32 daddr, __be32 saddr, u32 tos,
1917 			   struct rtable **result)
1918 {
1919 
1920 	struct rtable *rth;
1921 	int err;
1922 	struct in_device *out_dev;
1923 	unsigned flags = 0;
1924 	__be32 spec_dst;
1925 	u32 itag;
1926 
1927 	/* get a working reference to the output device */
1928 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1929 	if (out_dev == NULL) {
1930 		if (net_ratelimit())
1931 			printk(KERN_CRIT
1932 			       "Bug in ip_route_input_slow(). Please report.\n");
1933 		return -EINVAL;
1934 	}
1935 
1936 
1937 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1938 				  in_dev->dev, &spec_dst, &itag);
1939 	if (err < 0) {
1940 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1941 					 saddr);
1942 
1943 		err = -EINVAL;
1944 		goto cleanup;
1945 	}
1946 
1947 	if (err)
1948 		flags |= RTCF_DIRECTSRC;
1949 
1950 	if (out_dev == in_dev && err &&
1951 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1952 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1953 		flags |= RTCF_DOREDIRECT;
1954 
1955 	if (skb->protocol != htons(ETH_P_IP)) {
1956 		/* Not IP (i.e. ARP). Do not create a route if it is
1957 		 * invalid for proxy ARP. DNAT routes are always valid.
1958 		 */
1959 		if (out_dev == in_dev) {
1960 			err = -EINVAL;
1961 			goto cleanup;
1962 		}
1963 	}
1964 
1965 
1966 	rth = dst_alloc(&ipv4_dst_ops);
1967 	if (!rth) {
1968 		err = -ENOBUFS;
1969 		goto cleanup;
1970 	}
1971 
1972 	atomic_set(&rth->u.dst.__refcnt, 1);
1973 	rth->u.dst.flags= DST_HOST;
1974 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1975 		rth->u.dst.flags |= DST_NOPOLICY;
1976 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1977 		rth->u.dst.flags |= DST_NOXFRM;
1978 	rth->fl.fl4_dst	= daddr;
1979 	rth->rt_dst	= daddr;
1980 	rth->fl.fl4_tos	= tos;
1981 	rth->fl.mark    = skb->mark;
1982 	rth->fl.fl4_src	= saddr;
1983 	rth->rt_src	= saddr;
1984 	rth->rt_gateway	= daddr;
1985 	rth->rt_iif 	=
1986 		rth->fl.iif	= in_dev->dev->ifindex;
1987 	rth->u.dst.dev	= (out_dev)->dev;
1988 	dev_hold(rth->u.dst.dev);
1989 	rth->idev	= in_dev_get(rth->u.dst.dev);
1990 	rth->fl.oif 	= 0;
1991 	rth->rt_spec_dst= spec_dst;
1992 
1993 	rth->u.dst.input = ip_forward;
1994 	rth->u.dst.output = ip_output;
1995 	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
1996 
1997 	rt_set_nexthop(rth, res, itag);
1998 
1999 	rth->rt_flags = flags;
2000 
2001 	*result = rth;
2002 	err = 0;
2003  cleanup:
2004 	/* release the working reference to the output device */
2005 	in_dev_put(out_dev);
2006 	return err;
2007 }
2008 
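/*
 * Create an input route cache entry: pick a multipath next hop if
 * configured, build the entry with __mkroute_input() and insert it into
 * the route cache hash.
 */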
2009 static int ip_mkroute_input(struct sk_buff *skb,
2010 			    struct fib_result *res,
2011 			    const struct flowi *fl,
2012 			    struct in_device *in_dev,
2013 			    __be32 daddr, __be32 saddr, u32 tos)
2014 {
2015 	struct rtable* rth = NULL;
2016 	int err;
2017 	unsigned hash;
2018 
2019 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2020 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2021 		fib_select_multipath(fl, res);
2022 #endif
2023 
2024 	/* create a routing cache entry */
2025 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2026 	if (err)
2027 		return err;
2028 
2029 	/* put it into the cache */
2030 	hash = rt_hash(daddr, saddr, fl->iif,
2031 		       rt_genid(dev_net(rth->u.dst.dev)));
2032 	return rt_intern_hash(hash, rth, NULL, skb);
2033 }
2034 
2035 /*
2036  *	NOTE. We drop all packets that have a local source
2037  *	address, because every properly looped-back packet
2038  *	must already have the correct destination attached by the output routine.
2039  *
2040  *	This approach solves two big problems:
2041  *	1. Non-simplex devices are handled properly.
2042  *	2. IP spoofing attempts are filtered out with a 100% guarantee.
2043  */
2044 
2045 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2046 			       u8 tos, struct net_device *dev)
2047 {
2048 	struct fib_result res;
2049 	struct in_device *in_dev = in_dev_get(dev);
2050 	struct flowi fl = { .nl_u = { .ip4_u =
2051 				      { .daddr = daddr,
2052 					.saddr = saddr,
2053 					.tos = tos,
2054 					.scope = RT_SCOPE_UNIVERSE,
2055 				      } },
2056 			    .mark = skb->mark,
2057 			    .iif = dev->ifindex };
2058 	unsigned	flags = 0;
2059 	u32		itag = 0;
2060 	struct rtable * rth;
2061 	unsigned	hash;
2062 	__be32		spec_dst;
2063 	int		err = -EINVAL;
2064 	int		free_res = 0;
2065 	struct net    * net = dev_net(dev);
2066 
2067 	/* IP on this device is disabled. */
2068 
2069 	if (!in_dev)
2070 		goto out;
2071 
2072 	/* Check for the weirdest martians, which cannot be detected
2073 	   by fib_lookup.
2074 	 */
2075 
2076 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2077 	    ipv4_is_loopback(saddr))
2078 		goto martian_source;
2079 
2080 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2081 		goto brd_input;
2082 
2083 	/* Accept zero addresses only for limited broadcast;
2084 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2085 	 */
2086 	if (ipv4_is_zeronet(saddr))
2087 		goto martian_source;
2088 
2089 	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2090 	    ipv4_is_loopback(daddr))
2091 		goto martian_destination;
2092 
2093 	/*
2094 	 *	Now we are ready to route the packet.
2095 	 */
2096 	if ((err = fib_lookup(net, &fl, &res)) != 0) {
2097 		if (!IN_DEV_FORWARD(in_dev))
2098 			goto e_hostunreach;
2099 		goto no_route;
2100 	}
2101 	free_res = 1;
2102 
2103 	RT_CACHE_STAT_INC(in_slow_tot);
2104 
2105 	if (res.type == RTN_BROADCAST)
2106 		goto brd_input;
2107 
2108 	if (res.type == RTN_LOCAL) {
2109 		int result;
2110 		result = fib_validate_source(saddr, daddr, tos,
2111 					     net->loopback_dev->ifindex,
2112 					     dev, &spec_dst, &itag);
2113 		if (result < 0)
2114 			goto martian_source;
2115 		if (result)
2116 			flags |= RTCF_DIRECTSRC;
2117 		spec_dst = daddr;
2118 		goto local_input;
2119 	}
2120 
2121 	if (!IN_DEV_FORWARD(in_dev))
2122 		goto e_hostunreach;
2123 	if (res.type != RTN_UNICAST)
2124 		goto martian_destination;
2125 
2126 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2127 done:
2128 	in_dev_put(in_dev);
2129 	if (free_res)
2130 		fib_res_put(&res);
2131 out:	return err;
2132 
2133 brd_input:
2134 	if (skb->protocol != htons(ETH_P_IP))
2135 		goto e_inval;
2136 
2137 	if (ipv4_is_zeronet(saddr))
2138 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2139 	else {
2140 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2141 					  &itag);
2142 		if (err < 0)
2143 			goto martian_source;
2144 		if (err)
2145 			flags |= RTCF_DIRECTSRC;
2146 	}
2147 	flags |= RTCF_BROADCAST;
2148 	res.type = RTN_BROADCAST;
2149 	RT_CACHE_STAT_INC(in_brd);
2150 
2151 local_input:
2152 	rth = dst_alloc(&ipv4_dst_ops);
2153 	if (!rth)
2154 		goto e_nobufs;
2155 
2156 	rth->u.dst.output= ip_rt_bug;
2157 	rth->rt_genid = rt_genid(net);
2158 
2159 	atomic_set(&rth->u.dst.__refcnt, 1);
2160 	rth->u.dst.flags= DST_HOST;
2161 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2162 		rth->u.dst.flags |= DST_NOPOLICY;
2163 	rth->fl.fl4_dst	= daddr;
2164 	rth->rt_dst	= daddr;
2165 	rth->fl.fl4_tos	= tos;
2166 	rth->fl.mark    = skb->mark;
2167 	rth->fl.fl4_src	= saddr;
2168 	rth->rt_src	= saddr;
2169 #ifdef CONFIG_NET_CLS_ROUTE
2170 	rth->u.dst.tclassid = itag;
2171 #endif
2172 	rth->rt_iif	=
2173 	rth->fl.iif	= dev->ifindex;
2174 	rth->u.dst.dev	= net->loopback_dev;
2175 	dev_hold(rth->u.dst.dev);
2176 	rth->idev	= in_dev_get(rth->u.dst.dev);
2177 	rth->rt_gateway	= daddr;
2178 	rth->rt_spec_dst= spec_dst;
2179 	rth->u.dst.input= ip_local_deliver;
2180 	rth->rt_flags 	= flags|RTCF_LOCAL;
2181 	if (res.type == RTN_UNREACHABLE) {
2182 		rth->u.dst.input= ip_error;
2183 		rth->u.dst.error= -err;
2184 		rth->rt_flags 	&= ~RTCF_LOCAL;
2185 	}
2186 	rth->rt_type	= res.type;
2187 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2188 	err = rt_intern_hash(hash, rth, NULL, skb);
2189 	goto done;
2190 
2191 no_route:
2192 	RT_CACHE_STAT_INC(in_no_route);
2193 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2194 	res.type = RTN_UNREACHABLE;
2195 	if (err == -ESRCH)
2196 		err = -ENETUNREACH;
2197 	goto local_input;
2198 
2199 	/*
2200 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2201 	 */
2202 martian_destination:
2203 	RT_CACHE_STAT_INC(in_martian_dst);
2204 #ifdef CONFIG_IP_ROUTE_VERBOSE
2205 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2206 		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2207 			&daddr, &saddr, dev->name);
2208 #endif
2209 
2210 e_hostunreach:
2211 	err = -EHOSTUNREACH;
2212 	goto done;
2213 
2214 e_inval:
2215 	err = -EINVAL;
2216 	goto done;
2217 
2218 e_nobufs:
2219 	err = -ENOBUFS;
2220 	goto done;
2221 
2222 martian_source:
2223 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2224 	goto e_inval;
2225 }
2226 
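/*
 * Main entry point for input route resolution: consult the route cache
 * under RCU first; on a miss, hand multicast destinations to
 * ip_route_input_mc() and everything else to ip_route_input_slow().
 */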
2227 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2228 		   u8 tos, struct net_device *dev)
2229 {
2230 	struct rtable * rth;
2231 	unsigned	hash;
2232 	int iif = dev->ifindex;
2233 	struct net *net;
2234 
2235 	net = dev_net(dev);
2236 
2237 	if (!rt_caching(net))
2238 		goto skip_cache;
2239 
2240 	tos &= IPTOS_RT_MASK;
2241 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2242 
2243 	rcu_read_lock();
2244 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2245 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
2246 		if (((rth->fl.fl4_dst ^ daddr) |
2247 		     (rth->fl.fl4_src ^ saddr) |
2248 		     (rth->fl.iif ^ iif) |
2249 		     rth->fl.oif |
2250 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
2251 		    rth->fl.mark == skb->mark &&
2252 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2253 		    !rt_is_expired(rth)) {
2254 			dst_use(&rth->u.dst, jiffies);
2255 			RT_CACHE_STAT_INC(in_hit);
2256 			rcu_read_unlock();
2257 			skb_dst_set(skb, &rth->u.dst);
2258 			return 0;
2259 		}
2260 		RT_CACHE_STAT_INC(in_hlist_search);
2261 	}
2262 	rcu_read_unlock();
2263 
2264 skip_cache:
2265 	/* Multicast recognition logic has been moved from the route cache to here.
2266 	   The problem was that too many Ethernet cards have broken/missing
2267 	   hardware multicast filters :-( As a result, a host on a multicast
2268 	   network acquires a lot of useless route cache entries, e.g. for
2269 	   SDR messages from all over the world. Now we try to get rid of them.
2270 	   Really, provided the software IP multicast filter is organized
2271 	   reasonably (at least, hashed), it does not result in a slowdown
2272 	   compared with route cache reject entries.
2273 	   Note that multicast routers are not affected, because a
2274 	   route cache entry is created for them eventually.
2275 	 */
2276 	if (ipv4_is_multicast(daddr)) {
2277 		struct in_device *in_dev;
2278 
2279 		rcu_read_lock();
2280 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2281 			int our = ip_check_mc(in_dev, daddr, saddr,
2282 				ip_hdr(skb)->protocol);
2283 			if (our
2284 #ifdef CONFIG_IP_MROUTE
2285 			    || (!ipv4_is_local_multicast(daddr) &&
2286 				IN_DEV_MFORWARD(in_dev))
2287 #endif
2288 			    ) {
2289 				rcu_read_unlock();
2290 				return ip_route_input_mc(skb, daddr, saddr,
2291 							 tos, dev, our);
2292 			}
2293 		}
2294 		rcu_read_unlock();
2295 		return -EINVAL;
2296 	}
2297 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2298 }
2299 
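/*
 * Build an output route cache entry for the flow described by "fl" and
 * "oldflp", leaving via "dev_out".  Local, broadcast and multicast
 * deliveries get the appropriate input/output handlers; the new entry is
 * returned via *result.
 */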
2300 static int __mkroute_output(struct rtable **result,
2301 			    struct fib_result *res,
2302 			    const struct flowi *fl,
2303 			    const struct flowi *oldflp,
2304 			    struct net_device *dev_out,
2305 			    unsigned flags)
2306 {
2307 	struct rtable *rth;
2308 	struct in_device *in_dev;
2309 	u32 tos = RT_FL_TOS(oldflp);
2310 	int err = 0;
2311 
2312 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2313 		return -EINVAL;
2314 
2315 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2316 		res->type = RTN_BROADCAST;
2317 	else if (ipv4_is_multicast(fl->fl4_dst))
2318 		res->type = RTN_MULTICAST;
2319 	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2320 		return -EINVAL;
2321 
2322 	if (dev_out->flags & IFF_LOOPBACK)
2323 		flags |= RTCF_LOCAL;
2324 
2325 	/* get a working reference to the inet device */
2326 	in_dev = in_dev_get(dev_out);
2327 	if (!in_dev)
2328 		return -EINVAL;
2329 
2330 	if (res->type == RTN_BROADCAST) {
2331 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2332 		if (res->fi) {
2333 			fib_info_put(res->fi);
2334 			res->fi = NULL;
2335 		}
2336 	} else if (res->type == RTN_MULTICAST) {
2337 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2338 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2339 				 oldflp->proto))
2340 			flags &= ~RTCF_LOCAL;
2341 		/* If a multicast route does not exist, use the
2342 		   default one, but do not use a gateway in this case.
2343 		   Yes, it is a hack.
2344 		 */
2345 		if (res->fi && res->prefixlen < 4) {
2346 			fib_info_put(res->fi);
2347 			res->fi = NULL;
2348 		}
2349 	}
2350 
2351 
2352 	rth = dst_alloc(&ipv4_dst_ops);
2353 	if (!rth) {
2354 		err = -ENOBUFS;
2355 		goto cleanup;
2356 	}
2357 
2358 	atomic_set(&rth->u.dst.__refcnt, 1);
2359 	rth->u.dst.flags= DST_HOST;
2360 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2361 		rth->u.dst.flags |= DST_NOXFRM;
2362 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2363 		rth->u.dst.flags |= DST_NOPOLICY;
2364 
2365 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2366 	rth->fl.fl4_tos	= tos;
2367 	rth->fl.fl4_src	= oldflp->fl4_src;
2368 	rth->fl.oif	= oldflp->oif;
2369 	rth->fl.mark    = oldflp->mark;
2370 	rth->rt_dst	= fl->fl4_dst;
2371 	rth->rt_src	= fl->fl4_src;
2372 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2373 	/* get references to the devices that are to be held by the routing
2374 	   cache entry */
2375 	rth->u.dst.dev	= dev_out;
2376 	dev_hold(dev_out);
2377 	rth->idev	= in_dev_get(dev_out);
2378 	rth->rt_gateway = fl->fl4_dst;
2379 	rth->rt_spec_dst= fl->fl4_src;
2380 
2381 	rth->u.dst.output=ip_output;
2382 	rth->rt_genid = rt_genid(dev_net(dev_out));
2383 
2384 	RT_CACHE_STAT_INC(out_slow_tot);
2385 
2386 	if (flags & RTCF_LOCAL) {
2387 		rth->u.dst.input = ip_local_deliver;
2388 		rth->rt_spec_dst = fl->fl4_dst;
2389 	}
2390 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2391 		rth->rt_spec_dst = fl->fl4_src;
2392 		if (flags & RTCF_LOCAL &&
2393 		    !(dev_out->flags & IFF_LOOPBACK)) {
2394 			rth->u.dst.output = ip_mc_output;
2395 			RT_CACHE_STAT_INC(out_slow_mc);
2396 		}
2397 #ifdef CONFIG_IP_MROUTE
2398 		if (res->type == RTN_MULTICAST) {
2399 			if (IN_DEV_MFORWARD(in_dev) &&
2400 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2401 				rth->u.dst.input = ip_mr_input;
2402 				rth->u.dst.output = ip_mc_output;
2403 			}
2404 		}
2405 #endif
2406 	}
2407 
2408 	rt_set_nexthop(rth, res, 0);
2409 
2410 	rth->rt_flags = flags;
2411 
2412 	*result = rth;
2413  cleanup:
2414 	/* release the working reference to the inet device */
2415 	in_dev_put(in_dev);
2416 
2417 	return err;
2418 }
2419 
2420 static int ip_mkroute_output(struct rtable **rp,
2421 			     struct fib_result *res,
2422 			     const struct flowi *fl,
2423 			     const struct flowi *oldflp,
2424 			     struct net_device *dev_out,
2425 			     unsigned flags)
2426 {
2427 	struct rtable *rth = NULL;
2428 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2429 	unsigned hash;
2430 	if (err == 0) {
2431 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2432 			       rt_genid(dev_net(dev_out)));
2433 		err = rt_intern_hash(hash, rth, rp, NULL);
2434 	}
2435 
2436 	return err;
2437 }
2438 
2439 /*
2440  * Major route resolver routine.
2441  */
2442 
2443 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2444 				const struct flowi *oldflp)
2445 {
2446 	u32 tos	= RT_FL_TOS(oldflp);
2447 	struct flowi fl = { .nl_u = { .ip4_u =
2448 				      { .daddr = oldflp->fl4_dst,
2449 					.saddr = oldflp->fl4_src,
2450 					.tos = tos & IPTOS_RT_MASK,
2451 					.scope = ((tos & RTO_ONLINK) ?
2452 						  RT_SCOPE_LINK :
2453 						  RT_SCOPE_UNIVERSE),
2454 				      } },
2455 			    .mark = oldflp->mark,
2456 			    .iif = net->loopback_dev->ifindex,
2457 			    .oif = oldflp->oif };
2458 	struct fib_result res;
2459 	unsigned flags = 0;
2460 	struct net_device *dev_out = NULL;
2461 	int free_res = 0;
2462 	int err;
2463 
2464 
2465 	res.fi		= NULL;
2466 #ifdef CONFIG_IP_MULTIPLE_TABLES
2467 	res.r		= NULL;
2468 #endif
2469 
2470 	if (oldflp->fl4_src) {
2471 		err = -EINVAL;
2472 		if (ipv4_is_multicast(oldflp->fl4_src) ||
2473 		    ipv4_is_lbcast(oldflp->fl4_src) ||
2474 		    ipv4_is_zeronet(oldflp->fl4_src))
2475 			goto out;
2476 
2477 		/* I removed the check for oif == dev_out->oif here.
2478 		   It was wrong for two reasons:
2479 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2480 		      is assigned to multiple interfaces.
2481 		   2. Moreover, we are allowed to send packets with the saddr
2482 		      of another iface. --ANK
2483 		 */
2484 
2485 		if (oldflp->oif == 0
2486 		    && (ipv4_is_multicast(oldflp->fl4_dst) ||
2487 			oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2488 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2489 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2490 			if (dev_out == NULL)
2491 				goto out;
2492 
2493 			/* Special hack: the user can direct multicasts
2494 			   and limited broadcast via the necessary interface
2495 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2496 			   This hack is not just for fun; it allows
2497 			   vic, vat and friends to work.
2498 			   They bind a socket to loopback, set the ttl to zero
2499 			   and expect that it will work.
2500 			   From the viewpoint of the routing cache they are broken,
2501 			   because we are not allowed to build a multicast path
2502 			   with a loopback source address (look, the routing cache
2503 			   cannot know that the ttl is zero, so the packet
2504 			   will not leave this host and the route is valid).
2505 			   Luckily, this hack is a good workaround.
2506 			 */
2507 
2508 			fl.oif = dev_out->ifindex;
2509 			goto make_route;
2510 		}
2511 
2512 		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2513 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2514 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2515 			if (dev_out == NULL)
2516 				goto out;
2517 			dev_put(dev_out);
2518 			dev_out = NULL;
2519 		}
2520 	}
2521 
2522 
2523 	if (oldflp->oif) {
2524 		dev_out = dev_get_by_index(net, oldflp->oif);
2525 		err = -ENODEV;
2526 		if (dev_out == NULL)
2527 			goto out;
2528 
2529 		/* RACE: Check return value of inet_select_addr instead. */
2530 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2531 			dev_put(dev_out);
2532 			goto out;	/* Wrong error code */
2533 		}
2534 
2535 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2536 		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2537 			if (!fl.fl4_src)
2538 				fl.fl4_src = inet_select_addr(dev_out, 0,
2539 							      RT_SCOPE_LINK);
2540 			goto make_route;
2541 		}
2542 		if (!fl.fl4_src) {
2543 			if (ipv4_is_multicast(oldflp->fl4_dst))
2544 				fl.fl4_src = inet_select_addr(dev_out, 0,
2545 							      fl.fl4_scope);
2546 			else if (!oldflp->fl4_dst)
2547 				fl.fl4_src = inet_select_addr(dev_out, 0,
2548 							      RT_SCOPE_HOST);
2549 		}
2550 	}
2551 
2552 	if (!fl.fl4_dst) {
2553 		fl.fl4_dst = fl.fl4_src;
2554 		if (!fl.fl4_dst)
2555 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2556 		if (dev_out)
2557 			dev_put(dev_out);
2558 		dev_out = net->loopback_dev;
2559 		dev_hold(dev_out);
2560 		fl.oif = net->loopback_dev->ifindex;
2561 		res.type = RTN_LOCAL;
2562 		flags |= RTCF_LOCAL;
2563 		goto make_route;
2564 	}
2565 
2566 	if (fib_lookup(net, &fl, &res)) {
2567 		res.fi = NULL;
2568 		if (oldflp->oif) {
2569 			/* Apparently, the routing tables are wrong. Assume
2570 			   that the destination is on-link.
2571 
2572 			   WHY? DW.
2573 			   Because we are allowed to send to an iface
2574 			   even if it has NO routes and NO assigned
2575 			   addresses. When oif is specified, the routing
2576 			   tables are looked up with only one purpose:
2577 			   to catch whether the destination is gatewayed rather
2578 			   than direct. Moreover, if MSG_DONTROUTE is set,
2579 			   we send the packet, ignoring both the routing tables
2580 			   and the ifaddr state. --ANK
2581 
2582 
2583 			   We could do this even when oif is unknown,
2584 			   likely as IPv6 does, but we do not.
2585 			 */
2586 
2587 			if (fl.fl4_src == 0)
2588 				fl.fl4_src = inet_select_addr(dev_out, 0,
2589 							      RT_SCOPE_LINK);
2590 			res.type = RTN_UNICAST;
2591 			goto make_route;
2592 		}
2593 		if (dev_out)
2594 			dev_put(dev_out);
2595 		err = -ENETUNREACH;
2596 		goto out;
2597 	}
2598 	free_res = 1;
2599 
2600 	if (res.type == RTN_LOCAL) {
2601 		if (!fl.fl4_src)
2602 			fl.fl4_src = fl.fl4_dst;
2603 		if (dev_out)
2604 			dev_put(dev_out);
2605 		dev_out = net->loopback_dev;
2606 		dev_hold(dev_out);
2607 		fl.oif = dev_out->ifindex;
2608 		if (res.fi)
2609 			fib_info_put(res.fi);
2610 		res.fi = NULL;
2611 		flags |= RTCF_LOCAL;
2612 		goto make_route;
2613 	}
2614 
2615 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2616 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2617 		fib_select_multipath(&fl, &res);
2618 	else
2619 #endif
2620 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2621 		fib_select_default(net, &fl, &res);
2622 
2623 	if (!fl.fl4_src)
2624 		fl.fl4_src = FIB_RES_PREFSRC(res);
2625 
2626 	if (dev_out)
2627 		dev_put(dev_out);
2628 	dev_out = FIB_RES_DEV(res);
2629 	dev_hold(dev_out);
2630 	fl.oif = dev_out->ifindex;
2631 
2632 
2633 make_route:
2634 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2635 
2636 
2637 	if (free_res)
2638 		fib_res_put(&res);
2639 	if (dev_out)
2640 		dev_put(dev_out);
2641 out:	return err;
2642 }
2643 
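/*
 * Cached output route lookup: scan the route cache for a matching entry;
 * fall back to ip_route_output_slow() on a miss or when caching is
 * disabled.  No xfrm resolution is done here.
 */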
2644 int __ip_route_output_key(struct net *net, struct rtable **rp,
2645 			  const struct flowi *flp)
2646 {
2647 	unsigned hash;
2648 	struct rtable *rth;
2649 
2650 	if (!rt_caching(net))
2651 		goto slow_output;
2652 
2653 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2654 
2655 	rcu_read_lock_bh();
2656 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2657 		rth = rcu_dereference(rth->u.dst.rt_next)) {
2658 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2659 		    rth->fl.fl4_src == flp->fl4_src &&
2660 		    rth->fl.iif == 0 &&
2661 		    rth->fl.oif == flp->oif &&
2662 		    rth->fl.mark == flp->mark &&
2663 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2664 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2665 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2666 		    !rt_is_expired(rth)) {
2667 			dst_use(&rth->u.dst, jiffies);
2668 			RT_CACHE_STAT_INC(out_hit);
2669 			rcu_read_unlock_bh();
2670 			*rp = rth;
2671 			return 0;
2672 		}
2673 		RT_CACHE_STAT_INC(out_hlist_search);
2674 	}
2675 	rcu_read_unlock_bh();
2676 
2677 slow_output:
2678 	return ip_route_output_slow(net, rp, flp);
2679 }
2680 
2681 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2682 
2683 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2684 {
2685 }
2686 
2687 static struct dst_ops ipv4_dst_blackhole_ops = {
2688 	.family			=	AF_INET,
2689 	.protocol		=	cpu_to_be16(ETH_P_IP),
2690 	.destroy		=	ipv4_dst_destroy,
2691 	.check			=	ipv4_dst_check,
2692 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2693 	.entries		=	ATOMIC_INIT(0),
2694 };
2695 
2696 
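/*
 * Replace *rp with a "blackhole" copy of the route whose input and output
 * handlers simply discard packets.  Used by ip_route_output_flow() when
 * __xfrm_lookup() returns -EREMOTE.
 */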
2697 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2698 {
2699 	struct rtable *ort = *rp;
2700 	struct rtable *rt = (struct rtable *)
2701 		dst_alloc(&ipv4_dst_blackhole_ops);
2702 
2703 	if (rt) {
2704 		struct dst_entry *new = &rt->u.dst;
2705 
2706 		atomic_set(&new->__refcnt, 1);
2707 		new->__use = 1;
2708 		new->input = dst_discard;
2709 		new->output = dst_discard;
2710 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2711 
2712 		new->dev = ort->u.dst.dev;
2713 		if (new->dev)
2714 			dev_hold(new->dev);
2715 
2716 		rt->fl = ort->fl;
2717 
2718 		rt->idev = ort->idev;
2719 		if (rt->idev)
2720 			in_dev_hold(rt->idev);
2721 		rt->rt_genid = rt_genid(net);
2722 		rt->rt_flags = ort->rt_flags;
2723 		rt->rt_type = ort->rt_type;
2724 		rt->rt_dst = ort->rt_dst;
2725 		rt->rt_src = ort->rt_src;
2726 		rt->rt_iif = ort->rt_iif;
2727 		rt->rt_gateway = ort->rt_gateway;
2728 		rt->rt_spec_dst = ort->rt_spec_dst;
2729 		rt->peer = ort->peer;
2730 		if (rt->peer)
2731 			atomic_inc(&rt->peer->refcnt);
2732 
2733 		dst_free(new);
2734 	}
2735 
2736 	dst_release(&(*rp)->u.dst);
2737 	*rp = rt;
2738 	return (rt ? 0 : -ENOMEM);
2739 }
2740 
2741 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2742 			 struct sock *sk, int flags)
2743 {
2744 	int err;
2745 
2746 	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2747 		return err;
2748 
2749 	if (flp->proto) {
2750 		if (!flp->fl4_src)
2751 			flp->fl4_src = (*rp)->rt_src;
2752 		if (!flp->fl4_dst)
2753 			flp->fl4_dst = (*rp)->rt_dst;
2754 		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2755 				    flags ? XFRM_LOOKUP_WAIT : 0);
2756 		if (err == -EREMOTE)
2757 			err = ipv4_dst_blackhole(net, rp, flp);
2758 
2759 		return err;
2760 	}
2761 
2762 	return 0;
2763 }
2764 
2765 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2766 
2767 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2768 {
2769 	return ip_route_output_flow(net, rp, flp, NULL, 0);
2770 }
2771 
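/*
 * Fill a netlink route message (event and flags supplied by the caller)
 * describing the route cache entry attached to "skb".
 */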
2772 static int rt_fill_info(struct net *net,
2773 			struct sk_buff *skb, u32 pid, u32 seq, int event,
2774 			int nowait, unsigned int flags)
2775 {
2776 	struct rtable *rt = skb_rtable(skb);
2777 	struct rtmsg *r;
2778 	struct nlmsghdr *nlh;
2779 	long expires;
2780 	u32 id = 0, ts = 0, tsage = 0, error;
2781 
2782 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2783 	if (nlh == NULL)
2784 		return -EMSGSIZE;
2785 
2786 	r = nlmsg_data(nlh);
2787 	r->rtm_family	 = AF_INET;
2788 	r->rtm_dst_len	= 32;
2789 	r->rtm_src_len	= 0;
2790 	r->rtm_tos	= rt->fl.fl4_tos;
2791 	r->rtm_table	= RT_TABLE_MAIN;
2792 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2793 	r->rtm_type	= rt->rt_type;
2794 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2795 	r->rtm_protocol = RTPROT_UNSPEC;
2796 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2797 	if (rt->rt_flags & RTCF_NOTIFY)
2798 		r->rtm_flags |= RTM_F_NOTIFY;
2799 
2800 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2801 
2802 	if (rt->fl.fl4_src) {
2803 		r->rtm_src_len = 32;
2804 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2805 	}
2806 	if (rt->u.dst.dev)
2807 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2808 #ifdef CONFIG_NET_CLS_ROUTE
2809 	if (rt->u.dst.tclassid)
2810 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2811 #endif
2812 	if (rt->fl.iif)
2813 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2814 	else if (rt->rt_src != rt->fl.fl4_src)
2815 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2816 
2817 	if (rt->rt_dst != rt->rt_gateway)
2818 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2819 
2820 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2821 		goto nla_put_failure;
2822 
2823 	error = rt->u.dst.error;
2824 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2825 	if (rt->peer) {
2826 		id = rt->peer->ip_id_count;
2827 		if (rt->peer->tcp_ts_stamp) {
2828 			ts = rt->peer->tcp_ts;
2829 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2830 		}
2831 	}
2832 
2833 	if (rt->fl.iif) {
2834 #ifdef CONFIG_IP_MROUTE
2835 		__be32 dst = rt->rt_dst;
2836 
2837 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2838 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2839 			int err = ipmr_get_route(net, skb, r, nowait);
2840 			if (err <= 0) {
2841 				if (!nowait) {
2842 					if (err == 0)
2843 						return 0;
2844 					goto nla_put_failure;
2845 				} else {
2846 					if (err == -EMSGSIZE)
2847 						goto nla_put_failure;
2848 					error = err;
2849 				}
2850 			}
2851 		} else
2852 #endif
2853 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2854 	}
2855 
2856 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2857 			       expires, error) < 0)
2858 		goto nla_put_failure;
2859 
2860 	return nlmsg_end(skb, nlh);
2861 
2862 nla_put_failure:
2863 	nlmsg_cancel(skb, nlh);
2864 	return -EMSGSIZE;
2865 }
2866 
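/*
 * RTM_GETROUTE handler: build a dummy skb, resolve the request either via
 * ip_route_input() (when RTA_IIF is given) or ip_route_output_key(), and
 * return the result to the requester through rt_fill_info().
 */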
2867 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2868 {
2869 	struct net *net = sock_net(in_skb->sk);
2870 	struct rtmsg *rtm;
2871 	struct nlattr *tb[RTA_MAX+1];
2872 	struct rtable *rt = NULL;
2873 	__be32 dst = 0;
2874 	__be32 src = 0;
2875 	u32 iif;
2876 	int err;
2877 	struct sk_buff *skb;
2878 
2879 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2880 	if (err < 0)
2881 		goto errout;
2882 
2883 	rtm = nlmsg_data(nlh);
2884 
2885 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2886 	if (skb == NULL) {
2887 		err = -ENOBUFS;
2888 		goto errout;
2889 	}
2890 
2891 	/* Reserve room for dummy headers; this skb can pass
2892 	   through a good chunk of the routing engine.
2893 	 */
2894 	skb_reset_mac_header(skb);
2895 	skb_reset_network_header(skb);
2896 
2897 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2898 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2899 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2900 
2901 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2902 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2903 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2904 
2905 	if (iif) {
2906 		struct net_device *dev;
2907 
2908 		dev = __dev_get_by_index(net, iif);
2909 		if (dev == NULL) {
2910 			err = -ENODEV;
2911 			goto errout_free;
2912 		}
2913 
2914 		skb->protocol	= htons(ETH_P_IP);
2915 		skb->dev	= dev;
2916 		local_bh_disable();
2917 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2918 		local_bh_enable();
2919 
2920 		rt = skb_rtable(skb);
2921 		if (err == 0 && rt->u.dst.error)
2922 			err = -rt->u.dst.error;
2923 	} else {
2924 		struct flowi fl = {
2925 			.nl_u = {
2926 				.ip4_u = {
2927 					.daddr = dst,
2928 					.saddr = src,
2929 					.tos = rtm->rtm_tos,
2930 				},
2931 			},
2932 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2933 		};
2934 		err = ip_route_output_key(net, &rt, &fl);
2935 	}
2936 
2937 	if (err)
2938 		goto errout_free;
2939 
2940 	skb_dst_set(skb, &rt->u.dst);
2941 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2942 		rt->rt_flags |= RTCF_NOTIFY;
2943 
2944 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2945 			   RTM_NEWROUTE, 0, 0);
2946 	if (err <= 0)
2947 		goto errout_free;
2948 
2949 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2950 errout:
2951 	return err;
2952 
2953 errout_free:
2954 	kfree_skb(skb);
2955 	goto errout;
2956 }
2957 
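/*
 * Dump the route cache to netlink, one RTM_NEWROUTE message per entry,
 * resuming from the hash bucket and index saved in cb->args[].
 */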
2958 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2959 {
2960 	struct rtable *rt;
2961 	int h, s_h;
2962 	int idx, s_idx;
2963 	struct net *net;
2964 
2965 	net = sock_net(skb->sk);
2966 
2967 	s_h = cb->args[0];
2968 	if (s_h < 0)
2969 		s_h = 0;
2970 	s_idx = idx = cb->args[1];
2971 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2972 		if (!rt_hash_table[h].chain)
2973 			continue;
2974 		rcu_read_lock_bh();
2975 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2976 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2977 			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2978 				continue;
2979 			if (rt_is_expired(rt))
2980 				continue;
2981 			skb_dst_set(skb, dst_clone(&rt->u.dst));
2982 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
2983 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2984 					 1, NLM_F_MULTI) <= 0) {
2985 				skb_dst_drop(skb);
2986 				rcu_read_unlock_bh();
2987 				goto done;
2988 			}
2989 			skb_dst_drop(skb);
2990 		}
2991 		rcu_read_unlock_bh();
2992 	}
2993 
2994 done:
2995 	cb->args[0] = h;
2996 	cb->args[1] = idx;
2997 	return skb->len;
2998 }
2999 
3000 void ip_rt_multicast_event(struct in_device *in_dev)
3001 {
3002 	rt_cache_flush(dev_net(in_dev->dev), 0);
3003 }
3004 
3005 #ifdef CONFIG_SYSCTL
3006 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3007 					struct file *filp, void __user *buffer,
3008 					size_t *lenp, loff_t *ppos)
3009 {
3010 	if (write) {
3011 		int flush_delay;
3012 		ctl_table ctl;
3013 		struct net *net;
3014 
3015 		memcpy(&ctl, __ctl, sizeof(ctl));
3016 		ctl.data = &flush_delay;
3017 		proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
3018 
3019 		net = (struct net *)__ctl->extra1;
3020 		rt_cache_flush(net, flush_delay);
3021 		return 0;
3022 	}
3023 
3024 	return -EINVAL;
3025 }
3026 
3027 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
3028 						void __user *oldval,
3029 						size_t __user *oldlenp,
3030 						void __user *newval,
3031 						size_t newlen)
3032 {
3033 	int delay;
3034 	struct net *net;
3035 	if (newlen != sizeof(int))
3036 		return -EINVAL;
3037 	if (get_user(delay, (int __user *)newval))
3038 		return -EFAULT;
3039 	net = (struct net *)table->extra1;
3040 	rt_cache_flush(net, delay);
3041 	return 0;
3042 }
3043 
3044 static void rt_secret_reschedule(int old)
3045 {
3046 	struct net *net;
3047 	int new = ip_rt_secret_interval;
3048 	int diff = new - old;
3049 
3050 	if (!diff)
3051 		return;
3052 
3053 	rtnl_lock();
3054 	for_each_net(net) {
3055 		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3056 
3057 		if (!new)
3058 			continue;
3059 
3060 		if (deleted) {
3061 			long time = net->ipv4.rt_secret_timer.expires - jiffies;
3062 
3063 			if (time <= 0 || (time += diff) <= 0)
3064 				time = 0;
3065 
3066 			net->ipv4.rt_secret_timer.expires = time;
3067 		} else
3068 			net->ipv4.rt_secret_timer.expires = new;
3069 
3070 		net->ipv4.rt_secret_timer.expires += jiffies;
3071 		add_timer(&net->ipv4.rt_secret_timer);
3072 	}
3073 	rtnl_unlock();
3074 }
3075 
3076 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3077 					  struct file *filp,
3078 					  void __user *buffer, size_t *lenp,
3079 					  loff_t *ppos)
3080 {
3081 	int old = ip_rt_secret_interval;
3082 	int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
3083 
3084 	rt_secret_reschedule(old);
3085 
3086 	return ret;
3087 }
3088 
3089 static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
3090 						   void __user *oldval,
3091 						   size_t __user *oldlenp,
3092 						   void __user *newval,
3093 						   size_t newlen)
3094 {
3095 	int old = ip_rt_secret_interval;
3096 	int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
3097 
3098 	rt_secret_reschedule(old);
3099 
3100 	return ret;
3101 }
3102 
3103 static ctl_table ipv4_route_table[] = {
3104 	{
3105 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
3106 		.procname	= "gc_thresh",
3107 		.data		= &ipv4_dst_ops.gc_thresh,
3108 		.maxlen		= sizeof(int),
3109 		.mode		= 0644,
3110 		.proc_handler	= proc_dointvec,
3111 	},
3112 	{
3113 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
3114 		.procname	= "max_size",
3115 		.data		= &ip_rt_max_size,
3116 		.maxlen		= sizeof(int),
3117 		.mode		= 0644,
3118 		.proc_handler	= proc_dointvec,
3119 	},
3120 	{
3121 		/*  Deprecated. Use gc_min_interval_ms */
3122 
3123 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3124 		.procname	= "gc_min_interval",
3125 		.data		= &ip_rt_gc_min_interval,
3126 		.maxlen		= sizeof(int),
3127 		.mode		= 0644,
3128 		.proc_handler	= proc_dointvec_jiffies,
3129 		.strategy	= sysctl_jiffies,
3130 	},
3131 	{
3132 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3133 		.procname	= "gc_min_interval_ms",
3134 		.data		= &ip_rt_gc_min_interval,
3135 		.maxlen		= sizeof(int),
3136 		.mode		= 0644,
3137 		.proc_handler	= proc_dointvec_ms_jiffies,
3138 		.strategy	= sysctl_ms_jiffies,
3139 	},
3140 	{
3141 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
3142 		.procname	= "gc_timeout",
3143 		.data		= &ip_rt_gc_timeout,
3144 		.maxlen		= sizeof(int),
3145 		.mode		= 0644,
3146 		.proc_handler	= proc_dointvec_jiffies,
3147 		.strategy	= sysctl_jiffies,
3148 	},
3149 	{
3150 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
3151 		.procname	= "gc_interval",
3152 		.data		= &ip_rt_gc_interval,
3153 		.maxlen		= sizeof(int),
3154 		.mode		= 0644,
3155 		.proc_handler	= proc_dointvec_jiffies,
3156 		.strategy	= sysctl_jiffies,
3157 	},
3158 	{
3159 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
3160 		.procname	= "redirect_load",
3161 		.data		= &ip_rt_redirect_load,
3162 		.maxlen		= sizeof(int),
3163 		.mode		= 0644,
3164 		.proc_handler	= proc_dointvec,
3165 	},
3166 	{
3167 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
3168 		.procname	= "redirect_number",
3169 		.data		= &ip_rt_redirect_number,
3170 		.maxlen		= sizeof(int),
3171 		.mode		= 0644,
3172 		.proc_handler	= proc_dointvec,
3173 	},
3174 	{
3175 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
3176 		.procname	= "redirect_silence",
3177 		.data		= &ip_rt_redirect_silence,
3178 		.maxlen		= sizeof(int),
3179 		.mode		= 0644,
3180 		.proc_handler	= proc_dointvec,
3181 	},
3182 	{
3183 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
3184 		.procname	= "error_cost",
3185 		.data		= &ip_rt_error_cost,
3186 		.maxlen		= sizeof(int),
3187 		.mode		= 0644,
3188 		.proc_handler	= proc_dointvec,
3189 	},
3190 	{
3191 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
3192 		.procname	= "error_burst",
3193 		.data		= &ip_rt_error_burst,
3194 		.maxlen		= sizeof(int),
3195 		.mode		= 0644,
3196 		.proc_handler	= proc_dointvec,
3197 	},
3198 	{
3199 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
3200 		.procname	= "gc_elasticity",
3201 		.data		= &ip_rt_gc_elasticity,
3202 		.maxlen		= sizeof(int),
3203 		.mode		= 0644,
3204 		.proc_handler	= proc_dointvec,
3205 	},
3206 	{
3207 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
3208 		.procname	= "mtu_expires",
3209 		.data		= &ip_rt_mtu_expires,
3210 		.maxlen		= sizeof(int),
3211 		.mode		= 0644,
3212 		.proc_handler	= proc_dointvec_jiffies,
3213 		.strategy	= sysctl_jiffies,
3214 	},
3215 	{
3216 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
3217 		.procname	= "min_pmtu",
3218 		.data		= &ip_rt_min_pmtu,
3219 		.maxlen		= sizeof(int),
3220 		.mode		= 0644,
3221 		.proc_handler	= proc_dointvec,
3222 	},
3223 	{
3224 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
3225 		.procname	= "min_adv_mss",
3226 		.data		= &ip_rt_min_advmss,
3227 		.maxlen		= sizeof(int),
3228 		.mode		= 0644,
3229 		.proc_handler	= proc_dointvec,
3230 	},
3231 	{
3232 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3233 		.procname	= "secret_interval",
3234 		.data		= &ip_rt_secret_interval,
3235 		.maxlen		= sizeof(int),
3236 		.mode		= 0644,
3237 		.proc_handler	= ipv4_sysctl_rt_secret_interval,
3238 		.strategy	= ipv4_sysctl_rt_secret_interval_strategy,
3239 	},
3240 	{ .ctl_name = 0 }
3241 };
3242 
3243 static struct ctl_table empty[1];
3244 
3245 static struct ctl_table ipv4_skeleton[] =
3246 {
3247 	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE,
3248 	  .mode = 0555, .child = ipv4_route_table},
3249 	{ .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
3250 	  .mode = 0555, .child = empty},
3251 	{ }
3252 };
3253 
3254 static __net_initdata struct ctl_path ipv4_path[] = {
3255 	{ .procname = "net", .ctl_name = CTL_NET, },
3256 	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
3257 	{ },
3258 };
3259 
3260 static struct ctl_table ipv4_route_flush_table[] = {
3261 	{
3262 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
3263 		.procname	= "flush",
3264 		.maxlen		= sizeof(int),
3265 		.mode		= 0200,
3266 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3267 		.strategy	= ipv4_sysctl_rtcache_flush_strategy,
3268 	},
3269 	{ .ctl_name = 0 },
3270 };
3271 
3272 static __net_initdata struct ctl_path ipv4_route_path[] = {
3273 	{ .procname = "net", .ctl_name = CTL_NET, },
3274 	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
3275 	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3276 	{ },
3277 };
3278 
3279 static __net_init int sysctl_route_net_init(struct net *net)
3280 {
3281 	struct ctl_table *tbl;
3282 
3283 	tbl = ipv4_route_flush_table;
3284 	if (net != &init_net) {
3285 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3286 		if (tbl == NULL)
3287 			goto err_dup;
3288 	}
3289 	tbl[0].extra1 = net;
3290 
3291 	net->ipv4.route_hdr =
3292 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3293 	if (net->ipv4.route_hdr == NULL)
3294 		goto err_reg;
3295 	return 0;
3296 
3297 err_reg:
3298 	if (tbl != ipv4_route_flush_table)
3299 		kfree(tbl);
3300 err_dup:
3301 	return -ENOMEM;
3302 }
3303 
3304 static __net_exit void sysctl_route_net_exit(struct net *net)
3305 {
3306 	struct ctl_table *tbl;
3307 
3308 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3309 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3310 	BUG_ON(tbl == ipv4_route_flush_table);
3311 	kfree(tbl);
3312 }
3313 
3314 static __net_initdata struct pernet_operations sysctl_route_ops = {
3315 	.init = sysctl_route_net_init,
3316 	.exit = sysctl_route_net_exit,
3317 };
3318 #endif
3319 
3320 
3321 static __net_init int rt_secret_timer_init(struct net *net)
3322 {
3323 	atomic_set(&net->ipv4.rt_genid,
3324 			(int) ((num_physpages ^ (num_physpages>>8)) ^
3325 			(jiffies ^ (jiffies >> 7))));
3326 
3327 	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3328 	net->ipv4.rt_secret_timer.data = (unsigned long)net;
3329 	init_timer_deferrable(&net->ipv4.rt_secret_timer);
3330 
3331 	if (ip_rt_secret_interval) {
3332 		net->ipv4.rt_secret_timer.expires =
3333 			jiffies + net_random() % ip_rt_secret_interval +
3334 			ip_rt_secret_interval;
3335 		add_timer(&net->ipv4.rt_secret_timer);
3336 	}
3337 	return 0;
3338 }
3339 
3340 static __net_exit void rt_secret_timer_exit(struct net *net)
3341 {
3342 	del_timer_sync(&net->ipv4.rt_secret_timer);
3343 }
3344 
3345 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3346 	.init = rt_secret_timer_init,
3347 	.exit = rt_secret_timer_exit,
3348 };
3349 
3350 
3351 #ifdef CONFIG_NET_CLS_ROUTE
3352 struct ip_rt_acct *ip_rt_acct __read_mostly;
3353 #endif /* CONFIG_NET_CLS_ROUTE */
3354 
3355 static __initdata unsigned long rhash_entries;
3356 static int __init set_rhash_entries(char *str)
3357 {
3358 	if (!str)
3359 		return 0;
3360 	rhash_entries = simple_strtoul(str, &str, 0);
3361 	return 1;
3362 }
3363 __setup("rhash_entries=", set_rhash_entries);
3364 
3365 int __init ip_rt_init(void)
3366 {
3367 	int rc = 0;
3368 
3369 #ifdef CONFIG_NET_CLS_ROUTE
3370 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3371 	if (!ip_rt_acct)
3372 		panic("IP: failed to allocate ip_rt_acct\n");
3373 #endif
3374 
3375 	ipv4_dst_ops.kmem_cachep =
3376 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3377 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3378 
3379 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3380 
3381 	rt_hash_table = (struct rt_hash_bucket *)
3382 		alloc_large_system_hash("IP route cache",
3383 					sizeof(struct rt_hash_bucket),
3384 					rhash_entries,
3385 					(num_physpages >= 128 * 1024) ?
3386 					15 : 17,
3387 					0,
3388 					&rt_hash_log,
3389 					&rt_hash_mask,
3390 					rhash_entries ? 0 : 512 * 1024);
3391 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3392 	rt_hash_lock_init();
3393 
3394 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3395 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3396 
3397 	devinet_init();
3398 	ip_fib_init();
3399 
3400 	/* All the timers started at system startup tend
3401 	   to synchronize. Perturb them a bit.
3402 	 */
3403 	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3404 	expires_ljiffies = jiffies;
3405 	schedule_delayed_work(&expires_work,
3406 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3407 
3408 	if (register_pernet_subsys(&rt_secret_timer_ops))
3409 		printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3410 
3411 	if (ip_rt_proc_init())
3412 		printk(KERN_ERR "Unable to create route proc files\n");
3413 #ifdef CONFIG_XFRM
3414 	xfrm_init();
3415 	xfrm4_init();
3416 #endif
3417 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3418 
3419 #ifdef CONFIG_SYSCTL
3420 	register_pernet_subsys(&sysctl_route_ops);
3421 #endif
3422 	return rc;
3423 }
3424 
3425 #ifdef CONFIG_SYSCTL
3426 /*
3427  * We really need to sanitize the damn ipv4 init order; then all
3428  * this nonsense will go away.
3429  */
3430 void __init ip_static_sysctl_init(void)
3431 {
3432 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3433 }
3434 #endif
3435 
3436 EXPORT_SYMBOL(__ip_select_ident);
3437 EXPORT_SYMBOL(ip_route_input);
3438 EXPORT_SYMBOL(ip_route_output_key);
3439