xref: /linux/net/ipv4/route.c (revision a5766f11cfd3a0c03450d99c8fe548c2940be884)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 
111 #define RT_FL_TOS(oldflp) \
112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113 
114 #define IP_MAX_MTU	0xFFF0
115 
116 #define RT_GC_TIMEOUT (300*HZ)
117 
118 static int ip_rt_max_size;
119 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
120 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
121 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
122 static int ip_rt_redirect_number __read_mostly	= 9;
123 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly	= HZ;
126 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
127 static int ip_rt_gc_elasticity __read_mostly	= 8;
128 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly	= 256;
131 static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
132 
133 static void rt_worker_func(struct work_struct *work);
134 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
135 
136 /*
137  *	Interface to generic destination cache.
138  */
139 
140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141 static void		 ipv4_dst_destroy(struct dst_entry *dst);
142 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
143 					 struct net_device *dev, int how);
144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145 static void		 ipv4_link_failure(struct sk_buff *skb);
146 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
147 static int rt_garbage_collect(struct dst_ops *ops);
148 
149 
150 static struct dst_ops ipv4_dst_ops = {
151 	.family =		AF_INET,
152 	.protocol =		__constant_htons(ETH_P_IP),
153 	.gc =			rt_garbage_collect,
154 	.check =		ipv4_dst_check,
155 	.destroy =		ipv4_dst_destroy,
156 	.ifdown =		ipv4_dst_ifdown,
157 	.negative_advice =	ipv4_negative_advice,
158 	.link_failure =		ipv4_link_failure,
159 	.update_pmtu =		ip_rt_update_pmtu,
160 	.local_out =		__ip_local_out,
161 	.entry_size =		sizeof(struct rtable),
162 	.entries =		ATOMIC_INIT(0),
163 };
164 
165 #define ECN_OR_COST(class)	TC_PRIO_##class
166 
167 const __u8 ip_tos2prio[16] = {
168 	TC_PRIO_BESTEFFORT,
169 	ECN_OR_COST(FILLER),
170 	TC_PRIO_BESTEFFORT,
171 	ECN_OR_COST(BESTEFFORT),
172 	TC_PRIO_BULK,
173 	ECN_OR_COST(BULK),
174 	TC_PRIO_BULK,
175 	ECN_OR_COST(BULK),
176 	TC_PRIO_INTERACTIVE,
177 	ECN_OR_COST(INTERACTIVE),
178 	TC_PRIO_INTERACTIVE,
179 	ECN_OR_COST(INTERACTIVE),
180 	TC_PRIO_INTERACTIVE_BULK,
181 	ECN_OR_COST(INTERACTIVE_BULK),
182 	TC_PRIO_INTERACTIVE_BULK,
183 	ECN_OR_COST(INTERACTIVE_BULK)
184 };
185 
186 
187 /*
188  * Route cache.
189  */
190 
191 /* The locking scheme is rather straightforward:
192  *
193  * 1) Read-Copy Update protects the buckets of the central route hash.
194  * 2) Only writers remove entries, and they hold the lock
195  *    as they look at rtable reference counts.
196  * 3) Only readers acquire references to rtable entries;
197  *    they do so with atomic increments and with the
198  *    lock held.
199  */
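/*
 * The reader side, as used by the /proc iterators and the redirect/PMTU
 * lookups below, is roughly (an illustrative sketch of the pattern used
 * in this file, not an extra helper):
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.dst.rt_next)) {
 *		... match on rth->fl, then take a reference with
 *		    dst_hold()/dst_use() ...
 *	}
 *	rcu_read_unlock_bh();
 *
 * Writers unlink entries under rt_hash_lock_addr(hash) and release them
 * with rt_free(), which defers the actual free past an RCU-BH grace
 * period via call_rcu_bh().
 */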
200 
201 struct rt_hash_bucket {
202 	struct rtable	*chain;
203 };
204 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
205 	defined(CONFIG_PROVE_LOCKING)
206 /*
207  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
208  * The size of this table is a power of two and depends on the number of CPUs.
209  * (With lockdep we have a quite big spinlock_t, so keep the size down there.)
210  */
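/*
 * E.g. with RT_HASH_LOCK_SZ == 256, hash buckets 0, 256, 512, ... all
 * share rt_hash_locks[0]; see rt_hash_lock_addr() below.
 */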
211 #ifdef CONFIG_LOCKDEP
212 # define RT_HASH_LOCK_SZ	256
213 #else
214 # if NR_CPUS >= 32
215 #  define RT_HASH_LOCK_SZ	4096
216 # elif NR_CPUS >= 16
217 #  define RT_HASH_LOCK_SZ	2048
218 # elif NR_CPUS >= 8
219 #  define RT_HASH_LOCK_SZ	1024
220 # elif NR_CPUS >= 4
221 #  define RT_HASH_LOCK_SZ	512
222 # else
223 #  define RT_HASH_LOCK_SZ	256
224 # endif
225 #endif
226 
227 static spinlock_t	*rt_hash_locks;
228 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
229 
230 static __init void rt_hash_lock_init(void)
231 {
232 	int i;
233 
234 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
235 			GFP_KERNEL);
236 	if (!rt_hash_locks)
237 		panic("IP: failed to allocate rt_hash_locks\n");
238 
239 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
240 		spin_lock_init(&rt_hash_locks[i]);
241 }
242 #else
243 # define rt_hash_lock_addr(slot) NULL
244 
245 static inline void rt_hash_lock_init(void)
246 {
247 }
248 #endif
249 
250 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
251 static unsigned			rt_hash_mask __read_mostly;
252 static unsigned int		rt_hash_log  __read_mostly;
253 
254 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
255 #define RT_CACHE_STAT_INC(field) \
256 	(__raw_get_cpu_var(rt_cache_stat).field++)
257 
258 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
259 		int genid)
260 {
261 	return jhash_3words((__force u32)(__be32)(daddr),
262 			    (__force u32)(__be32)(saddr),
263 			    idx, genid)
264 		& rt_hash_mask;
265 }
266 
267 static inline int rt_genid(struct net *net)
268 {
269 	return atomic_read(&net->ipv4.rt_genid);
270 }
271 
272 #ifdef CONFIG_PROC_FS
273 struct rt_cache_iter_state {
274 	struct seq_net_private p;
275 	int bucket;
276 	int genid;
277 };
278 
279 static struct rtable *rt_cache_get_first(struct seq_file *seq)
280 {
281 	struct rt_cache_iter_state *st = seq->private;
282 	struct rtable *r = NULL;
283 
284 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
285 		if (!rt_hash_table[st->bucket].chain)
286 			continue;
287 		rcu_read_lock_bh();
288 		r = rcu_dereference(rt_hash_table[st->bucket].chain);
289 		while (r) {
290 			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
291 			    r->rt_genid == st->genid)
292 				return r;
293 			r = rcu_dereference(r->u.dst.rt_next);
294 		}
295 		rcu_read_unlock_bh();
296 	}
297 	return r;
298 }
299 
300 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
301 					  struct rtable *r)
302 {
303 	struct rt_cache_iter_state *st = seq->private;
304 
305 	r = r->u.dst.rt_next;
306 	while (!r) {
307 		rcu_read_unlock_bh();
308 		do {
309 			if (--st->bucket < 0)
310 				return NULL;
311 		} while (!rt_hash_table[st->bucket].chain);
312 		rcu_read_lock_bh();
313 		r = rt_hash_table[st->bucket].chain;
314 	}
315 	return rcu_dereference(r);
316 }
317 
318 static struct rtable *rt_cache_get_next(struct seq_file *seq,
319 					struct rtable *r)
320 {
321 	struct rt_cache_iter_state *st = seq->private;
322 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
323 		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
324 			continue;
325 		if (r->rt_genid == st->genid)
326 			break;
327 	}
328 	return r;
329 }
330 
331 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
332 {
333 	struct rtable *r = rt_cache_get_first(seq);
334 
335 	if (r)
336 		while (pos && (r = rt_cache_get_next(seq, r)))
337 			--pos;
338 	return pos ? NULL : r;
339 }
340 
341 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
342 {
343 	struct rt_cache_iter_state *st = seq->private;
344 	if (*pos)
345 		return rt_cache_get_idx(seq, *pos - 1);
346 	st->genid = rt_genid(seq_file_net(seq));
347 	return SEQ_START_TOKEN;
348 }
349 
350 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
351 {
352 	struct rtable *r;
353 
354 	if (v == SEQ_START_TOKEN)
355 		r = rt_cache_get_first(seq);
356 	else
357 		r = rt_cache_get_next(seq, v);
358 	++*pos;
359 	return r;
360 }
361 
362 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
363 {
364 	if (v && v != SEQ_START_TOKEN)
365 		rcu_read_unlock_bh();
366 }
367 
368 static int rt_cache_seq_show(struct seq_file *seq, void *v)
369 {
370 	if (v == SEQ_START_TOKEN)
371 		seq_printf(seq, "%-127s\n",
372 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
373 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
374 			   "HHUptod\tSpecDst");
375 	else {
376 		struct rtable *r = v;
377 		int len;
378 
379 		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
380 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
381 			r->u.dst.dev ? r->u.dst.dev->name : "*",
382 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
383 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
384 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
385 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
386 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
387 			dst_metric(&r->u.dst, RTAX_WINDOW),
388 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
389 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
390 			r->fl.fl4_tos,
391 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
392 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
393 				       dev_queue_xmit) : 0,
394 			r->rt_spec_dst, &len);
395 
396 		seq_printf(seq, "%*s\n", 127 - len, "");
397 	}
398 	return 0;
399 }
400 
401 static const struct seq_operations rt_cache_seq_ops = {
402 	.start  = rt_cache_seq_start,
403 	.next   = rt_cache_seq_next,
404 	.stop   = rt_cache_seq_stop,
405 	.show   = rt_cache_seq_show,
406 };
407 
408 static int rt_cache_seq_open(struct inode *inode, struct file *file)
409 {
410 	return seq_open_net(inode, file, &rt_cache_seq_ops,
411 			sizeof(struct rt_cache_iter_state));
412 }
413 
414 static const struct file_operations rt_cache_seq_fops = {
415 	.owner	 = THIS_MODULE,
416 	.open	 = rt_cache_seq_open,
417 	.read	 = seq_read,
418 	.llseek	 = seq_lseek,
419 	.release = seq_release_net,
420 };
421 
422 
423 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
424 {
425 	int cpu;
426 
427 	if (*pos == 0)
428 		return SEQ_START_TOKEN;
429 
430 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
431 		if (!cpu_possible(cpu))
432 			continue;
433 		*pos = cpu+1;
434 		return &per_cpu(rt_cache_stat, cpu);
435 	}
436 	return NULL;
437 }
438 
439 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
440 {
441 	int cpu;
442 
443 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
444 		if (!cpu_possible(cpu))
445 			continue;
446 		*pos = cpu+1;
447 		return &per_cpu(rt_cache_stat, cpu);
448 	}
449 	return NULL;
450 
451 }
452 
453 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
454 {
455 
456 }
457 
458 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
459 {
460 	struct rt_cache_stat *st = v;
461 
462 	if (v == SEQ_START_TOKEN) {
463 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
464 		return 0;
465 	}
466 
467 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
468 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
469 		   atomic_read(&ipv4_dst_ops.entries),
470 		   st->in_hit,
471 		   st->in_slow_tot,
472 		   st->in_slow_mc,
473 		   st->in_no_route,
474 		   st->in_brd,
475 		   st->in_martian_dst,
476 		   st->in_martian_src,
477 
478 		   st->out_hit,
479 		   st->out_slow_tot,
480 		   st->out_slow_mc,
481 
482 		   st->gc_total,
483 		   st->gc_ignored,
484 		   st->gc_goal_miss,
485 		   st->gc_dst_overflow,
486 		   st->in_hlist_search,
487 		   st->out_hlist_search
488 		);
489 	return 0;
490 }
491 
492 static const struct seq_operations rt_cpu_seq_ops = {
493 	.start  = rt_cpu_seq_start,
494 	.next   = rt_cpu_seq_next,
495 	.stop   = rt_cpu_seq_stop,
496 	.show   = rt_cpu_seq_show,
497 };
498 
499 
500 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
501 {
502 	return seq_open(file, &rt_cpu_seq_ops);
503 }
504 
505 static const struct file_operations rt_cpu_seq_fops = {
506 	.owner	 = THIS_MODULE,
507 	.open	 = rt_cpu_seq_open,
508 	.read	 = seq_read,
509 	.llseek	 = seq_lseek,
510 	.release = seq_release,
511 };
512 
513 #ifdef CONFIG_NET_CLS_ROUTE
514 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
515 			   int length, int *eof, void *data)
516 {
517 	unsigned int i;
518 
519 	if ((offset & 3) || (length & 3))
520 		return -EIO;
521 
522 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
523 		*eof = 1;
524 		return 0;
525 	}
526 
527 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
528 		length = sizeof(struct ip_rt_acct) * 256 - offset;
529 		*eof = 1;
530 	}
531 
532 	offset /= sizeof(u32);
533 
534 	if (length > 0) {
535 		u32 *dst = (u32 *) buffer;
536 
537 		*start = buffer;
538 		memset(dst, 0, length);
539 
540 		for_each_possible_cpu(i) {
541 			unsigned int j;
542 			u32 *src;
543 
544 			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
545 			for (j = 0; j < length/4; j++)
546 				dst[j] += src[j];
547 		}
548 	}
549 	return length;
550 }
551 #endif
552 
553 static int __net_init ip_rt_do_proc_init(struct net *net)
554 {
555 	struct proc_dir_entry *pde;
556 
557 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
558 			&rt_cache_seq_fops);
559 	if (!pde)
560 		goto err1;
561 
562 	pde = proc_create("rt_cache", S_IRUGO,
563 			  net->proc_net_stat, &rt_cpu_seq_fops);
564 	if (!pde)
565 		goto err2;
566 
567 #ifdef CONFIG_NET_CLS_ROUTE
568 	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
569 			ip_rt_acct_read, NULL);
570 	if (!pde)
571 		goto err3;
572 #endif
573 	return 0;
574 
575 #ifdef CONFIG_NET_CLS_ROUTE
576 err3:
577 	remove_proc_entry("rt_cache", net->proc_net_stat);
578 #endif
579 err2:
580 	remove_proc_entry("rt_cache", net->proc_net);
581 err1:
582 	return -ENOMEM;
583 }
584 
585 static void __net_exit ip_rt_do_proc_exit(struct net *net)
586 {
587 	remove_proc_entry("rt_cache", net->proc_net_stat);
588 	remove_proc_entry("rt_cache", net->proc_net);
589 	remove_proc_entry("rt_acct", net->proc_net);
590 }
591 
592 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
593 	.init = ip_rt_do_proc_init,
594 	.exit = ip_rt_do_proc_exit,
595 };
596 
597 static int __init ip_rt_proc_init(void)
598 {
599 	return register_pernet_subsys(&ip_rt_proc_ops);
600 }
601 
602 #else
603 static inline int ip_rt_proc_init(void)
604 {
605 	return 0;
606 }
607 #endif /* CONFIG_PROC_FS */
608 
609 static inline void rt_free(struct rtable *rt)
610 {
611 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
612 }
613 
614 static inline void rt_drop(struct rtable *rt)
615 {
616 	ip_rt_put(rt);
617 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
618 }
619 
620 static inline int rt_fast_clean(struct rtable *rth)
621 {
622 	/* Kill broadcast/multicast entries very aggressively, if they
623 	   collide in the hash table with more useful entries */
624 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
625 		rth->fl.iif && rth->u.dst.rt_next;
626 }
627 
628 static inline int rt_valuable(struct rtable *rth)
629 {
630 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
631 		rth->u.dst.expires;
632 }
633 
634 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
635 {
636 	unsigned long age;
637 	int ret = 0;
638 
639 	if (atomic_read(&rth->u.dst.__refcnt))
640 		goto out;
641 
642 	ret = 1;
643 	if (rth->u.dst.expires &&
644 	    time_after_eq(jiffies, rth->u.dst.expires))
645 		goto out;
646 
647 	age = jiffies - rth->u.dst.lastuse;
648 	ret = 0;
649 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
650 	    (age <= tmo2 && rt_valuable(rth)))
651 		goto out;
652 	ret = 1;
653 out:	return ret;
654 }
655 
656 /* Bits of score are:
657  * 31: very valuable
658  * 30: not quite useless
659  * 29..0: usage counter
660  */
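/*
 * E.g. an output (or non-broadcast/multicast/local) route carrying
 * RTCF_REDIRECTED that was used a moment ago gets bits 31 and 30 plus a
 * large usage counter, so rt_intern_hash() treats it as the least
 * attractive eviction candidate; an old, unreferenced input broadcast
 * route scores lowest and is removed first.
 */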
661 static inline u32 rt_score(struct rtable *rt)
662 {
663 	u32 score = jiffies - rt->u.dst.lastuse;
664 
665 	score = ~score & ~(3<<30);
666 
667 	if (rt_valuable(rt))
668 		score |= (1<<31);
669 
670 	if (!rt->fl.iif ||
671 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
672 		score |= (1<<30);
673 
674 	return score;
675 }
676 
677 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
678 {
679 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
680 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
681 		(fl1->mark ^ fl2->mark) |
682 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
683 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
684 		(fl1->oif ^ fl2->oif) |
685 		(fl1->iif ^ fl2->iif)) == 0;
686 }
687 
688 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
689 {
690 	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
691 }
692 
693 static inline int rt_is_expired(struct rtable *rth)
694 {
695 	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
696 }
697 
698 /*
699  * Perform a full scan of the hash table and free all entries.
700  * Can be called from a softirq or from process context.
701  * In the latter case, we want to reschedule if necessary.
702  */
703 static void rt_do_flush(int process_context)
704 {
705 	unsigned int i;
706 	struct rtable *rth, *next;
707 	struct rtable * tail;
708 
709 	for (i = 0; i <= rt_hash_mask; i++) {
710 		if (process_context && need_resched())
711 			cond_resched();
712 		rth = rt_hash_table[i].chain;
713 		if (!rth)
714 			continue;
715 
716 		spin_lock_bh(rt_hash_lock_addr(i));
717 #ifdef CONFIG_NET_NS
718 		{
719 		struct rtable ** prev, * p;
720 
721 		rth = rt_hash_table[i].chain;
722 
723 		/* defer releasing the head of the list until after spin_unlock */
724 		for (tail = rth; tail; tail = tail->u.dst.rt_next)
725 			if (!rt_is_expired(tail))
726 				break;
727 		if (rth != tail)
728 			rt_hash_table[i].chain = tail;
729 
730 		/* call rt_free on entries after the tail that require flushing */
731 		prev = &rt_hash_table[i].chain;
732 		for (p = *prev; p; p = next) {
733 			next = p->u.dst.rt_next;
734 			if (!rt_is_expired(p)) {
735 				prev = &p->u.dst.rt_next;
736 			} else {
737 				*prev = next;
738 				rt_free(p);
739 			}
740 		}
741 		}
742 #else
743 		rth = rt_hash_table[i].chain;
744 		rt_hash_table[i].chain = NULL;
745 		tail = NULL;
746 #endif
747 		spin_unlock_bh(rt_hash_lock_addr(i));
748 
749 		for (; rth != tail; rth = next) {
750 			next = rth->u.dst.rt_next;
751 			rt_free(rth);
752 		}
753 	}
754 }
755 
756 static void rt_check_expire(void)
757 {
758 	static unsigned int rover;
759 	unsigned int i = rover, goal;
760 	struct rtable *rth, **rthp;
761 	u64 mult;
762 
763 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
764 	if (ip_rt_gc_timeout > 1)
765 		do_div(mult, ip_rt_gc_timeout);
766 	goal = (unsigned int)mult;
767 	if (goal > rt_hash_mask)
768 		goal = rt_hash_mask + 1;
769 	for (; goal > 0; goal--) {
770 		unsigned long tmo = ip_rt_gc_timeout;
771 
772 		i = (i + 1) & rt_hash_mask;
773 		rthp = &rt_hash_table[i].chain;
774 
775 		if (need_resched())
776 			cond_resched();
777 
778 		if (*rthp == NULL)
779 			continue;
780 		spin_lock_bh(rt_hash_lock_addr(i));
781 		while ((rth = *rthp) != NULL) {
782 			if (rt_is_expired(rth)) {
783 				*rthp = rth->u.dst.rt_next;
784 				rt_free(rth);
785 				continue;
786 			}
787 			if (rth->u.dst.expires) {
788 				/* Entry is expired even if it is in use */
789 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
790 					tmo >>= 1;
791 					rthp = &rth->u.dst.rt_next;
792 					continue;
793 				}
794 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
795 				tmo >>= 1;
796 				rthp = &rth->u.dst.rt_next;
797 				continue;
798 			}
799 
800 			/* Clean up aged-off entries. */
801 			*rthp = rth->u.dst.rt_next;
802 			rt_free(rth);
803 		}
804 		spin_unlock_bh(rt_hash_lock_addr(i));
805 	}
806 	rover = i;
807 }
808 
809 /*
810  * rt_worker_func() is run in process context.
811  * We call rt_check_expire() to scan part of the hash table.
812  */
813 static void rt_worker_func(struct work_struct *work)
814 {
815 	rt_check_expire();
816 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
817 }
818 
819 /*
820  * Perturbation of rt_genid by a small quantity [1..256].
821  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
822  * many times (2^24) without reusing a recent rt_genid.
823  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
824  */
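/*
 * Entries created before the bump are not freed here; they are detected
 * lazily via rt_is_expired() (rt_genid mismatch) and reclaimed by
 * rt_check_expire(), rt_do_flush() or the garbage collector.
 */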
825 static void rt_cache_invalidate(struct net *net)
826 {
827 	unsigned char shuffle;
828 
829 	get_random_bytes(&shuffle, sizeof(shuffle));
830 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
831 }
832 
833 /*
834  * delay < 0  : invalidate cache (fast : entries will be deleted later)
835  * delay >= 0 : invalidate & flush cache (can be long)
836  */
837 void rt_cache_flush(struct net *net, int delay)
838 {
839 	rt_cache_invalidate(net);
840 	if (delay >= 0)
841 		rt_do_flush(!in_softirq());
842 }
843 
844 /*
845  * We change rt_genid and let gc do the cleanup
846  */
847 static void rt_secret_rebuild(unsigned long __net)
848 {
849 	struct net *net = (struct net *)__net;
850 	rt_cache_invalidate(net);
851 	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
852 }
853 
854 /*
855    Short description of GC goals.
856 
857    We want to build an algorithm that keeps the routing cache
858    at an equilibrium point, where the number of aged-off entries
859    is approximately equal to the number of newly generated ones.
860 
861    The current expiration strength is the variable "expire".
862    We try to adjust it dynamically, so that when the network
863    is idle "expire" is large enough to keep plenty of warm entries,
864    and when load increases it shrinks to limit the cache size.
865  */
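/*
 * Concretely, the immediate goal computed below is
 *	goal = entries - (ip_rt_gc_elasticity << rt_hash_log),
 * i.e. with the default elasticity of 8 the cache is trimmed back towards
 * an average of at most ~8 entries per hash bucket.  "expire" is halved on
 * every pass that misses the goal and is grown again by
 * ip_rt_gc_min_interval once the goal is met.
 */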
866 
867 static int rt_garbage_collect(struct dst_ops *ops)
868 {
869 	static unsigned long expire = RT_GC_TIMEOUT;
870 	static unsigned long last_gc;
871 	static int rover;
872 	static int equilibrium;
873 	struct rtable *rth, **rthp;
874 	unsigned long now = jiffies;
875 	int goal;
876 
877 	/*
878 	 * Garbage collection is pretty expensive,
879 	 * so do not run it too frequently.
880 	 */
881 
882 	RT_CACHE_STAT_INC(gc_total);
883 
884 	if (now - last_gc < ip_rt_gc_min_interval &&
885 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
886 		RT_CACHE_STAT_INC(gc_ignored);
887 		goto out;
888 	}
889 
890 	/* Calculate the number of entries that we want to expire now. */
891 	goal = atomic_read(&ipv4_dst_ops.entries) -
892 		(ip_rt_gc_elasticity << rt_hash_log);
893 	if (goal <= 0) {
894 		if (equilibrium < ipv4_dst_ops.gc_thresh)
895 			equilibrium = ipv4_dst_ops.gc_thresh;
896 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
897 		if (goal > 0) {
898 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
899 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
900 		}
901 	} else {
902 		/* We are in a dangerous area. Try to reduce the cache really
903 		 * aggressively.
904 		 */
905 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
906 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
907 	}
908 
909 	if (now - last_gc >= ip_rt_gc_min_interval)
910 		last_gc = now;
911 
912 	if (goal <= 0) {
913 		equilibrium += goal;
914 		goto work_done;
915 	}
916 
917 	do {
918 		int i, k;
919 
920 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
921 			unsigned long tmo = expire;
922 
923 			k = (k + 1) & rt_hash_mask;
924 			rthp = &rt_hash_table[k].chain;
925 			spin_lock_bh(rt_hash_lock_addr(k));
926 			while ((rth = *rthp) != NULL) {
927 				if (!rt_is_expired(rth) &&
928 					!rt_may_expire(rth, tmo, expire)) {
929 					tmo >>= 1;
930 					rthp = &rth->u.dst.rt_next;
931 					continue;
932 				}
933 				*rthp = rth->u.dst.rt_next;
934 				rt_free(rth);
935 				goal--;
936 			}
937 			spin_unlock_bh(rt_hash_lock_addr(k));
938 			if (goal <= 0)
939 				break;
940 		}
941 		rover = k;
942 
943 		if (goal <= 0)
944 			goto work_done;
945 
946 		/* The goal was not achieved. We stop the process if:
947 
948 		   - expire has been reduced to zero (otherwise expire is halved).
949 		   - the table is not full.
950 		   - we are called from softirq context.
951 		   - the jiffies check is just a fallback/debug loop breaker;
952 		     we will not spin here for a long time in any case.
953 		 */
954 
955 		RT_CACHE_STAT_INC(gc_goal_miss);
956 
957 		if (expire == 0)
958 			break;
959 
960 		expire >>= 1;
961 #if RT_CACHE_DEBUG >= 2
962 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
963 				atomic_read(&ipv4_dst_ops.entries), goal, i);
964 #endif
965 
966 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
967 			goto out;
968 	} while (!in_softirq() && time_before_eq(jiffies, now));
969 
970 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
971 		goto out;
972 	if (net_ratelimit())
973 		printk(KERN_WARNING "dst cache overflow\n");
974 	RT_CACHE_STAT_INC(gc_dst_overflow);
975 	return 1;
976 
977 work_done:
978 	expire += ip_rt_gc_min_interval;
979 	if (expire > ip_rt_gc_timeout ||
980 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
981 		expire = ip_rt_gc_timeout;
982 #if RT_CACHE_DEBUG >= 2
983 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
984 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
985 #endif
986 out:	return 0;
987 }
988 
989 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
990 {
991 	struct rtable	*rth, **rthp;
992 	unsigned long	now;
993 	struct rtable *cand, **candp;
994 	u32 		min_score;
995 	int		chain_length;
996 	int attempts = !in_softirq();
997 
998 restart:
999 	chain_length = 0;
1000 	min_score = ~(u32)0;
1001 	cand = NULL;
1002 	candp = NULL;
1003 	now = jiffies;
1004 
1005 	rthp = &rt_hash_table[hash].chain;
1006 
1007 	spin_lock_bh(rt_hash_lock_addr(hash));
1008 	while ((rth = *rthp) != NULL) {
1009 		if (rt_is_expired(rth)) {
1010 			*rthp = rth->u.dst.rt_next;
1011 			rt_free(rth);
1012 			continue;
1013 		}
1014 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1015 			/* Put it first */
1016 			*rthp = rth->u.dst.rt_next;
1017 			/*
1018 			 * Since lookup is lockfree, the deletion
1019 			 * must be visible to another weakly ordered CPU before
1020 			 * the insertion at the start of the hash chain.
1021 			 */
1022 			rcu_assign_pointer(rth->u.dst.rt_next,
1023 					   rt_hash_table[hash].chain);
1024 			/*
1025 			 * Since lookup is lockfree, the update writes
1026 			 * must be ordered for consistency on SMP.
1027 			 */
1028 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1029 
1030 			dst_use(&rth->u.dst, now);
1031 			spin_unlock_bh(rt_hash_lock_addr(hash));
1032 
1033 			rt_drop(rt);
1034 			*rp = rth;
1035 			return 0;
1036 		}
1037 
1038 		if (!atomic_read(&rth->u.dst.__refcnt)) {
1039 			u32 score = rt_score(rth);
1040 
1041 			if (score <= min_score) {
1042 				cand = rth;
1043 				candp = rthp;
1044 				min_score = score;
1045 			}
1046 		}
1047 
1048 		chain_length++;
1049 
1050 		rthp = &rth->u.dst.rt_next;
1051 	}
1052 
1053 	if (cand) {
1054 		/* ip_rt_gc_elasticity used to be the average chain length;
1055 		 * when it is exceeded, gc becomes really aggressive.
1056 		 *
1057 		 * The second limit is less certain. At the moment it allows
1058 		 * only 2 entries per bucket. We will see.
1059 		 */
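		/*
		 * E.g. with the default ip_rt_gc_elasticity of 8, a chain that
		 * has grown past 8 entries sheds its lowest-scoring
		 * unreferenced entry (cand) before the new route is linked in.
		 */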
1060 		if (chain_length > ip_rt_gc_elasticity) {
1061 			*candp = cand->u.dst.rt_next;
1062 			rt_free(cand);
1063 		}
1064 	}
1065 
1066 	/* Try to bind the route to an ARP neighbour only if it is an output
1067 	   route or a unicast forwarding path.
1068 	 */
1069 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1070 		int err = arp_bind_neighbour(&rt->u.dst);
1071 		if (err) {
1072 			spin_unlock_bh(rt_hash_lock_addr(hash));
1073 
1074 			if (err != -ENOBUFS) {
1075 				rt_drop(rt);
1076 				return err;
1077 			}
1078 
1079 			/* The neighbour tables are full and nothing
1080 			   can be released. Try to shrink the route cache;
1081 			   it most likely holds some neighbour records.
1082 			 */
1083 			if (attempts-- > 0) {
1084 				int saved_elasticity = ip_rt_gc_elasticity;
1085 				int saved_int = ip_rt_gc_min_interval;
1086 				ip_rt_gc_elasticity	= 1;
1087 				ip_rt_gc_min_interval	= 0;
1088 				rt_garbage_collect(&ipv4_dst_ops);
1089 				ip_rt_gc_min_interval	= saved_int;
1090 				ip_rt_gc_elasticity	= saved_elasticity;
1091 				goto restart;
1092 			}
1093 
1094 			if (net_ratelimit())
1095 				printk(KERN_WARNING "Neighbour table overflow.\n");
1096 			rt_drop(rt);
1097 			return -ENOBUFS;
1098 		}
1099 	}
1100 
1101 	rt->u.dst.rt_next = rt_hash_table[hash].chain;
1102 #if RT_CACHE_DEBUG >= 2
1103 	if (rt->u.dst.rt_next) {
1104 		struct rtable *trt;
1105 		printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
1106 		       NIPQUAD(rt->rt_dst));
1107 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1108 			printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
1109 		printk("\n");
1110 	}
1111 #endif
1112 	rt_hash_table[hash].chain = rt;
1113 	spin_unlock_bh(rt_hash_lock_addr(hash));
1114 	*rp = rt;
1115 	return 0;
1116 }
1117 
1118 void rt_bind_peer(struct rtable *rt, int create)
1119 {
1120 	static DEFINE_SPINLOCK(rt_peer_lock);
1121 	struct inet_peer *peer;
1122 
1123 	peer = inet_getpeer(rt->rt_dst, create);
1124 
1125 	spin_lock_bh(&rt_peer_lock);
1126 	if (rt->peer == NULL) {
1127 		rt->peer = peer;
1128 		peer = NULL;
1129 	}
1130 	spin_unlock_bh(&rt_peer_lock);
1131 	if (peer)
1132 		inet_putpeer(peer);
1133 }
1134 
1135 /*
1136  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1137  * we can still generate some output.
1138  * Random ID selection looks a bit dangerous because we have no chance of
1139  * selecting an ID that is unique over a reasonable period of time.
1140  * But a broken packet identifier may be better than no packet at all.
1141  */
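/*
 * Note that __ip_select_ident() below prefers the per-destination counter
 * in the inet_peer (inet_getid()); the hashed global counter here is only
 * the fallback for when no peer could be bound.
 */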
1142 static void ip_select_fb_ident(struct iphdr *iph)
1143 {
1144 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1145 	static u32 ip_fallback_id;
1146 	u32 salt;
1147 
1148 	spin_lock_bh(&ip_fb_id_lock);
1149 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1150 	iph->id = htons(salt & 0xFFFF);
1151 	ip_fallback_id = salt;
1152 	spin_unlock_bh(&ip_fb_id_lock);
1153 }
1154 
1155 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1156 {
1157 	struct rtable *rt = (struct rtable *) dst;
1158 
1159 	if (rt) {
1160 		if (rt->peer == NULL)
1161 			rt_bind_peer(rt, 1);
1162 
1163 		/* If a peer is attached to the destination, it is never detached,
1164 		   so we need not grab a lock to dereference it.
1165 		 */
1166 		if (rt->peer) {
1167 			iph->id = htons(inet_getid(rt->peer, more));
1168 			return;
1169 		}
1170 	} else
1171 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1172 		       __builtin_return_address(0));
1173 
1174 	ip_select_fb_ident(iph);
1175 }
1176 
1177 static void rt_del(unsigned hash, struct rtable *rt)
1178 {
1179 	struct rtable **rthp, *aux;
1180 
1181 	rthp = &rt_hash_table[hash].chain;
1182 	spin_lock_bh(rt_hash_lock_addr(hash));
1183 	ip_rt_put(rt);
1184 	while ((aux = *rthp) != NULL) {
1185 		if (aux == rt || rt_is_expired(aux)) {
1186 			*rthp = aux->u.dst.rt_next;
1187 			rt_free(aux);
1188 			continue;
1189 		}
1190 		rthp = &aux->u.dst.rt_next;
1191 	}
1192 	spin_unlock_bh(rt_hash_lock_addr(hash));
1193 }
1194 
1195 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1196 		    __be32 saddr, struct net_device *dev)
1197 {
1198 	int i, k;
1199 	struct in_device *in_dev = in_dev_get(dev);
1200 	struct rtable *rth, **rthp;
1201 	__be32  skeys[2] = { saddr, 0 };
1202 	int  ikeys[2] = { dev->ifindex, 0 };
1203 	struct netevent_redirect netevent;
1204 	struct net *net;
1205 
1206 	if (!in_dev)
1207 		return;
1208 
1209 	net = dev_net(dev);
1210 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1211 	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1212 	    || ipv4_is_zeronet(new_gw))
1213 		goto reject_redirect;
1214 
1215 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1216 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1217 			goto reject_redirect;
1218 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1219 			goto reject_redirect;
1220 	} else {
1221 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1222 			goto reject_redirect;
1223 	}
1224 
1225 	for (i = 0; i < 2; i++) {
1226 		for (k = 0; k < 2; k++) {
1227 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1228 						rt_genid(net));
1229 
1230 			rthp=&rt_hash_table[hash].chain;
1231 
1232 			rcu_read_lock();
1233 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1234 				struct rtable *rt;
1235 
1236 				if (rth->fl.fl4_dst != daddr ||
1237 				    rth->fl.fl4_src != skeys[i] ||
1238 				    rth->fl.oif != ikeys[k] ||
1239 				    rth->fl.iif != 0 ||
1240 				    rt_is_expired(rth) ||
1241 				    !net_eq(dev_net(rth->u.dst.dev), net)) {
1242 					rthp = &rth->u.dst.rt_next;
1243 					continue;
1244 				}
1245 
1246 				if (rth->rt_dst != daddr ||
1247 				    rth->rt_src != saddr ||
1248 				    rth->u.dst.error ||
1249 				    rth->rt_gateway != old_gw ||
1250 				    rth->u.dst.dev != dev)
1251 					break;
1252 
1253 				dst_hold(&rth->u.dst);
1254 				rcu_read_unlock();
1255 
1256 				rt = dst_alloc(&ipv4_dst_ops);
1257 				if (rt == NULL) {
1258 					ip_rt_put(rth);
1259 					in_dev_put(in_dev);
1260 					return;
1261 				}
1262 
1263 				/* Copy all the information. */
1264 				*rt = *rth;
1265 				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1266 				rt->u.dst.__use		= 1;
1267 				atomic_set(&rt->u.dst.__refcnt, 1);
1268 				rt->u.dst.child		= NULL;
1269 				if (rt->u.dst.dev)
1270 					dev_hold(rt->u.dst.dev);
1271 				if (rt->idev)
1272 					in_dev_hold(rt->idev);
1273 				rt->u.dst.obsolete	= 0;
1274 				rt->u.dst.lastuse	= jiffies;
1275 				rt->u.dst.path		= &rt->u.dst;
1276 				rt->u.dst.neighbour	= NULL;
1277 				rt->u.dst.hh		= NULL;
1278 				rt->u.dst.xfrm		= NULL;
1279 				rt->rt_genid		= rt_genid(net);
1280 				rt->rt_flags		|= RTCF_REDIRECTED;
1281 
1282 				/* Gateway is different ... */
1283 				rt->rt_gateway		= new_gw;
1284 
1285 				/* Redirect received -> path was valid */
1286 				dst_confirm(&rth->u.dst);
1287 
1288 				if (rt->peer)
1289 					atomic_inc(&rt->peer->refcnt);
1290 
1291 				if (arp_bind_neighbour(&rt->u.dst) ||
1292 				    !(rt->u.dst.neighbour->nud_state &
1293 					    NUD_VALID)) {
1294 					if (rt->u.dst.neighbour)
1295 						neigh_event_send(rt->u.dst.neighbour, NULL);
1296 					ip_rt_put(rth);
1297 					rt_drop(rt);
1298 					goto do_next;
1299 				}
1300 
1301 				netevent.old = &rth->u.dst;
1302 				netevent.new = &rt->u.dst;
1303 				call_netevent_notifiers(NETEVENT_REDIRECT,
1304 							&netevent);
1305 
1306 				rt_del(hash, rth);
1307 				if (!rt_intern_hash(hash, rt, &rt))
1308 					ip_rt_put(rt);
1309 				goto do_next;
1310 			}
1311 			rcu_read_unlock();
1312 		do_next:
1313 			;
1314 		}
1315 	}
1316 	in_dev_put(in_dev);
1317 	return;
1318 
1319 reject_redirect:
1320 #ifdef CONFIG_IP_ROUTE_VERBOSE
1321 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1322 		printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
1323 			NIPQUAD_FMT " ignored.\n"
1324 			"  Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
1325 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1326 		       NIPQUAD(saddr), NIPQUAD(daddr));
1327 #endif
1328 	in_dev_put(in_dev);
1329 }
1330 
1331 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1332 {
1333 	struct rtable *rt = (struct rtable *)dst;
1334 	struct dst_entry *ret = dst;
1335 
1336 	if (rt) {
1337 		if (dst->obsolete) {
1338 			ip_rt_put(rt);
1339 			ret = NULL;
1340 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1341 			   rt->u.dst.expires) {
1342 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1343 						rt->fl.oif,
1344 						rt_genid(dev_net(dst->dev)));
1345 #if RT_CACHE_DEBUG >= 1
1346 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1347 					  NIPQUAD_FMT "/%02x dropped\n",
1348 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1349 #endif
1350 			rt_del(hash, rt);
1351 			ret = NULL;
1352 		}
1353 	}
1354 	return ret;
1355 }
1356 
1357 /*
1358  * Algorithm:
1359  *	1. The first ip_rt_redirect_number redirects are sent
1360  *	   with exponential backoff, then we stop sending them at all,
1361  *	   assuming that the host ignores our redirects.
1362  *	2. If we did not see packets requiring redirects
1363  *	   during ip_rt_redirect_silence, we assume that the host
1364  *	   forgot the redirected route, and we start sending redirects again.
1365  *
1366  * This algorithm is much cheaper and more intelligent than dumb load limiting
1367  * in icmp.c.
1368  *
1369  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1370  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1371  */
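/*
 * With the defaults above: the first redirect is sent immediately, the
 * next after ~40 ms, then ~80 ms, ~160 ms, ... doubling each time
 * (ip_rt_redirect_load << rate_tokens) until ip_rt_redirect_number (9)
 * have been sent; sending resumes only after roughly 20.5 s of silence
 * (ip_rt_redirect_silence = (HZ / 50) << 10).  Times are approximate and
 * subject to HZ rounding.
 */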
1372 
1373 void ip_rt_send_redirect(struct sk_buff *skb)
1374 {
1375 	struct rtable *rt = skb->rtable;
1376 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1377 
1378 	if (!in_dev)
1379 		return;
1380 
1381 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1382 		goto out;
1383 
1384 	/* No redirected packets during ip_rt_redirect_silence;
1385 	 * reset the algorithm.
1386 	 */
1387 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1388 		rt->u.dst.rate_tokens = 0;
1389 
1390 	/* Too many ignored redirects; do not send anything.
1391 	 * Set u.dst.rate_last to the last seen redirected packet.
1392 	 */
1393 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1394 		rt->u.dst.rate_last = jiffies;
1395 		goto out;
1396 	}
1397 
1398 	/* Check for load limit; set rate_last to the latest sent
1399 	 * redirect.
1400 	 */
1401 	if (rt->u.dst.rate_tokens == 0 ||
1402 	    time_after(jiffies,
1403 		       (rt->u.dst.rate_last +
1404 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1405 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1406 		rt->u.dst.rate_last = jiffies;
1407 		++rt->u.dst.rate_tokens;
1408 #ifdef CONFIG_IP_ROUTE_VERBOSE
1409 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1410 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1411 		    net_ratelimit())
1412 			printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1413 				"redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
1414 				NIPQUAD(rt->rt_src), rt->rt_iif,
1415 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1416 #endif
1417 	}
1418 out:
1419 	in_dev_put(in_dev);
1420 }
1421 
1422 static int ip_error(struct sk_buff *skb)
1423 {
1424 	struct rtable *rt = skb->rtable;
1425 	unsigned long now;
1426 	int code;
1427 
1428 	switch (rt->u.dst.error) {
1429 		case EINVAL:
1430 		default:
1431 			goto out;
1432 		case EHOSTUNREACH:
1433 			code = ICMP_HOST_UNREACH;
1434 			break;
1435 		case ENETUNREACH:
1436 			code = ICMP_NET_UNREACH;
1437 			IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1438 					IPSTATS_MIB_INNOROUTES);
1439 			break;
1440 		case EACCES:
1441 			code = ICMP_PKT_FILTERED;
1442 			break;
1443 	}
1444 
1445 	now = jiffies;
1446 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1447 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1448 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1449 	rt->u.dst.rate_last = now;
1450 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1451 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1452 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1453 	}
1454 
1455 out:	kfree_skb(skb);
1456 	return 0;
1457 }
1458 
1459 /*
1460  *	The last two values are not from the RFC but
1461  *	are needed for AMPRnet AX.25 paths.
1462  */
1463 
1464 static const unsigned short mtu_plateau[] =
1465 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1466 
1467 static inline unsigned short guess_mtu(unsigned short old_mtu)
1468 {
1469 	int i;
1470 
1471 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1472 		if (old_mtu > mtu_plateau[i])
1473 			return mtu_plateau[i];
1474 	return 68;
1475 }
1476 
1477 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1478 				 unsigned short new_mtu,
1479 				 struct net_device *dev)
1480 {
1481 	int i, k;
1482 	unsigned short old_mtu = ntohs(iph->tot_len);
1483 	struct rtable *rth;
1484 	int  ikeys[2] = { dev->ifindex, 0 };
1485 	__be32  skeys[2] = { iph->saddr, 0, };
1486 	__be32  daddr = iph->daddr;
1487 	unsigned short est_mtu = 0;
1488 
1489 	if (ipv4_config.no_pmtu_disc)
1490 		return 0;
1491 
1492 	for (k = 0; k < 2; k++) {
1493 		for (i = 0; i < 2; i++) {
1494 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1495 						rt_genid(net));
1496 
1497 			rcu_read_lock();
1498 			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1499 			     rth = rcu_dereference(rth->u.dst.rt_next)) {
1500 				unsigned short mtu = new_mtu;
1501 
1502 				if (rth->fl.fl4_dst != daddr ||
1503 				    rth->fl.fl4_src != skeys[i] ||
1504 				    rth->rt_dst != daddr ||
1505 				    rth->rt_src != iph->saddr ||
1506 				    rth->fl.oif != ikeys[k] ||
1507 				    rth->fl.iif != 0 ||
1508 				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1509 				    !net_eq(dev_net(rth->u.dst.dev), net) ||
1510 				    rt_is_expired(rth))
1511 					continue;
1512 
1513 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1514 
1515 					/* BSD 4.2 compatibility hack :-( */
1516 					if (mtu == 0 &&
1517 					    old_mtu >= dst_mtu(&rth->u.dst) &&
1518 					    old_mtu >= 68 + (iph->ihl << 2))
1519 						old_mtu -= iph->ihl << 2;
1520 
1521 					mtu = guess_mtu(old_mtu);
1522 				}
1523 				if (mtu <= dst_mtu(&rth->u.dst)) {
1524 					if (mtu < dst_mtu(&rth->u.dst)) {
1525 						dst_confirm(&rth->u.dst);
1526 						if (mtu < ip_rt_min_pmtu) {
1527 							mtu = ip_rt_min_pmtu;
1528 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1529 								(1 << RTAX_MTU);
1530 						}
1531 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1532 						dst_set_expires(&rth->u.dst,
1533 							ip_rt_mtu_expires);
1534 					}
1535 					est_mtu = mtu;
1536 				}
1537 			}
1538 			rcu_read_unlock();
1539 		}
1540 	}
1541 	return est_mtu ? : new_mtu;
1542 }
1543 
1544 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1545 {
1546 	if (dst_mtu(dst) > mtu && mtu >= 68 &&
1547 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1548 		if (mtu < ip_rt_min_pmtu) {
1549 			mtu = ip_rt_min_pmtu;
1550 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1551 		}
1552 		dst->metrics[RTAX_MTU-1] = mtu;
1553 		dst_set_expires(dst, ip_rt_mtu_expires);
1554 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1555 	}
1556 }
1557 
1558 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1559 {
1560 	return NULL;
1561 }
1562 
1563 static void ipv4_dst_destroy(struct dst_entry *dst)
1564 {
1565 	struct rtable *rt = (struct rtable *) dst;
1566 	struct inet_peer *peer = rt->peer;
1567 	struct in_device *idev = rt->idev;
1568 
1569 	if (peer) {
1570 		rt->peer = NULL;
1571 		inet_putpeer(peer);
1572 	}
1573 
1574 	if (idev) {
1575 		rt->idev = NULL;
1576 		in_dev_put(idev);
1577 	}
1578 }
1579 
1580 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1581 			    int how)
1582 {
1583 	struct rtable *rt = (struct rtable *) dst;
1584 	struct in_device *idev = rt->idev;
1585 	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1586 		struct in_device *loopback_idev =
1587 			in_dev_get(dev_net(dev)->loopback_dev);
1588 		if (loopback_idev) {
1589 			rt->idev = loopback_idev;
1590 			in_dev_put(idev);
1591 		}
1592 	}
1593 }
1594 
1595 static void ipv4_link_failure(struct sk_buff *skb)
1596 {
1597 	struct rtable *rt;
1598 
1599 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1600 
1601 	rt = skb->rtable;
1602 	if (rt)
1603 		dst_set_expires(&rt->u.dst, 0);
1604 }
1605 
1606 static int ip_rt_bug(struct sk_buff *skb)
1607 {
1608 	printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
1609 		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1610 		skb->dev ? skb->dev->name : "?");
1611 	kfree_skb(skb);
1612 	return 0;
1613 }
1614 
1615 /*
1616    We do not cache the source address of the outgoing interface,
1617    because it is used only by the IP RR, TS and SRR options,
1618    so it is out of the fast path.
1619 
1620    BTW remember: "addr" is allowed to be unaligned
1621    in IP options!
1622  */
1623 
1624 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1625 {
1626 	__be32 src;
1627 	struct fib_result res;
1628 
1629 	if (rt->fl.iif == 0)
1630 		src = rt->rt_src;
1631 	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1632 		src = FIB_RES_PREFSRC(res);
1633 		fib_res_put(&res);
1634 	} else
1635 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1636 					RT_SCOPE_UNIVERSE);
1637 	memcpy(addr, &src, 4);
1638 }
1639 
1640 #ifdef CONFIG_NET_CLS_ROUTE
1641 static void set_class_tag(struct rtable *rt, u32 tag)
1642 {
1643 	if (!(rt->u.dst.tclassid & 0xFFFF))
1644 		rt->u.dst.tclassid |= tag & 0xFFFF;
1645 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1646 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1647 }
1648 #endif
1649 
1650 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1651 {
1652 	struct fib_info *fi = res->fi;
1653 
1654 	if (fi) {
1655 		if (FIB_RES_GW(*res) &&
1656 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1657 			rt->rt_gateway = FIB_RES_GW(*res);
1658 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1659 		       sizeof(rt->u.dst.metrics));
1660 		if (fi->fib_mtu == 0) {
1661 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1662 			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1663 			    rt->rt_gateway != rt->rt_dst &&
1664 			    rt->u.dst.dev->mtu > 576)
1665 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1666 		}
1667 #ifdef CONFIG_NET_CLS_ROUTE
1668 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1669 #endif
1670 	} else
1671 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1672 
1673 	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1674 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1675 	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1676 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1677 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1678 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1679 				       ip_rt_min_advmss);
1680 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1681 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1682 
1683 #ifdef CONFIG_NET_CLS_ROUTE
1684 #ifdef CONFIG_IP_MULTIPLE_TABLES
1685 	set_class_tag(rt, fib_rules_tclass(res));
1686 #endif
1687 	set_class_tag(rt, itag);
1688 #endif
1689 	rt->rt_type = res->type;
1690 }
1691 
1692 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1693 				u8 tos, struct net_device *dev, int our)
1694 {
1695 	unsigned hash;
1696 	struct rtable *rth;
1697 	__be32 spec_dst;
1698 	struct in_device *in_dev = in_dev_get(dev);
1699 	u32 itag = 0;
1700 
1701 	/* Primary sanity checks. */
1702 
1703 	if (in_dev == NULL)
1704 		return -EINVAL;
1705 
1706 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1707 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1708 		goto e_inval;
1709 
1710 	if (ipv4_is_zeronet(saddr)) {
1711 		if (!ipv4_is_local_multicast(daddr))
1712 			goto e_inval;
1713 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1714 	} else if (fib_validate_source(saddr, 0, tos, 0,
1715 					dev, &spec_dst, &itag) < 0)
1716 		goto e_inval;
1717 
1718 	rth = dst_alloc(&ipv4_dst_ops);
1719 	if (!rth)
1720 		goto e_nobufs;
1721 
1722 	rth->u.dst.output= ip_rt_bug;
1723 
1724 	atomic_set(&rth->u.dst.__refcnt, 1);
1725 	rth->u.dst.flags= DST_HOST;
1726 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1727 		rth->u.dst.flags |= DST_NOPOLICY;
1728 	rth->fl.fl4_dst	= daddr;
1729 	rth->rt_dst	= daddr;
1730 	rth->fl.fl4_tos	= tos;
1731 	rth->fl.mark    = skb->mark;
1732 	rth->fl.fl4_src	= saddr;
1733 	rth->rt_src	= saddr;
1734 #ifdef CONFIG_NET_CLS_ROUTE
1735 	rth->u.dst.tclassid = itag;
1736 #endif
1737 	rth->rt_iif	=
1738 	rth->fl.iif	= dev->ifindex;
1739 	rth->u.dst.dev	= init_net.loopback_dev;
1740 	dev_hold(rth->u.dst.dev);
1741 	rth->idev	= in_dev_get(rth->u.dst.dev);
1742 	rth->fl.oif	= 0;
1743 	rth->rt_gateway	= daddr;
1744 	rth->rt_spec_dst= spec_dst;
1745 	rth->rt_genid	= rt_genid(dev_net(dev));
1746 	rth->rt_flags	= RTCF_MULTICAST;
1747 	rth->rt_type	= RTN_MULTICAST;
1748 	if (our) {
1749 		rth->u.dst.input= ip_local_deliver;
1750 		rth->rt_flags |= RTCF_LOCAL;
1751 	}
1752 
1753 #ifdef CONFIG_IP_MROUTE
1754 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1755 		rth->u.dst.input = ip_mr_input;
1756 #endif
1757 	RT_CACHE_STAT_INC(in_slow_mc);
1758 
1759 	in_dev_put(in_dev);
1760 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1761 	return rt_intern_hash(hash, rth, &skb->rtable);
1762 
1763 e_nobufs:
1764 	in_dev_put(in_dev);
1765 	return -ENOBUFS;
1766 
1767 e_inval:
1768 	in_dev_put(in_dev);
1769 	return -EINVAL;
1770 }
1771 
1772 
1773 static void ip_handle_martian_source(struct net_device *dev,
1774 				     struct in_device *in_dev,
1775 				     struct sk_buff *skb,
1776 				     __be32 daddr,
1777 				     __be32 saddr)
1778 {
1779 	RT_CACHE_STAT_INC(in_martian_src);
1780 #ifdef CONFIG_IP_ROUTE_VERBOSE
1781 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1782 		/*
1783 		 *	RFC 1812 recommendation: if the source is martian,
1784 		 *	the only hint is the MAC header.
1785 		 */
1786 		printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1787 			NIPQUAD_FMT", on dev %s\n",
1788 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1789 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1790 			int i;
1791 			const unsigned char *p = skb_mac_header(skb);
1792 			printk(KERN_WARNING "ll header: ");
1793 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1794 				printk("%02x", *p);
1795 				if (i < (dev->hard_header_len - 1))
1796 					printk(":");
1797 			}
1798 			printk("\n");
1799 		}
1800 	}
1801 #endif
1802 }
1803 
1804 static int __mkroute_input(struct sk_buff *skb,
1805 			   struct fib_result *res,
1806 			   struct in_device *in_dev,
1807 			   __be32 daddr, __be32 saddr, u32 tos,
1808 			   struct rtable **result)
1809 {
1810 
1811 	struct rtable *rth;
1812 	int err;
1813 	struct in_device *out_dev;
1814 	unsigned flags = 0;
1815 	__be32 spec_dst;
1816 	u32 itag;
1817 
1818 	/* get a working reference to the output device */
1819 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1820 	if (out_dev == NULL) {
1821 		if (net_ratelimit())
1822 			printk(KERN_CRIT "Bug in ip_route_input" \
1823 			       "_slow(). Please, report\n");
1824 		return -EINVAL;
1825 	}
1826 
1827 
1828 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1829 				  in_dev->dev, &spec_dst, &itag);
1830 	if (err < 0) {
1831 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1832 					 saddr);
1833 
1834 		err = -EINVAL;
1835 		goto cleanup;
1836 	}
1837 
1838 	if (err)
1839 		flags |= RTCF_DIRECTSRC;
1840 
1841 	if (out_dev == in_dev && err &&
1842 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1843 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1844 		flags |= RTCF_DOREDIRECT;
1845 
1846 	if (skb->protocol != htons(ETH_P_IP)) {
1847 		/* Not IP (i.e. ARP). Do not create a route if it is
1848 		 * invalid for proxy ARP. DNAT routes are always valid.
1849 		 */
1850 		if (out_dev == in_dev) {
1851 			err = -EINVAL;
1852 			goto cleanup;
1853 		}
1854 	}
1855 
1856 
1857 	rth = dst_alloc(&ipv4_dst_ops);
1858 	if (!rth) {
1859 		err = -ENOBUFS;
1860 		goto cleanup;
1861 	}
1862 
1863 	atomic_set(&rth->u.dst.__refcnt, 1);
1864 	rth->u.dst.flags= DST_HOST;
1865 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1866 		rth->u.dst.flags |= DST_NOPOLICY;
1867 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1868 		rth->u.dst.flags |= DST_NOXFRM;
1869 	rth->fl.fl4_dst	= daddr;
1870 	rth->rt_dst	= daddr;
1871 	rth->fl.fl4_tos	= tos;
1872 	rth->fl.mark    = skb->mark;
1873 	rth->fl.fl4_src	= saddr;
1874 	rth->rt_src	= saddr;
1875 	rth->rt_gateway	= daddr;
1876 	rth->rt_iif 	=
1877 		rth->fl.iif	= in_dev->dev->ifindex;
1878 	rth->u.dst.dev	= (out_dev)->dev;
1879 	dev_hold(rth->u.dst.dev);
1880 	rth->idev	= in_dev_get(rth->u.dst.dev);
1881 	rth->fl.oif 	= 0;
1882 	rth->rt_spec_dst= spec_dst;
1883 
1884 	rth->u.dst.input = ip_forward;
1885 	rth->u.dst.output = ip_output;
1886 	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
1887 
1888 	rt_set_nexthop(rth, res, itag);
1889 
1890 	rth->rt_flags = flags;
1891 
1892 	*result = rth;
1893 	err = 0;
1894  cleanup:
1895 	/* release the working reference to the output device */
1896 	in_dev_put(out_dev);
1897 	return err;
1898 }
1899 
1900 static int ip_mkroute_input(struct sk_buff *skb,
1901 			    struct fib_result *res,
1902 			    const struct flowi *fl,
1903 			    struct in_device *in_dev,
1904 			    __be32 daddr, __be32 saddr, u32 tos)
1905 {
1906 	struct rtable* rth = NULL;
1907 	int err;
1908 	unsigned hash;
1909 
1910 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1911 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1912 		fib_select_multipath(fl, res);
1913 #endif
1914 
1915 	/* create a routing cache entry */
1916 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1917 	if (err)
1918 		return err;
1919 
1920 	/* put it into the cache */
1921 	hash = rt_hash(daddr, saddr, fl->iif,
1922 		       rt_genid(dev_net(rth->u.dst.dev)));
1923 	return rt_intern_hash(hash, rth, &skb->rtable);
1924 }
1925 
1926 /*
1927  *	NOTE. We drop all packets that have local source
1928  *	addresses, because every properly looped-back packet
1929  *	must already have the correct destination attached by the output routine.
1930  *
1931  *	Such an approach solves two big problems:
1932  *	1. Non-simplex devices are handled properly.
1933  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1934  */
1935 
1936 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1937 			       u8 tos, struct net_device *dev)
1938 {
1939 	struct fib_result res;
1940 	struct in_device *in_dev = in_dev_get(dev);
1941 	struct flowi fl = { .nl_u = { .ip4_u =
1942 				      { .daddr = daddr,
1943 					.saddr = saddr,
1944 					.tos = tos,
1945 					.scope = RT_SCOPE_UNIVERSE,
1946 				      } },
1947 			    .mark = skb->mark,
1948 			    .iif = dev->ifindex };
1949 	unsigned	flags = 0;
1950 	u32		itag = 0;
1951 	struct rtable * rth;
1952 	unsigned	hash;
1953 	__be32		spec_dst;
1954 	int		err = -EINVAL;
1955 	int		free_res = 0;
1956 	struct net    * net = dev_net(dev);
1957 
1958 	/* IP on this device is disabled. */
1959 
1960 	if (!in_dev)
1961 		goto out;
1962 
1963 	/* Check for the weirdest martians, which cannot be detected
1964 	   by fib_lookup.
1965 	 */
1966 
1967 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1968 	    ipv4_is_loopback(saddr))
1969 		goto martian_source;
1970 
1971 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1972 		goto brd_input;
1973 
1974 	/* Accept zero addresses only for limited broadcast;
1975 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
1976 	 */
1977 	if (ipv4_is_zeronet(saddr))
1978 		goto martian_source;
1979 
1980 	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1981 	    ipv4_is_loopback(daddr))
1982 		goto martian_destination;
1983 
1984 	/*
1985 	 *	Now we are ready to route packet.
1986 	 */
1987 	if ((err = fib_lookup(net, &fl, &res)) != 0) {
1988 		if (!IN_DEV_FORWARD(in_dev))
1989 			goto e_hostunreach;
1990 		goto no_route;
1991 	}
1992 	free_res = 1;
1993 
1994 	RT_CACHE_STAT_INC(in_slow_tot);
1995 
1996 	if (res.type == RTN_BROADCAST)
1997 		goto brd_input;
1998 
1999 	if (res.type == RTN_LOCAL) {
2000 		int result;
2001 		result = fib_validate_source(saddr, daddr, tos,
2002 					     net->loopback_dev->ifindex,
2003 					     dev, &spec_dst, &itag);
2004 		if (result < 0)
2005 			goto martian_source;
2006 		if (result)
2007 			flags |= RTCF_DIRECTSRC;
2008 		spec_dst = daddr;
2009 		goto local_input;
2010 	}
2011 
2012 	if (!IN_DEV_FORWARD(in_dev))
2013 		goto e_hostunreach;
2014 	if (res.type != RTN_UNICAST)
2015 		goto martian_destination;
2016 
2017 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2018 done:
2019 	in_dev_put(in_dev);
2020 	if (free_res)
2021 		fib_res_put(&res);
2022 out:	return err;
2023 
2024 brd_input:
2025 	if (skb->protocol != htons(ETH_P_IP))
2026 		goto e_inval;
2027 
2028 	if (ipv4_is_zeronet(saddr))
2029 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2030 	else {
2031 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2032 					  &itag);
2033 		if (err < 0)
2034 			goto martian_source;
2035 		if (err)
2036 			flags |= RTCF_DIRECTSRC;
2037 	}
2038 	flags |= RTCF_BROADCAST;
2039 	res.type = RTN_BROADCAST;
2040 	RT_CACHE_STAT_INC(in_brd);
2041 
2042 local_input:
2043 	rth = dst_alloc(&ipv4_dst_ops);
2044 	if (!rth)
2045 		goto e_nobufs;
2046 
2047 	rth->u.dst.output= ip_rt_bug;
2048 	rth->rt_genid = rt_genid(net);
2049 
2050 	atomic_set(&rth->u.dst.__refcnt, 1);
2051 	rth->u.dst.flags= DST_HOST;
2052 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2053 		rth->u.dst.flags |= DST_NOPOLICY;
2054 	rth->fl.fl4_dst	= daddr;
2055 	rth->rt_dst	= daddr;
2056 	rth->fl.fl4_tos	= tos;
2057 	rth->fl.mark    = skb->mark;
2058 	rth->fl.fl4_src	= saddr;
2059 	rth->rt_src	= saddr;
2060 #ifdef CONFIG_NET_CLS_ROUTE
2061 	rth->u.dst.tclassid = itag;
2062 #endif
2063 	rth->rt_iif	=
2064 	rth->fl.iif	= dev->ifindex;
2065 	rth->u.dst.dev	= net->loopback_dev;
2066 	dev_hold(rth->u.dst.dev);
2067 	rth->idev	= in_dev_get(rth->u.dst.dev);
2068 	rth->rt_gateway	= daddr;
2069 	rth->rt_spec_dst= spec_dst;
2070 	rth->u.dst.input= ip_local_deliver;
2071 	rth->rt_flags 	= flags|RTCF_LOCAL;
2072 	if (res.type == RTN_UNREACHABLE) {
2073 		rth->u.dst.input= ip_error;
2074 		rth->u.dst.error= -err;
2075 		rth->rt_flags 	&= ~RTCF_LOCAL;
2076 	}
2077 	rth->rt_type	= res.type;
2078 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2079 	err = rt_intern_hash(hash, rth, &skb->rtable);
2080 	goto done;
2081 
2082 no_route:
2083 	RT_CACHE_STAT_INC(in_no_route);
2084 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2085 	res.type = RTN_UNREACHABLE;
2086 	if (err == -ESRCH)
2087 		err = -ENETUNREACH;
2088 	goto local_input;
2089 
2090 	/*
2091 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2092 	 */
2093 martian_destination:
2094 	RT_CACHE_STAT_INC(in_martian_dst);
2095 #ifdef CONFIG_IP_ROUTE_VERBOSE
2096 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2097 		printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2098 			NIPQUAD_FMT ", dev %s\n",
2099 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2100 #endif
2101 
2102 e_hostunreach:
2103 	err = -EHOSTUNREACH;
2104 	goto done;
2105 
2106 e_inval:
2107 	err = -EINVAL;
2108 	goto done;
2109 
2110 e_nobufs:
2111 	err = -ENOBUFS;
2112 	goto done;
2113 
2114 martian_source:
2115 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2116 	goto e_inval;
2117 }
2118 
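/*
 * Route an incoming packet: first probe the route cache under RCU with the
 * (daddr, saddr, iif, tos, mark) key, then handle multicast specially and
 * finally fall back to ip_route_input_slow() for a full FIB lookup.
 *
 * Hedged usage sketch (illustrative, not copied from a real caller): the
 * receive path typically does something like
 *
 *	if (ip_route_input(skb, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr,
 *			   ip_hdr(skb)->tos, skb->dev))
 *		goto drop;
 *	return dst_input(skb);
 */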
2119 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2120 		   u8 tos, struct net_device *dev)
2121 {
2122 	struct rtable * rth;
2123 	unsigned	hash;
2124 	int iif = dev->ifindex;
2125 	struct net *net;
2126 
2127 	net = dev_net(dev);
2128 	tos &= IPTOS_RT_MASK;
2129 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2130 
2131 	rcu_read_lock();
2132 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2133 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
2134 		if (((rth->fl.fl4_dst ^ daddr) |
2135 		     (rth->fl.fl4_src ^ saddr) |
2136 		     (rth->fl.iif ^ iif) |
2137 		     rth->fl.oif |
2138 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
2139 		    rth->fl.mark == skb->mark &&
2140 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2141 		    !rt_is_expired(rth)) {
2142 			dst_use(&rth->u.dst, jiffies);
2143 			RT_CACHE_STAT_INC(in_hit);
2144 			rcu_read_unlock();
2145 			skb->rtable = rth;
2146 			return 0;
2147 		}
2148 		RT_CACHE_STAT_INC(in_hlist_search);
2149 	}
2150 	rcu_read_unlock();
2151 
2152 	/* Multicast recognition logic was moved from the route cache to here.
2153 	   The problem was that too many Ethernet cards have broken/missing
2154 	   hardware multicast filters :-( As a result, a host on a multicast
2155 	   network acquires a lot of useless route cache entries, e.g. for
2156 	   SDR messages from all over the world. Now we try to get rid of them.
2157 	   Really, provided the software IP multicast filter is organized
2158 	   reasonably (at least, hashed), it does not result in a slowdown
2159 	   compared with route cache reject entries.
2160 	   Note that multicast routers are not affected, because a
2161 	   route cache entry is created eventually.
2162 	 */
2163 	if (ipv4_is_multicast(daddr)) {
2164 		struct in_device *in_dev;
2165 
2166 		rcu_read_lock();
2167 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2168 			int our = ip_check_mc(in_dev, daddr, saddr,
2169 				ip_hdr(skb)->protocol);
2170 			if (our
2171 #ifdef CONFIG_IP_MROUTE
2172 			    || (!ipv4_is_local_multicast(daddr) &&
2173 				IN_DEV_MFORWARD(in_dev))
2174 #endif
2175 			    ) {
2176 				rcu_read_unlock();
2177 				return ip_route_input_mc(skb, daddr, saddr,
2178 							 tos, dev, our);
2179 			}
2180 		}
2181 		rcu_read_unlock();
2182 		return -EINVAL;
2183 	}
2184 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2185 }
2186 
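/*
 * Build an output route cache entry for the flow described by fl/oldflp
 * leaving via dev_out: classify the destination (broadcast/multicast),
 * allocate the dst, fill in the flow key, devices and handlers (ip_output,
 * ip_local_deliver, ip_mc_output, or ip_mr_input when multicast forwarding
 * is enabled) and set the nexthop from *res.
 */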
2187 static int __mkroute_output(struct rtable **result,
2188 			    struct fib_result *res,
2189 			    const struct flowi *fl,
2190 			    const struct flowi *oldflp,
2191 			    struct net_device *dev_out,
2192 			    unsigned flags)
2193 {
2194 	struct rtable *rth;
2195 	struct in_device *in_dev;
2196 	u32 tos = RT_FL_TOS(oldflp);
2197 	int err = 0;
2198 
2199 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2200 		return -EINVAL;
2201 
2202 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2203 		res->type = RTN_BROADCAST;
2204 	else if (ipv4_is_multicast(fl->fl4_dst))
2205 		res->type = RTN_MULTICAST;
2206 	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2207 		return -EINVAL;
2208 
2209 	if (dev_out->flags & IFF_LOOPBACK)
2210 		flags |= RTCF_LOCAL;
2211 
2212 	/* get a working reference to the inet device */
2213 	in_dev = in_dev_get(dev_out);
2214 	if (!in_dev)
2215 		return -EINVAL;
2216 
2217 	if (res->type == RTN_BROADCAST) {
2218 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2219 		if (res->fi) {
2220 			fib_info_put(res->fi);
2221 			res->fi = NULL;
2222 		}
2223 	} else if (res->type == RTN_MULTICAST) {
2224 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2225 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2226 				 oldflp->proto))
2227 			flags &= ~RTCF_LOCAL;
2228 		/* If a multicast route does not exist, use the
2229 		   default one, but do not gateway in this case.
2230 		   Yes, it is a hack.
2231 		 */
2232 		if (res->fi && res->prefixlen < 4) {
2233 			fib_info_put(res->fi);
2234 			res->fi = NULL;
2235 		}
2236 	}
2237 
2238 
2239 	rth = dst_alloc(&ipv4_dst_ops);
2240 	if (!rth) {
2241 		err = -ENOBUFS;
2242 		goto cleanup;
2243 	}
2244 
2245 	atomic_set(&rth->u.dst.__refcnt, 1);
2246 	rth->u.dst.flags= DST_HOST;
2247 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2248 		rth->u.dst.flags |= DST_NOXFRM;
2249 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2250 		rth->u.dst.flags |= DST_NOPOLICY;
2251 
2252 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2253 	rth->fl.fl4_tos	= tos;
2254 	rth->fl.fl4_src	= oldflp->fl4_src;
2255 	rth->fl.oif	= oldflp->oif;
2256 	rth->fl.mark    = oldflp->mark;
2257 	rth->rt_dst	= fl->fl4_dst;
2258 	rth->rt_src	= fl->fl4_src;
2259 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2260 	/* get references to the devices that are to be held by the routing
2261 	   cache entry */
2262 	rth->u.dst.dev	= dev_out;
2263 	dev_hold(dev_out);
2264 	rth->idev	= in_dev_get(dev_out);
2265 	rth->rt_gateway = fl->fl4_dst;
2266 	rth->rt_spec_dst= fl->fl4_src;
2267 
2268 	rth->u.dst.output=ip_output;
2269 	rth->rt_genid = rt_genid(dev_net(dev_out));
2270 
2271 	RT_CACHE_STAT_INC(out_slow_tot);
2272 
2273 	if (flags & RTCF_LOCAL) {
2274 		rth->u.dst.input = ip_local_deliver;
2275 		rth->rt_spec_dst = fl->fl4_dst;
2276 	}
2277 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2278 		rth->rt_spec_dst = fl->fl4_src;
2279 		if (flags & RTCF_LOCAL &&
2280 		    !(dev_out->flags & IFF_LOOPBACK)) {
2281 			rth->u.dst.output = ip_mc_output;
2282 			RT_CACHE_STAT_INC(out_slow_mc);
2283 		}
2284 #ifdef CONFIG_IP_MROUTE
2285 		if (res->type == RTN_MULTICAST) {
2286 			if (IN_DEV_MFORWARD(in_dev) &&
2287 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2288 				rth->u.dst.input = ip_mr_input;
2289 				rth->u.dst.output = ip_mc_output;
2290 			}
2291 		}
2292 #endif
2293 	}
2294 
2295 	rt_set_nexthop(rth, res, 0);
2296 
2297 	rth->rt_flags = flags;
2298 
2299 	*result = rth;
2300  cleanup:
2301 	/* release the working reference to the inet device */
2302 	in_dev_put(in_dev);
2303 
2304 	return err;
2305 }
2306 
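/*
 * Wrapper around __mkroute_output(): on success the new entry is hashed on
 * (dst, src, oif) and inserted into the route cache, with *rp pointing at
 * the cached result.
 */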
2307 static int ip_mkroute_output(struct rtable **rp,
2308 			     struct fib_result *res,
2309 			     const struct flowi *fl,
2310 			     const struct flowi *oldflp,
2311 			     struct net_device *dev_out,
2312 			     unsigned flags)
2313 {
2314 	struct rtable *rth = NULL;
2315 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2316 	unsigned hash;
2317 	if (err == 0) {
2318 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2319 			       rt_genid(dev_net(dev_out)));
2320 		err = rt_intern_hash(hash, rth, rp);
2321 	}
2322 
2323 	return err;
2324 }
2325 
2326 /*
2327  * Major route resolver routine.
2328  */
2329 
2330 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2331 				const struct flowi *oldflp)
2332 {
2333 	u32 tos	= RT_FL_TOS(oldflp);
2334 	struct flowi fl = { .nl_u = { .ip4_u =
2335 				      { .daddr = oldflp->fl4_dst,
2336 					.saddr = oldflp->fl4_src,
2337 					.tos = tos & IPTOS_RT_MASK,
2338 					.scope = ((tos & RTO_ONLINK) ?
2339 						  RT_SCOPE_LINK :
2340 						  RT_SCOPE_UNIVERSE),
2341 				      } },
2342 			    .mark = oldflp->mark,
2343 			    .iif = net->loopback_dev->ifindex,
2344 			    .oif = oldflp->oif };
2345 	struct fib_result res;
2346 	unsigned flags = 0;
2347 	struct net_device *dev_out = NULL;
2348 	int free_res = 0;
2349 	int err;
2350 
2351 
2352 	res.fi		= NULL;
2353 #ifdef CONFIG_IP_MULTIPLE_TABLES
2354 	res.r		= NULL;
2355 #endif
2356 
2357 	if (oldflp->fl4_src) {
2358 		err = -EINVAL;
2359 		if (ipv4_is_multicast(oldflp->fl4_src) ||
2360 		    ipv4_is_lbcast(oldflp->fl4_src) ||
2361 		    ipv4_is_zeronet(oldflp->fl4_src))
2362 			goto out;
2363 
2364 		/* I removed the check for oif == dev_out->oif here.
2365 		   It was wrong for two reasons:
2366 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2367 		      is assigned to multiple interfaces.
2368 		   2. Moreover, we are allowed to send packets with a saddr
2369 		      of another iface. --ANK
2370 		 */
2371 
2372 		if (oldflp->oif == 0
2373 		    && (ipv4_is_multicast(oldflp->fl4_dst) ||
2374 			oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2375 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2376 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2377 			if (dev_out == NULL)
2378 				goto out;
2379 
2380 			/* Special hack: the user can direct multicasts
2381 			   and limited broadcasts via the necessary interface
2382 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2383 			   This hack is not just for fun, it allows
2384 			   vic, vat and friends to work.
2385 			   They bind a socket to loopback, set the ttl to zero
2386 			   and expect that it will work.
2387 			   From the viewpoint of the routing cache they are broken,
2388 			   because we are not allowed to build a multicast path
2389 			   with a loopback source addr (look, the routing cache
2390 			   cannot know that the ttl is zero, so the packet
2391 			   will not leave this host and the route is valid).
2392 			   Luckily, this hack is a good workaround.
2393 			 */
2394 
2395 			fl.oif = dev_out->ifindex;
2396 			goto make_route;
2397 		}
2398 
2399 		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2400 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2401 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2402 			if (dev_out == NULL)
2403 				goto out;
2404 			dev_put(dev_out);
2405 			dev_out = NULL;
2406 		}
2407 	}
2408 
2409 
2410 	if (oldflp->oif) {
2411 		dev_out = dev_get_by_index(net, oldflp->oif);
2412 		err = -ENODEV;
2413 		if (dev_out == NULL)
2414 			goto out;
2415 
2416 		/* RACE: Check return value of inet_select_addr instead. */
2417 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2418 			dev_put(dev_out);
2419 			goto out;	/* Wrong error code */
2420 		}
2421 
2422 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2423 		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2424 			if (!fl.fl4_src)
2425 				fl.fl4_src = inet_select_addr(dev_out, 0,
2426 							      RT_SCOPE_LINK);
2427 			goto make_route;
2428 		}
2429 		if (!fl.fl4_src) {
2430 			if (ipv4_is_multicast(oldflp->fl4_dst))
2431 				fl.fl4_src = inet_select_addr(dev_out, 0,
2432 							      fl.fl4_scope);
2433 			else if (!oldflp->fl4_dst)
2434 				fl.fl4_src = inet_select_addr(dev_out, 0,
2435 							      RT_SCOPE_HOST);
2436 		}
2437 	}
2438 
2439 	if (!fl.fl4_dst) {
2440 		fl.fl4_dst = fl.fl4_src;
2441 		if (!fl.fl4_dst)
2442 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2443 		if (dev_out)
2444 			dev_put(dev_out);
2445 		dev_out = net->loopback_dev;
2446 		dev_hold(dev_out);
2447 		fl.oif = net->loopback_dev->ifindex;
2448 		res.type = RTN_LOCAL;
2449 		flags |= RTCF_LOCAL;
2450 		goto make_route;
2451 	}
2452 
2453 	if (fib_lookup(net, &fl, &res)) {
2454 		res.fi = NULL;
2455 		if (oldflp->oif) {
2456 			/* Apparently, the routing tables are wrong. Assume
2457 			   that the destination is on-link.
2458 
2459 			   WHY? DW.
2460 			   Because we are allowed to send to an iface
2461 			   even if it has NO routes and NO assigned
2462 			   addresses. When oif is specified, routing
2463 			   tables are looked up with only one purpose:
2464 			   to check whether the destination is gatewayed, rather than
2465 			   direct. Moreover, if MSG_DONTROUTE is set,
2466 			   we send the packet, ignoring both routing tables
2467 			   and ifaddr state. --ANK
2468 
2469 
2470 			   We could do this even when oif is unknown,
2471 			   as IPv6 likely does, but we do not.
2472 			 */
2473 
2474 			if (fl.fl4_src == 0)
2475 				fl.fl4_src = inet_select_addr(dev_out, 0,
2476 							      RT_SCOPE_LINK);
2477 			res.type = RTN_UNICAST;
2478 			goto make_route;
2479 		}
2480 		if (dev_out)
2481 			dev_put(dev_out);
2482 		err = -ENETUNREACH;
2483 		goto out;
2484 	}
2485 	free_res = 1;
2486 
2487 	if (res.type == RTN_LOCAL) {
2488 		if (!fl.fl4_src)
2489 			fl.fl4_src = fl.fl4_dst;
2490 		if (dev_out)
2491 			dev_put(dev_out);
2492 		dev_out = net->loopback_dev;
2493 		dev_hold(dev_out);
2494 		fl.oif = dev_out->ifindex;
2495 		if (res.fi)
2496 			fib_info_put(res.fi);
2497 		res.fi = NULL;
2498 		flags |= RTCF_LOCAL;
2499 		goto make_route;
2500 	}
2501 
2502 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2503 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2504 		fib_select_multipath(&fl, &res);
2505 	else
2506 #endif
2507 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2508 		fib_select_default(net, &fl, &res);
2509 
2510 	if (!fl.fl4_src)
2511 		fl.fl4_src = FIB_RES_PREFSRC(res);
2512 
2513 	if (dev_out)
2514 		dev_put(dev_out);
2515 	dev_out = FIB_RES_DEV(res);
2516 	dev_hold(dev_out);
2517 	fl.oif = dev_out->ifindex;
2518 
2519 
2520 make_route:
2521 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2522 
2523 
2524 	if (free_res)
2525 		fib_res_put(&res);
2526 	if (dev_out)
2527 		dev_put(dev_out);
2528 out:	return err;
2529 }
2530 
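/*
 * Output route lookup without the xfrm step: probe the route cache under
 * rcu_read_lock_bh() using the (dst, src, oif, tos, mark) key and fall
 * back to ip_route_output_slow() on a miss.
 */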
2531 int __ip_route_output_key(struct net *net, struct rtable **rp,
2532 			  const struct flowi *flp)
2533 {
2534 	unsigned hash;
2535 	struct rtable *rth;
2536 
2537 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2538 
2539 	rcu_read_lock_bh();
2540 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2541 		rth = rcu_dereference(rth->u.dst.rt_next)) {
2542 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2543 		    rth->fl.fl4_src == flp->fl4_src &&
2544 		    rth->fl.iif == 0 &&
2545 		    rth->fl.oif == flp->oif &&
2546 		    rth->fl.mark == flp->mark &&
2547 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2548 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2549 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2550 		    !rt_is_expired(rth)) {
2551 			dst_use(&rth->u.dst, jiffies);
2552 			RT_CACHE_STAT_INC(out_hit);
2553 			rcu_read_unlock_bh();
2554 			*rp = rth;
2555 			return 0;
2556 		}
2557 		RT_CACHE_STAT_INC(out_hlist_search);
2558 	}
2559 	rcu_read_unlock_bh();
2560 
2561 	return ip_route_output_slow(net, rp, flp);
2562 }
2563 
2564 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2565 
2566 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2567 {
2568 }
2569 
2570 static struct dst_ops ipv4_dst_blackhole_ops = {
2571 	.family			=	AF_INET,
2572 	.protocol		=	__constant_htons(ETH_P_IP),
2573 	.destroy		=	ipv4_dst_destroy,
2574 	.check			=	ipv4_dst_check,
2575 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2576 	.entry_size		=	sizeof(struct rtable),
2577 	.entries		=	ATOMIC_INIT(0),
2578 };
2579 
2580 
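/*
 * Replace *rp with a "blackhole" copy of the route: same flow key and
 * metrics, but input/output handlers that discard packets.  Used when an
 * xfrm lookup returns -EREMOTE so that callers still get a usable dst.
 */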
2581 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2582 {
2583 	struct rtable *ort = *rp;
2584 	struct rtable *rt = (struct rtable *)
2585 		dst_alloc(&ipv4_dst_blackhole_ops);
2586 
2587 	if (rt) {
2588 		struct dst_entry *new = &rt->u.dst;
2589 
2590 		atomic_set(&new->__refcnt, 1);
2591 		new->__use = 1;
2592 		new->input = dst_discard;
2593 		new->output = dst_discard;
2594 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2595 
2596 		new->dev = ort->u.dst.dev;
2597 		if (new->dev)
2598 			dev_hold(new->dev);
2599 
2600 		rt->fl = ort->fl;
2601 
2602 		rt->idev = ort->idev;
2603 		if (rt->idev)
2604 			in_dev_hold(rt->idev);
2605 		rt->rt_genid = rt_genid(net);
2606 		rt->rt_flags = ort->rt_flags;
2607 		rt->rt_type = ort->rt_type;
2608 		rt->rt_dst = ort->rt_dst;
2609 		rt->rt_src = ort->rt_src;
2610 		rt->rt_iif = ort->rt_iif;
2611 		rt->rt_gateway = ort->rt_gateway;
2612 		rt->rt_spec_dst = ort->rt_spec_dst;
2613 		rt->peer = ort->peer;
2614 		if (rt->peer)
2615 			atomic_inc(&rt->peer->refcnt);
2616 
2617 		dst_free(new);
2618 	}
2619 
2620 	dst_release(&(*rp)->u.dst);
2621 	*rp = rt;
2622 	return (rt ? 0 : -ENOMEM);
2623 }
2624 
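/*
 * Resolve an output route for flp and, when a transport protocol is set,
 * run the result through the xfrm (IPsec) lookup; a lookup blocked by
 * policy (-EREMOTE) is converted into a blackhole route.
 *
 * Hedged sketch of a typical caller (names and values below are purely
 * illustrative):
 *
 *	struct flowi fl = { .oif = 0,
 *			    .nl_u = { .ip4_u = { .daddr = dst,
 *						 .saddr = 0,
 *						 .tos = RT_TOS(tos) } },
 *			    .proto = IPPROTO_UDP };
 *	struct rtable *rt;
 *	int err = ip_route_output_flow(net, &rt, &fl, sk, 0);
 *	if (!err) {
 *		... transmit via rt->u.dst, then ip_rt_put(rt) ...
 *	}
 */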
2625 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2626 			 struct sock *sk, int flags)
2627 {
2628 	int err;
2629 
2630 	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2631 		return err;
2632 
2633 	if (flp->proto) {
2634 		if (!flp->fl4_src)
2635 			flp->fl4_src = (*rp)->rt_src;
2636 		if (!flp->fl4_dst)
2637 			flp->fl4_dst = (*rp)->rt_dst;
2638 		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2639 				    flags ? XFRM_LOOKUP_WAIT : 0);
2640 		if (err == -EREMOTE)
2641 			err = ipv4_dst_blackhole(net, rp, flp);
2642 
2643 		return err;
2644 	}
2645 
2646 	return 0;
2647 }
2648 
2649 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2650 
2651 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2652 {
2653 	return ip_route_output_flow(net, rp, flp, NULL, 0);
2654 }
2655 
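/*
 * Fill a netlink RTM_NEWROUTE message describing skb->rtable: flow key,
 * devices, gateway, metrics, peer/cache info and, for input routes,
 * multicast forwarding state where applicable.
 */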
2656 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2657 			int nowait, unsigned int flags)
2658 {
2659 	struct rtable *rt = skb->rtable;
2660 	struct rtmsg *r;
2661 	struct nlmsghdr *nlh;
2662 	long expires;
2663 	u32 id = 0, ts = 0, tsage = 0, error;
2664 
2665 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2666 	if (nlh == NULL)
2667 		return -EMSGSIZE;
2668 
2669 	r = nlmsg_data(nlh);
2670 	r->rtm_family	 = AF_INET;
2671 	r->rtm_dst_len	= 32;
2672 	r->rtm_src_len	= 0;
2673 	r->rtm_tos	= rt->fl.fl4_tos;
2674 	r->rtm_table	= RT_TABLE_MAIN;
2675 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2676 	r->rtm_type	= rt->rt_type;
2677 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2678 	r->rtm_protocol = RTPROT_UNSPEC;
2679 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2680 	if (rt->rt_flags & RTCF_NOTIFY)
2681 		r->rtm_flags |= RTM_F_NOTIFY;
2682 
2683 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2684 
2685 	if (rt->fl.fl4_src) {
2686 		r->rtm_src_len = 32;
2687 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2688 	}
2689 	if (rt->u.dst.dev)
2690 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2691 #ifdef CONFIG_NET_CLS_ROUTE
2692 	if (rt->u.dst.tclassid)
2693 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2694 #endif
2695 	if (rt->fl.iif)
2696 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2697 	else if (rt->rt_src != rt->fl.fl4_src)
2698 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2699 
2700 	if (rt->rt_dst != rt->rt_gateway)
2701 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2702 
2703 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2704 		goto nla_put_failure;
2705 
2706 	error = rt->u.dst.error;
2707 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2708 	if (rt->peer) {
2709 		id = rt->peer->ip_id_count;
2710 		if (rt->peer->tcp_ts_stamp) {
2711 			ts = rt->peer->tcp_ts;
2712 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2713 		}
2714 	}
2715 
2716 	if (rt->fl.iif) {
2717 #ifdef CONFIG_IP_MROUTE
2718 		__be32 dst = rt->rt_dst;
2719 
2720 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2721 		    IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2722 			int err = ipmr_get_route(skb, r, nowait);
2723 			if (err <= 0) {
2724 				if (!nowait) {
2725 					if (err == 0)
2726 						return 0;
2727 					goto nla_put_failure;
2728 				} else {
2729 					if (err == -EMSGSIZE)
2730 						goto nla_put_failure;
2731 					error = err;
2732 				}
2733 			}
2734 		} else
2735 #endif
2736 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2737 	}
2738 
2739 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2740 			       expires, error) < 0)
2741 		goto nla_put_failure;
2742 
2743 	return nlmsg_end(skb, nlh);
2744 
2745 nla_put_failure:
2746 	nlmsg_cancel(skb, nlh);
2747 	return -EMSGSIZE;
2748 }
2749 
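/*
 * RTM_GETROUTE handler: build a dummy skb, perform either an input route
 * lookup (when RTA_IIF is given) or an output route lookup, and unicast
 * the result back to the requester via rt_fill_info().
 */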
2750 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2751 {
2752 	struct net *net = sock_net(in_skb->sk);
2753 	struct rtmsg *rtm;
2754 	struct nlattr *tb[RTA_MAX+1];
2755 	struct rtable *rt = NULL;
2756 	__be32 dst = 0;
2757 	__be32 src = 0;
2758 	u32 iif;
2759 	int err;
2760 	struct sk_buff *skb;
2761 
2762 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2763 	if (err < 0)
2764 		goto errout;
2765 
2766 	rtm = nlmsg_data(nlh);
2767 
2768 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2769 	if (skb == NULL) {
2770 		err = -ENOBUFS;
2771 		goto errout;
2772 	}
2773 
2774 	/* Reserve room for dummy headers; this skb can pass
2775 	   through a good chunk of the routing engine.
2776 	 */
2777 	skb_reset_mac_header(skb);
2778 	skb_reset_network_header(skb);
2779 
2780 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2781 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2782 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2783 
2784 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2785 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2786 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2787 
2788 	if (iif) {
2789 		struct net_device *dev;
2790 
2791 		dev = __dev_get_by_index(net, iif);
2792 		if (dev == NULL) {
2793 			err = -ENODEV;
2794 			goto errout_free;
2795 		}
2796 
2797 		skb->protocol	= htons(ETH_P_IP);
2798 		skb->dev	= dev;
2799 		local_bh_disable();
2800 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2801 		local_bh_enable();
2802 
2803 		rt = skb->rtable;
2804 		if (err == 0 && rt->u.dst.error)
2805 			err = -rt->u.dst.error;
2806 	} else {
2807 		struct flowi fl = {
2808 			.nl_u = {
2809 				.ip4_u = {
2810 					.daddr = dst,
2811 					.saddr = src,
2812 					.tos = rtm->rtm_tos,
2813 				},
2814 			},
2815 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2816 		};
2817 		err = ip_route_output_key(net, &rt, &fl);
2818 	}
2819 
2820 	if (err)
2821 		goto errout_free;
2822 
2823 	skb->rtable = rt;
2824 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2825 		rt->rt_flags |= RTCF_NOTIFY;
2826 
2827 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2828 			   RTM_NEWROUTE, 0, 0);
2829 	if (err <= 0)
2830 		goto errout_free;
2831 
2832 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2833 errout:
2834 	return err;
2835 
2836 errout_free:
2837 	kfree_skb(skb);
2838 	goto errout;
2839 }
2840 
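/*
 * Netlink dump callback: walk the route cache hash table and emit one
 * RTM_NEWROUTE message per entry belonging to this namespace, resuming
 * from the position saved in cb->args[].
 */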
2841 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2842 {
2843 	struct rtable *rt;
2844 	int h, s_h;
2845 	int idx, s_idx;
2846 	struct net *net;
2847 
2848 	net = sock_net(skb->sk);
2849 
2850 	s_h = cb->args[0];
2851 	if (s_h < 0)
2852 		s_h = 0;
2853 	s_idx = idx = cb->args[1];
2854 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2855 		if (!rt_hash_table[h].chain)
2856 			continue;
2857 		rcu_read_lock_bh();
2858 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2859 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2860 			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2861 				continue;
2862 			if (rt_is_expired(rt))
2863 				continue;
2864 			skb->dst = dst_clone(&rt->u.dst);
2865 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2866 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2867 					 1, NLM_F_MULTI) <= 0) {
2868 				dst_release(xchg(&skb->dst, NULL));
2869 				rcu_read_unlock_bh();
2870 				goto done;
2871 			}
2872 			dst_release(xchg(&skb->dst, NULL));
2873 		}
2874 		rcu_read_unlock_bh();
2875 	}
2876 
2877 done:
2878 	cb->args[0] = h;
2879 	cb->args[1] = idx;
2880 	return skb->len;
2881 }
2882 
2883 void ip_rt_multicast_event(struct in_device *in_dev)
2884 {
2885 	rt_cache_flush(dev_net(in_dev->dev), 0);
2886 }
2887 
2888 #ifdef CONFIG_SYSCTL
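/*
 * proc handler for the per-namespace "flush" sysctl: writing a value
 * triggers a flush of that namespace's route cache (the value is passed
 * to rt_cache_flush() as the delay).
 */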
2889 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2890 					struct file *filp, void __user *buffer,
2891 					size_t *lenp, loff_t *ppos)
2892 {
2893 	if (write) {
2894 		int flush_delay;
2895 		ctl_table ctl;
2896 		struct net *net;
2897 
2898 		memcpy(&ctl, __ctl, sizeof(ctl));
2899 		ctl.data = &flush_delay;
2900 		proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
2901 
2902 		net = (struct net *)__ctl->extra1;
2903 		rt_cache_flush(net, flush_delay);
2904 		return 0;
2905 	}
2906 
2907 	return -EINVAL;
2908 }
2909 
2910 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2911 						int __user *name,
2912 						int nlen,
2913 						void __user *oldval,
2914 						size_t __user *oldlenp,
2915 						void __user *newval,
2916 						size_t newlen)
2917 {
2918 	int delay;
2919 	struct net *net;
2920 	if (newlen != sizeof(int))
2921 		return -EINVAL;
2922 	if (get_user(delay, (int __user *)newval))
2923 		return -EFAULT;
2924 	net = (struct net *)table->extra1;
2925 	rt_cache_flush(net, delay);
2926 	return 0;
2927 }
2928 
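/*
 * Called when ip_rt_secret_interval changes: for every namespace, delete
 * the pending secret-rebuild timer and, if the new interval is non-zero,
 * re-arm it with its expiry shifted by the difference.
 */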
2929 static void rt_secret_reschedule(int old)
2930 {
2931 	struct net *net;
2932 	int new = ip_rt_secret_interval;
2933 	int diff = new - old;
2934 
2935 	if (!diff)
2936 		return;
2937 
2938 	rtnl_lock();
2939 	for_each_net(net) {
2940 		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
2941 
2942 		if (!new)
2943 			continue;
2944 
2945 		if (deleted) {
2946 			long time = net->ipv4.rt_secret_timer.expires - jiffies;
2947 
2948 			if (time <= 0 || (time += diff) <= 0)
2949 				time = 0;
2950 
2951 			net->ipv4.rt_secret_timer.expires = time;
2952 		} else
2953 			net->ipv4.rt_secret_timer.expires = new;
2954 
2955 		net->ipv4.rt_secret_timer.expires += jiffies;
2956 		add_timer(&net->ipv4.rt_secret_timer);
2957 	}
2958 	rtnl_unlock();
2959 }
2960 
2961 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
2962 					  struct file *filp,
2963 					  void __user *buffer, size_t *lenp,
2964 					  loff_t *ppos)
2965 {
2966 	int old = ip_rt_secret_interval;
2967 	int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
2968 
2969 	rt_secret_reschedule(old);
2970 
2971 	return ret;
2972 }
2973 
2974 static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
2975 						   int __user *name,
2976 						   int nlen,
2977 						   void __user *oldval,
2978 						   size_t __user *oldlenp,
2979 						   void __user *newval,
2980 						   size_t newlen)
2981 {
2982 	int old = ip_rt_secret_interval;
2983 	int ret = sysctl_jiffies(table, name, nlen, oldval, oldlenp, newval,
2984 				 newlen);
2985 
2986 	rt_secret_reschedule(old);
2987 
2988 	return ret;
2989 }
2990 
2991 static ctl_table ipv4_route_table[] = {
2992 	{
2993 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2994 		.procname	= "gc_thresh",
2995 		.data		= &ipv4_dst_ops.gc_thresh,
2996 		.maxlen		= sizeof(int),
2997 		.mode		= 0644,
2998 		.proc_handler	= &proc_dointvec,
2999 	},
3000 	{
3001 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
3002 		.procname	= "max_size",
3003 		.data		= &ip_rt_max_size,
3004 		.maxlen		= sizeof(int),
3005 		.mode		= 0644,
3006 		.proc_handler	= &proc_dointvec,
3007 	},
3008 	{
3009 		/*  Deprecated. Use gc_min_interval_ms */
3010 
3011 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3012 		.procname	= "gc_min_interval",
3013 		.data		= &ip_rt_gc_min_interval,
3014 		.maxlen		= sizeof(int),
3015 		.mode		= 0644,
3016 		.proc_handler	= &proc_dointvec_jiffies,
3017 		.strategy	= &sysctl_jiffies,
3018 	},
3019 	{
3020 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3021 		.procname	= "gc_min_interval_ms",
3022 		.data		= &ip_rt_gc_min_interval,
3023 		.maxlen		= sizeof(int),
3024 		.mode		= 0644,
3025 		.proc_handler	= &proc_dointvec_ms_jiffies,
3026 		.strategy	= &sysctl_ms_jiffies,
3027 	},
3028 	{
3029 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
3030 		.procname	= "gc_timeout",
3031 		.data		= &ip_rt_gc_timeout,
3032 		.maxlen		= sizeof(int),
3033 		.mode		= 0644,
3034 		.proc_handler	= &proc_dointvec_jiffies,
3035 		.strategy	= &sysctl_jiffies,
3036 	},
3037 	{
3038 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
3039 		.procname	= "gc_interval",
3040 		.data		= &ip_rt_gc_interval,
3041 		.maxlen		= sizeof(int),
3042 		.mode		= 0644,
3043 		.proc_handler	= &proc_dointvec_jiffies,
3044 		.strategy	= &sysctl_jiffies,
3045 	},
3046 	{
3047 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
3048 		.procname	= "redirect_load",
3049 		.data		= &ip_rt_redirect_load,
3050 		.maxlen		= sizeof(int),
3051 		.mode		= 0644,
3052 		.proc_handler	= &proc_dointvec,
3053 	},
3054 	{
3055 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
3056 		.procname	= "redirect_number",
3057 		.data		= &ip_rt_redirect_number,
3058 		.maxlen		= sizeof(int),
3059 		.mode		= 0644,
3060 		.proc_handler	= &proc_dointvec,
3061 	},
3062 	{
3063 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
3064 		.procname	= "redirect_silence",
3065 		.data		= &ip_rt_redirect_silence,
3066 		.maxlen		= sizeof(int),
3067 		.mode		= 0644,
3068 		.proc_handler	= &proc_dointvec,
3069 	},
3070 	{
3071 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
3072 		.procname	= "error_cost",
3073 		.data		= &ip_rt_error_cost,
3074 		.maxlen		= sizeof(int),
3075 		.mode		= 0644,
3076 		.proc_handler	= &proc_dointvec,
3077 	},
3078 	{
3079 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
3080 		.procname	= "error_burst",
3081 		.data		= &ip_rt_error_burst,
3082 		.maxlen		= sizeof(int),
3083 		.mode		= 0644,
3084 		.proc_handler	= &proc_dointvec,
3085 	},
3086 	{
3087 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
3088 		.procname	= "gc_elasticity",
3089 		.data		= &ip_rt_gc_elasticity,
3090 		.maxlen		= sizeof(int),
3091 		.mode		= 0644,
3092 		.proc_handler	= &proc_dointvec,
3093 	},
3094 	{
3095 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
3096 		.procname	= "mtu_expires",
3097 		.data		= &ip_rt_mtu_expires,
3098 		.maxlen		= sizeof(int),
3099 		.mode		= 0644,
3100 		.proc_handler	= &proc_dointvec_jiffies,
3101 		.strategy	= &sysctl_jiffies,
3102 	},
3103 	{
3104 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
3105 		.procname	= "min_pmtu",
3106 		.data		= &ip_rt_min_pmtu,
3107 		.maxlen		= sizeof(int),
3108 		.mode		= 0644,
3109 		.proc_handler	= &proc_dointvec,
3110 	},
3111 	{
3112 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
3113 		.procname	= "min_adv_mss",
3114 		.data		= &ip_rt_min_advmss,
3115 		.maxlen		= sizeof(int),
3116 		.mode		= 0644,
3117 		.proc_handler	= &proc_dointvec,
3118 	},
3119 	{
3120 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3121 		.procname	= "secret_interval",
3122 		.data		= &ip_rt_secret_interval,
3123 		.maxlen		= sizeof(int),
3124 		.mode		= 0644,
3125 		.proc_handler	= &ipv4_sysctl_rt_secret_interval,
3126 		.strategy	= &ipv4_sysctl_rt_secret_interval_strategy,
3127 	},
3128 	{ .ctl_name = 0 }
3129 };
3130 
3131 static struct ctl_table empty[1];
3132 
3133 static struct ctl_table ipv4_skeleton[] =
3134 {
3135 	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE,
3136 	  .mode = 0555, .child = ipv4_route_table},
3137 	{ .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
3138 	  .mode = 0555, .child = empty},
3139 	{ }
3140 };
3141 
3142 static __net_initdata struct ctl_path ipv4_path[] = {
3143 	{ .procname = "net", .ctl_name = CTL_NET, },
3144 	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
3145 	{ },
3146 };
3147 
3148 static struct ctl_table ipv4_route_flush_table[] = {
3149 	{
3150 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
3151 		.procname	= "flush",
3152 		.maxlen		= sizeof(int),
3153 		.mode		= 0200,
3154 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
3155 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
3156 	},
3157 	{ .ctl_name = 0 },
3158 };
3159 
3160 static __net_initdata struct ctl_path ipv4_route_path[] = {
3161 	{ .procname = "net", .ctl_name = CTL_NET, },
3162 	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
3163 	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3164 	{ },
3165 };
3166 
3167 static __net_init int sysctl_route_net_init(struct net *net)
3168 {
3169 	struct ctl_table *tbl;
3170 
3171 	tbl = ipv4_route_flush_table;
3172 	if (net != &init_net) {
3173 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3174 		if (tbl == NULL)
3175 			goto err_dup;
3176 	}
3177 	tbl[0].extra1 = net;
3178 
3179 	net->ipv4.route_hdr =
3180 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3181 	if (net->ipv4.route_hdr == NULL)
3182 		goto err_reg;
3183 	return 0;
3184 
3185 err_reg:
3186 	if (tbl != ipv4_route_flush_table)
3187 		kfree(tbl);
3188 err_dup:
3189 	return -ENOMEM;
3190 }
3191 
3192 static __net_exit void sysctl_route_net_exit(struct net *net)
3193 {
3194 	struct ctl_table *tbl;
3195 
3196 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3197 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3198 	BUG_ON(tbl == ipv4_route_flush_table);
3199 	kfree(tbl);
3200 }
3201 
3202 static __net_initdata struct pernet_operations sysctl_route_ops = {
3203 	.init = sysctl_route_net_init,
3204 	.exit = sysctl_route_net_exit,
3205 };
3206 #endif
3207 
3208 
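/*
 * Per-namespace init: seed rt_genid from num_physpages and jiffies, set up
 * the deferrable secret-rebuild timer and arm it if ip_rt_secret_interval
 * is non-zero.
 */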
3209 static __net_init int rt_secret_timer_init(struct net *net)
3210 {
3211 	atomic_set(&net->ipv4.rt_genid,
3212 			(int) ((num_physpages ^ (num_physpages>>8)) ^
3213 			(jiffies ^ (jiffies >> 7))));
3214 
3215 	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3216 	net->ipv4.rt_secret_timer.data = (unsigned long)net;
3217 	init_timer_deferrable(&net->ipv4.rt_secret_timer);
3218 
3219 	if (ip_rt_secret_interval) {
3220 		net->ipv4.rt_secret_timer.expires =
3221 			jiffies + net_random() % ip_rt_secret_interval +
3222 			ip_rt_secret_interval;
3223 		add_timer(&net->ipv4.rt_secret_timer);
3224 	}
3225 	return 0;
3226 }
3227 
3228 static __net_exit void rt_secret_timer_exit(struct net *net)
3229 {
3230 	del_timer_sync(&net->ipv4.rt_secret_timer);
3231 }
3232 
3233 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3234 	.init = rt_secret_timer_init,
3235 	.exit = rt_secret_timer_exit,
3236 };
3237 
3238 
3239 #ifdef CONFIG_NET_CLS_ROUTE
3240 struct ip_rt_acct *ip_rt_acct __read_mostly;
3241 #endif /* CONFIG_NET_CLS_ROUTE */
3242 
3243 static __initdata unsigned long rhash_entries;
3244 static int __init set_rhash_entries(char *str)
3245 {
3246 	if (!str)
3247 		return 0;
3248 	rhash_entries = simple_strtoul(str, &str, 0);
3249 	return 1;
3250 }
3251 __setup("rhash_entries=", set_rhash_entries);
3252 
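/*
 * Module init: allocate the dst slab cache and the route cache hash table,
 * derive gc_thresh and ip_rt_max_size from the table size, initialize
 * devinet/fib, start the periodic expiry work, and register the pernet,
 * proc, xfrm and rtnetlink hooks.
 */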
3253 int __init ip_rt_init(void)
3254 {
3255 	int rc = 0;
3256 
3257 #ifdef CONFIG_NET_CLS_ROUTE
3258 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3259 	if (!ip_rt_acct)
3260 		panic("IP: failed to allocate ip_rt_acct\n");
3261 #endif
3262 
3263 	ipv4_dst_ops.kmem_cachep =
3264 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3265 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3266 
3267 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3268 
3269 	rt_hash_table = (struct rt_hash_bucket *)
3270 		alloc_large_system_hash("IP route cache",
3271 					sizeof(struct rt_hash_bucket),
3272 					rhash_entries,
3273 					(num_physpages >= 128 * 1024) ?
3274 					15 : 17,
3275 					0,
3276 					&rt_hash_log,
3277 					&rt_hash_mask,
3278 					0);
3279 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3280 	rt_hash_lock_init();
3281 
3282 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3283 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3284 
3285 	devinet_init();
3286 	ip_fib_init();
3287 
3288 	/* All the timers started at system startup tend
3289 	   to synchronize. Perturb them a bit.
3290 	 */
3291 	schedule_delayed_work(&expires_work,
3292 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3293 
3294 	if (register_pernet_subsys(&rt_secret_timer_ops))
3295 		printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3296 
3297 	if (ip_rt_proc_init())
3298 		printk(KERN_ERR "Unable to create route proc files\n");
3299 #ifdef CONFIG_XFRM
3300 	xfrm_init();
3301 	xfrm4_init();
3302 #endif
3303 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3304 
3305 #ifdef CONFIG_SYSCTL
3306 	register_pernet_subsys(&sysctl_route_ops);
3307 #endif
3308 	return rc;
3309 }
3310 
3311 #ifdef CONFIG_SYSCTL
3312 /*
3313  * We really need to sanitize the damn ipv4 init order, then all
3314  * this nonsense will go away.
3315  */
3316 void __init ip_static_sysctl_init(void)
3317 {
3318 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3319 }
3320 #endif
3321 
3322 EXPORT_SYMBOL(__ip_select_ident);
3323 EXPORT_SYMBOL(ip_route_input);
3324 EXPORT_SYMBOL(ip_route_output_key);
3325