xref: /linux/net/ipv4/route.c (revision f7511d5f66f01fc451747b24e79f3ada7a3af9af)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *		Alan Cox	:	Verify area fixes.
18  *		Alan Cox	:	cli() protects routing changes
19  *		Rui Oliveira	:	ICMP routing table updates
20  *		(rco@di.uminho.pt)	Routing table insertion and update
21  *		Linus Torvalds	:	Rewrote bits to be sensible
22  *		Alan Cox	:	Added BSD route gw semantics
23  *		Alan Cox	:	Super /proc >4K
24  *		Alan Cox	:	MTU in route table
25  *		Alan Cox	: 	MSS actually. Also added the window
26  *					clamper.
27  *		Sam Lantinga	:	Fixed route matching in rt_del()
28  *		Alan Cox	:	Routing cache support.
29  *		Alan Cox	:	Removed compatibility cruft.
30  *		Alan Cox	:	RTF_REJECT support.
31  *		Alan Cox	:	TCP irtt support.
32  *		Jonathan Naylor	:	Added Metric support.
33  *	Miquel van Smoorenburg	:	BSD API fixes.
34  *	Miquel van Smoorenburg	:	Metrics.
35  *		Alan Cox	:	Use __u32 properly
36  *		Alan Cox	:	Aligned routing errors more closely with BSD
37  *					our system is still very different.
38  *		Alan Cox	:	Faster /proc handling
39  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40  *					routing caches and better behaviour.
41  *
42  *		Olaf Erb	:	irtt wasn't being copied right.
43  *		Bjorn Ekwall	:	Kerneld route support.
44  *		Alan Cox	:	Multicast fixed (I hope)
45  * 		Pavel Krauz	:	Limited broadcast fixed
46  *		Mike McLagan	:	Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
48  *					route.c and rewritten from scratch.
49  *		Andi Kleen	:	Load-limit warning messages.
50  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54  *		Marc Boucher	:	routing by fwmark
55  *	Robert Olsson		:	Added rt_cache statistics
56  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
57  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
58  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
59  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
60  *
61  *		This program is free software; you can redistribute it and/or
62  *		modify it under the terms of the GNU General Public License
63  *		as published by the Free Software Foundation; either version
64  *		2 of the License, or (at your option) any later version.
65  */
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 
113 #define RT_FL_TOS(oldflp) \
114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115 
116 #define IP_MAX_MTU	0xFFF0
117 
118 #define RT_GC_TIMEOUT (300*HZ)
119 
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
123 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
124 static int ip_rt_redirect_number __read_mostly	= 9;
125 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly	= HZ;
128 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
129 static int ip_rt_gc_elasticity __read_mostly	= 8;
130 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly	= 256;
133 static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
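/*
 * Presumably these are the default values for the tunables exported under
 * /proc/sys/net/ipv4/route/ (gc_timeout, gc_interval, redirect_*, error_*,
 * min_pmtu, min_adv_mss, secret_interval, ...); all time values are in
 * jiffies, i.e. multiples of HZ.
 */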
134 
135 static void rt_worker_func(struct work_struct *work);
136 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
137 static struct timer_list rt_secret_timer;
138 
139 /*
140  *	Interface to generic destination cache.
141  */
142 
143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144 static void		 ipv4_dst_destroy(struct dst_entry *dst);
145 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
146 					 struct net_device *dev, int how);
147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148 static void		 ipv4_link_failure(struct sk_buff *skb);
149 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
150 static int rt_garbage_collect(struct dst_ops *ops);
151 
152 
153 static struct dst_ops ipv4_dst_ops = {
154 	.family =		AF_INET,
155 	.protocol =		__constant_htons(ETH_P_IP),
156 	.gc =			rt_garbage_collect,
157 	.check =		ipv4_dst_check,
158 	.destroy =		ipv4_dst_destroy,
159 	.ifdown =		ipv4_dst_ifdown,
160 	.negative_advice =	ipv4_negative_advice,
161 	.link_failure =		ipv4_link_failure,
162 	.update_pmtu =		ip_rt_update_pmtu,
163 	.local_out =		ip_local_out,
164 	.entry_size =		sizeof(struct rtable),
165 	.entries =		ATOMIC_INIT(0),
166 };
167 
168 #define ECN_OR_COST(class)	TC_PRIO_##class
169 
170 const __u8 ip_tos2prio[16] = {
171 	TC_PRIO_BESTEFFORT,
172 	ECN_OR_COST(FILLER),
173 	TC_PRIO_BESTEFFORT,
174 	ECN_OR_COST(BESTEFFORT),
175 	TC_PRIO_BULK,
176 	ECN_OR_COST(BULK),
177 	TC_PRIO_BULK,
178 	ECN_OR_COST(BULK),
179 	TC_PRIO_INTERACTIVE,
180 	ECN_OR_COST(INTERACTIVE),
181 	TC_PRIO_INTERACTIVE,
182 	ECN_OR_COST(INTERACTIVE),
183 	TC_PRIO_INTERACTIVE_BULK,
184 	ECN_OR_COST(INTERACTIVE_BULK),
185 	TC_PRIO_INTERACTIVE_BULK,
186 	ECN_OR_COST(INTERACTIVE_BULK)
187 };
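/*
 * The table above maps the four IPv4 TOS bits to a traffic-control band; it
 * is presumably consumed through rt_tos2priority() (include/net/route.h),
 * which indexes it with IPTOS_TOS(tos) >> 1.  For example, a TOS of 0x10
 * (IPTOS_LOWDELAY) gives index 8, i.e. TC_PRIO_INTERACTIVE.
 */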
188 
189 
190 /*
191  * Route cache.
192  */
193 
194 /* The locking scheme is rather straightforward:
195  *
196  * 1) Read-Copy Update protects the buckets of the central route hash.
197  * 2) Only writers remove entries, and they hold the lock
198  *    as they look at rtable reference counts.
199  * 3) Only readers acquire references to rtable entries,
200  *    they do so with atomic increments and with the
201  *    lock held.
202  */
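/*
 * A minimal sketch of that scheme, mirroring the helpers defined below:
 * readers walk a chain under rcu_read_lock_bh() with rcu_dereference(),
 * while writers take the per-bucket spinlock before unlinking entries.
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.dst.rt_next))
 *		inspect(rth);				(read only)
 *	rcu_read_unlock_bh();
 *
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	unlink the entry, then rt_free() it		(RCU-deferred free)
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 *
 * "inspect" is a placeholder; see rt_cache_get_first() and rt_do_flush()
 * below for the real read and write sides.
 */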
203 
204 struct rt_hash_bucket {
205 	struct rtable	*chain;
206 };
207 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
208 	defined(CONFIG_PROVE_LOCKING)
209 /*
210  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
211  * The size of this table is a power of two and depends on the number of CPUs.
212  * (On lockdep builds spinlock_t is quite big, so keep the size down there.)
213  */
214 #ifdef CONFIG_LOCKDEP
215 # define RT_HASH_LOCK_SZ	256
216 #else
217 # if NR_CPUS >= 32
218 #  define RT_HASH_LOCK_SZ	4096
219 # elif NR_CPUS >= 16
220 #  define RT_HASH_LOCK_SZ	2048
221 # elif NR_CPUS >= 8
222 #  define RT_HASH_LOCK_SZ	1024
223 # elif NR_CPUS >= 4
224 #  define RT_HASH_LOCK_SZ	512
225 # else
226 #  define RT_HASH_LOCK_SZ	256
227 # endif
228 #endif
229 
230 static spinlock_t	*rt_hash_locks;
231 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
232 
233 static __init void rt_hash_lock_init(void)
234 {
235 	int i;
236 
237 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
238 			GFP_KERNEL);
239 	if (!rt_hash_locks)
240 		panic("IP: failed to allocate rt_hash_locks\n");
241 
242 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
243 		spin_lock_init(&rt_hash_locks[i]);
244 }
245 #else
246 # define rt_hash_lock_addr(slot) NULL
247 
248 static inline void rt_hash_lock_init(void)
249 {
250 }
251 #endif
252 
253 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
254 static unsigned			rt_hash_mask __read_mostly;
255 static unsigned int		rt_hash_log  __read_mostly;
256 static atomic_t			rt_genid __read_mostly;
257 
258 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
259 #define RT_CACHE_STAT_INC(field) \
260 	(__raw_get_cpu_var(rt_cache_stat).field++)
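/* The per-CPU counters bumped by RT_CACHE_STAT_INC() are reported, one row
 * per possible CPU, by rt_cpu_seq_show() below (/proc/net/stat/rt_cache).
 */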
261 
262 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
263 {
264 	return jhash_3words((__force u32)(__be32)(daddr),
265 			    (__force u32)(__be32)(saddr),
266 			    idx, atomic_read(&rt_genid))
267 		& rt_hash_mask;
268 }
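/*
 * Note that the current rt_genid is folded into the hash as the jhash
 * initval: once rt_cache_invalidate() bumps rt_genid, identical flows hash
 * to different buckets, and entries created under an old genid are spotted
 * (and freed) by the rth->rt_genid checks in rt_check_expire() and
 * rt_intern_hash() below.
 */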
269 
270 #ifdef CONFIG_PROC_FS
271 struct rt_cache_iter_state {
272 	struct seq_net_private p;
273 	int bucket;
274 	int genid;
275 };
276 
277 static struct rtable *rt_cache_get_first(struct seq_file *seq)
278 {
279 	struct rt_cache_iter_state *st = seq->private;
280 	struct rtable *r = NULL;
281 
282 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
283 		rcu_read_lock_bh();
284 		r = rcu_dereference(rt_hash_table[st->bucket].chain);
285 		while (r) {
286 			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
287 			    r->rt_genid == st->genid)
288 				return r;
289 			r = rcu_dereference(r->u.dst.rt_next);
290 		}
291 		rcu_read_unlock_bh();
292 	}
293 	return r;
294 }
295 
296 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
297 					  struct rtable *r)
298 {
299 	struct rt_cache_iter_state *st = seq->private;
300 	r = r->u.dst.rt_next;
301 	while (!r) {
302 		rcu_read_unlock_bh();
303 		if (--st->bucket < 0)
304 			break;
305 		rcu_read_lock_bh();
306 		r = rt_hash_table[st->bucket].chain;
307 	}
308 	return rcu_dereference(r);
309 }
310 
311 static struct rtable *rt_cache_get_next(struct seq_file *seq,
312 					struct rtable *r)
313 {
314 	struct rt_cache_iter_state *st = seq->private;
315 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
316 		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
317 			continue;
318 		if (r->rt_genid == st->genid)
319 			break;
320 	}
321 	return r;
322 }
323 
324 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
325 {
326 	struct rtable *r = rt_cache_get_first(seq);
327 
328 	if (r)
329 		while (pos && (r = rt_cache_get_next(seq, r)))
330 			--pos;
331 	return pos ? NULL : r;
332 }
333 
334 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
335 {
336 	struct rt_cache_iter_state *st = seq->private;
337 	if (*pos)
338 		return rt_cache_get_idx(seq, *pos - 1);
339 	st->genid = atomic_read(&rt_genid);
340 	return SEQ_START_TOKEN;
341 }
342 
343 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
344 {
345 	struct rtable *r;
346 
347 	if (v == SEQ_START_TOKEN)
348 		r = rt_cache_get_first(seq);
349 	else
350 		r = rt_cache_get_next(seq, v);
351 	++*pos;
352 	return r;
353 }
354 
355 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
356 {
357 	if (v && v != SEQ_START_TOKEN)
358 		rcu_read_unlock_bh();
359 }
360 
361 static int rt_cache_seq_show(struct seq_file *seq, void *v)
362 {
363 	if (v == SEQ_START_TOKEN)
364 		seq_printf(seq, "%-127s\n",
365 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
366 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
367 			   "HHUptod\tSpecDst");
368 	else {
369 		struct rtable *r = v;
370 		int len;
371 
372 		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
373 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
374 			r->u.dst.dev ? r->u.dst.dev->name : "*",
375 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
376 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
377 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
378 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
379 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
380 			dst_metric(&r->u.dst, RTAX_WINDOW),
381 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
382 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
383 			r->fl.fl4_tos,
384 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
385 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
386 				       dev_queue_xmit) : 0,
387 			r->rt_spec_dst, &len);
388 
389 		seq_printf(seq, "%*s\n", 127 - len, "");
390 	}
391 	return 0;
392 }
393 
394 static const struct seq_operations rt_cache_seq_ops = {
395 	.start  = rt_cache_seq_start,
396 	.next   = rt_cache_seq_next,
397 	.stop   = rt_cache_seq_stop,
398 	.show   = rt_cache_seq_show,
399 };
400 
401 static int rt_cache_seq_open(struct inode *inode, struct file *file)
402 {
403 	return seq_open_net(inode, file, &rt_cache_seq_ops,
404 			sizeof(struct rt_cache_iter_state));
405 }
406 
407 static const struct file_operations rt_cache_seq_fops = {
408 	.owner	 = THIS_MODULE,
409 	.open	 = rt_cache_seq_open,
410 	.read	 = seq_read,
411 	.llseek	 = seq_lseek,
412 	.release = seq_release_net,
413 };
414 
415 
416 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
417 {
418 	int cpu;
419 
420 	if (*pos == 0)
421 		return SEQ_START_TOKEN;
422 
423 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
424 		if (!cpu_possible(cpu))
425 			continue;
426 		*pos = cpu+1;
427 		return &per_cpu(rt_cache_stat, cpu);
428 	}
429 	return NULL;
430 }
431 
432 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
433 {
434 	int cpu;
435 
436 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
437 		if (!cpu_possible(cpu))
438 			continue;
439 		*pos = cpu+1;
440 		return &per_cpu(rt_cache_stat, cpu);
441 	}
442 	return NULL;
443 
444 }
445 
446 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
447 {
448 
449 }
450 
451 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
452 {
453 	struct rt_cache_stat *st = v;
454 
455 	if (v == SEQ_START_TOKEN) {
456 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
457 		return 0;
458 	}
459 
460 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
461 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
462 		   atomic_read(&ipv4_dst_ops.entries),
463 		   st->in_hit,
464 		   st->in_slow_tot,
465 		   st->in_slow_mc,
466 		   st->in_no_route,
467 		   st->in_brd,
468 		   st->in_martian_dst,
469 		   st->in_martian_src,
470 
471 		   st->out_hit,
472 		   st->out_slow_tot,
473 		   st->out_slow_mc,
474 
475 		   st->gc_total,
476 		   st->gc_ignored,
477 		   st->gc_goal_miss,
478 		   st->gc_dst_overflow,
479 		   st->in_hlist_search,
480 		   st->out_hlist_search
481 		);
482 	return 0;
483 }
484 
485 static const struct seq_operations rt_cpu_seq_ops = {
486 	.start  = rt_cpu_seq_start,
487 	.next   = rt_cpu_seq_next,
488 	.stop   = rt_cpu_seq_stop,
489 	.show   = rt_cpu_seq_show,
490 };
491 
492 
493 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
494 {
495 	return seq_open(file, &rt_cpu_seq_ops);
496 }
497 
498 static const struct file_operations rt_cpu_seq_fops = {
499 	.owner	 = THIS_MODULE,
500 	.open	 = rt_cpu_seq_open,
501 	.read	 = seq_read,
502 	.llseek	 = seq_lseek,
503 	.release = seq_release,
504 };
505 
506 #ifdef CONFIG_NET_CLS_ROUTE
507 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
508 			   int length, int *eof, void *data)
509 {
510 	unsigned int i;
511 
512 	if ((offset & 3) || (length & 3))
513 		return -EIO;
514 
515 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
516 		*eof = 1;
517 		return 0;
518 	}
519 
520 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
521 		length = sizeof(struct ip_rt_acct) * 256 - offset;
522 		*eof = 1;
523 	}
524 
525 	offset /= sizeof(u32);
526 
527 	if (length > 0) {
528 		u32 *dst = (u32 *) buffer;
529 
530 		*start = buffer;
531 		memset(dst, 0, length);
532 
533 		for_each_possible_cpu(i) {
534 			unsigned int j;
535 			u32 *src;
536 
537 			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
538 			for (j = 0; j < length/4; j++)
539 				dst[j] += src[j];
540 		}
541 	}
542 	return length;
543 }
544 #endif
545 
546 static int __net_init ip_rt_do_proc_init(struct net *net)
547 {
548 	struct proc_dir_entry *pde;
549 
550 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
551 			&rt_cache_seq_fops);
552 	if (!pde)
553 		goto err1;
554 
555 	pde = proc_create("rt_cache", S_IRUGO,
556 			  net->proc_net_stat, &rt_cpu_seq_fops);
557 	if (!pde)
558 		goto err2;
559 
560 #ifdef CONFIG_NET_CLS_ROUTE
561 	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
562 			ip_rt_acct_read, NULL);
563 	if (!pde)
564 		goto err3;
565 #endif
566 	return 0;
567 
568 #ifdef CONFIG_NET_CLS_ROUTE
569 err3:
570 	remove_proc_entry("rt_cache", net->proc_net_stat);
571 #endif
572 err2:
573 	remove_proc_entry("rt_cache", net->proc_net);
574 err1:
575 	return -ENOMEM;
576 }
577 
578 static void __net_exit ip_rt_do_proc_exit(struct net *net)
579 {
580 	remove_proc_entry("rt_cache", net->proc_net_stat);
581 	remove_proc_entry("rt_cache", net->proc_net);
582 	remove_proc_entry("rt_acct", net->proc_net);
583 }
584 
585 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
586 	.init = ip_rt_do_proc_init,
587 	.exit = ip_rt_do_proc_exit,
588 };
589 
590 static int __init ip_rt_proc_init(void)
591 {
592 	return register_pernet_subsys(&ip_rt_proc_ops);
593 }
594 
595 #else
596 static inline int ip_rt_proc_init(void)
597 {
598 	return 0;
599 }
600 #endif /* CONFIG_PROC_FS */
601 
602 static inline void rt_free(struct rtable *rt)
603 {
604 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
605 }
606 
607 static inline void rt_drop(struct rtable *rt)
608 {
609 	ip_rt_put(rt);
610 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
611 }
612 
613 static inline int rt_fast_clean(struct rtable *rth)
614 {
615 	/* Kill broadcast/multicast entries very aggressively if they
616 	   collide in the hash table with more useful entries */
617 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
618 		rth->fl.iif && rth->u.dst.rt_next;
619 }
620 
621 static inline int rt_valuable(struct rtable *rth)
622 {
623 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
624 		rth->u.dst.expires;
625 }
626 
627 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
628 {
629 	unsigned long age;
630 	int ret = 0;
631 
632 	if (atomic_read(&rth->u.dst.__refcnt))
633 		goto out;
634 
635 	ret = 1;
636 	if (rth->u.dst.expires &&
637 	    time_after_eq(jiffies, rth->u.dst.expires))
638 		goto out;
639 
640 	age = jiffies - rth->u.dst.lastuse;
641 	ret = 0;
642 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
643 	    (age <= tmo2 && rt_valuable(rth)))
644 		goto out;
645 	ret = 1;
646 out:	return ret;
647 }
648 
649 /* Bits of score are:
650  * 31: very valuable
651  * 30: not quite useless
652  * 29..0: usage counter
653  */
654 static inline u32 rt_score(struct rtable *rt)
655 {
656 	u32 score = jiffies - rt->u.dst.lastuse;
657 
658 	score = ~score & ~(3<<30);
659 
660 	if (rt_valuable(rt))
661 		score |= (1<<31);
662 
663 	if (!rt->fl.iif ||
664 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
665 		score |= (1<<30);
666 
667 	return score;
668 }
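/*
 * Worked example: an output unicast route used one jiffy ago gets bit 30
 * plus a large usage value, an idle input broadcast route gets neither, and
 * a redirected or expiring entry gets bit 31 via rt_valuable(); when a chain
 * grows too long, rt_intern_hash() below evicts the unreferenced entry with
 * the lowest such score.
 */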
669 
670 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
671 {
672 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
673 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
674 		(fl1->mark ^ fl2->mark) |
675 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
676 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
677 		(fl1->oif ^ fl2->oif) |
678 		(fl1->iif ^ fl2->iif)) == 0;
679 }
680 
681 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
682 {
683 	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
684 }
685 
686 /*
687  * Perform a full scan of the hash table and free all entries.
688  * Can be called by a softirq or a process.
689  * In the latter case, we want to reschedule if necessary.
690  */
691 static void rt_do_flush(int process_context)
692 {
693 	unsigned int i;
694 	struct rtable *rth, *next;
695 
696 	for (i = 0; i <= rt_hash_mask; i++) {
697 		if (process_context && need_resched())
698 			cond_resched();
699 		rth = rt_hash_table[i].chain;
700 		if (!rth)
701 			continue;
702 
703 		spin_lock_bh(rt_hash_lock_addr(i));
704 		rth = rt_hash_table[i].chain;
705 		rt_hash_table[i].chain = NULL;
706 		spin_unlock_bh(rt_hash_lock_addr(i));
707 
708 		for (; rth; rth = next) {
709 			next = rth->u.dst.rt_next;
710 			rt_free(rth);
711 		}
712 	}
713 }
714 
715 static void rt_check_expire(void)
716 {
717 	static unsigned int rover;
718 	unsigned int i = rover, goal;
719 	struct rtable *rth, **rthp;
720 	u64 mult;
721 
722 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
723 	if (ip_rt_gc_timeout > 1)
724 		do_div(mult, ip_rt_gc_timeout);
725 	goal = (unsigned int)mult;
726 	if (goal > rt_hash_mask)
727 		goal = rt_hash_mask + 1;
728 	for (; goal > 0; goal--) {
729 		unsigned long tmo = ip_rt_gc_timeout;
730 
731 		i = (i + 1) & rt_hash_mask;
732 		rthp = &rt_hash_table[i].chain;
733 
734 		if (need_resched())
735 			cond_resched();
736 
737 		if (*rthp == NULL)
738 			continue;
739 		spin_lock_bh(rt_hash_lock_addr(i));
740 		while ((rth = *rthp) != NULL) {
741 			if (rth->rt_genid != atomic_read(&rt_genid)) {
742 				*rthp = rth->u.dst.rt_next;
743 				rt_free(rth);
744 				continue;
745 			}
746 			if (rth->u.dst.expires) {
747 				/* Entry is expired even if it is in use */
748 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
749 					tmo >>= 1;
750 					rthp = &rth->u.dst.rt_next;
751 					continue;
752 				}
753 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
754 				tmo >>= 1;
755 				rthp = &rth->u.dst.rt_next;
756 				continue;
757 			}
758 
759 			/* Cleanup aged off entries. */
760 			*rthp = rth->u.dst.rt_next;
761 			rt_free(rth);
762 		}
763 		spin_unlock_bh(rt_hash_lock_addr(i));
764 	}
765 	rover = i;
766 }
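/*
 * With the default tunables (ip_rt_gc_interval = 60*HZ, ip_rt_gc_timeout =
 * 300*HZ) the goal above works out to (60/300) * 2^rt_hash_log, i.e. one
 * fifth of the hash buckets per run; since rt_worker_func() re-arms itself
 * every ip_rt_gc_interval, the whole table is covered roughly once per
 * gc_timeout.
 */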
767 
768 /*
769  * rt_worker_func() is run in process context.
770  * We call rt_check_expire() to scan part of the hash table.
771  */
772 static void rt_worker_func(struct work_struct *work)
773 {
774 	rt_check_expire();
775 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
776 }
777 
778 /*
779  * Perturbation of rt_genid by a small quantity [1..256].
780  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
781  * many times (2^24) without repeating a recent rt_genid.
782  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
783  */
784 static void rt_cache_invalidate(void)
785 {
786 	unsigned char shuffle;
787 
788 	get_random_bytes(&shuffle, sizeof(shuffle));
789 	atomic_add(shuffle + 1U, &rt_genid);
790 }
791 
792 /*
793  * delay < 0  : invalidate cache (fast : entries will be deleted later)
794  * delay >= 0 : invalidate & flush cache (can be long)
795  */
796 void rt_cache_flush(int delay)
797 {
798 	rt_cache_invalidate();
799 	if (delay >= 0)
800 		rt_do_flush(!in_softirq());
801 }
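/*
 * Callers pass a negative delay for the cheap variant (only rt_genid is
 * bumped and entries are reaped lazily) and a delay >= 0 to also walk every
 * bucket synchronously via rt_do_flush(); the process/softirq argument only
 * controls whether rt_do_flush() may cond_resched().
 */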
802 
803 /*
804  * We change rt_genid and let gc do the cleanup
805  */
806 static void rt_secret_rebuild(unsigned long dummy)
807 {
808 	rt_cache_invalidate();
809 	mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
810 }
811 
812 /*
813    Short description of GC goals.
814 
815    We want an algorithm which keeps the routing cache at an
816    equilibrium point, where the number of aged-off entries stays
817    approximately equal to the number of newly generated ones.
818 
819    The current expiration strength is the variable "expire".
820    We try to adjust it dynamically: when the network is idle,
821    expire is large enough to keep plenty of warm entries, and
822    when the load increases it shrinks to limit the cache size.
823  */
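/*
 * Concretely, in rt_garbage_collect() below: every pass that misses its goal
 * halves "expire" (making entries easier to evict), while a successful pass
 * credits it back by ip_rt_gc_min_interval, resetting to ip_rt_gc_timeout
 * once it grows past that or the cache falls under gc_thresh.  This forms a
 * simple feedback loop around the cache size.
 */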
824 
825 static int rt_garbage_collect(struct dst_ops *ops)
826 {
827 	static unsigned long expire = RT_GC_TIMEOUT;
828 	static unsigned long last_gc;
829 	static int rover;
830 	static int equilibrium;
831 	struct rtable *rth, **rthp;
832 	unsigned long now = jiffies;
833 	int goal;
834 
835 	/*
836 	 * Garbage collection is pretty expensive,
837 	 * do not make it too frequently.
838 	 * do not run it too frequently.
839 
840 	RT_CACHE_STAT_INC(gc_total);
841 
842 	if (now - last_gc < ip_rt_gc_min_interval &&
843 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
844 		RT_CACHE_STAT_INC(gc_ignored);
845 		goto out;
846 	}
847 
848 	/* Calculate the number of entries which we want to expire now. */
849 	goal = atomic_read(&ipv4_dst_ops.entries) -
850 		(ip_rt_gc_elasticity << rt_hash_log);
851 	if (goal <= 0) {
852 		if (equilibrium < ipv4_dst_ops.gc_thresh)
853 			equilibrium = ipv4_dst_ops.gc_thresh;
854 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
855 		if (goal > 0) {
856 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
857 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
858 		}
859 	} else {
860 		/* We are in a dangerous area. Try to reduce the cache really
861 		 * aggressively.
862 		 */
863 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
864 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
865 	}
866 
867 	if (now - last_gc >= ip_rt_gc_min_interval)
868 		last_gc = now;
869 
870 	if (goal <= 0) {
871 		equilibrium += goal;
872 		goto work_done;
873 	}
874 
875 	do {
876 		int i, k;
877 
878 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
879 			unsigned long tmo = expire;
880 
881 			k = (k + 1) & rt_hash_mask;
882 			rthp = &rt_hash_table[k].chain;
883 			spin_lock_bh(rt_hash_lock_addr(k));
884 			while ((rth = *rthp) != NULL) {
885 				if (rth->rt_genid == atomic_read(&rt_genid) &&
886 					!rt_may_expire(rth, tmo, expire)) {
887 					tmo >>= 1;
888 					rthp = &rth->u.dst.rt_next;
889 					continue;
890 				}
891 				*rthp = rth->u.dst.rt_next;
892 				rt_free(rth);
893 				goal--;
894 			}
895 			spin_unlock_bh(rt_hash_lock_addr(k));
896 			if (goal <= 0)
897 				break;
898 		}
899 		rover = k;
900 
901 		if (goal <= 0)
902 			goto work_done;
903 
904 		/* The goal was not achieved. We stop the process if:
905 
906 		   - expire has been reduced to zero (otherwise expire is halved),
907 		   - the table is not full,
908 		   - we are called from interrupt context,
909 		   - the jiffies check trips; it is just a fallback/debug loop breaker,
910 		     we will not spin here for a long time in any case.
911 		 */
912 
913 		RT_CACHE_STAT_INC(gc_goal_miss);
914 
915 		if (expire == 0)
916 			break;
917 
918 		expire >>= 1;
919 #if RT_CACHE_DEBUG >= 2
920 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
921 				atomic_read(&ipv4_dst_ops.entries), goal, i);
922 #endif
923 
924 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
925 			goto out;
926 	} while (!in_softirq() && time_before_eq(jiffies, now));
927 
928 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
929 		goto out;
930 	if (net_ratelimit())
931 		printk(KERN_WARNING "dst cache overflow\n");
932 	RT_CACHE_STAT_INC(gc_dst_overflow);
933 	return 1;
934 
935 work_done:
936 	expire += ip_rt_gc_min_interval;
937 	if (expire > ip_rt_gc_timeout ||
938 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
939 		expire = ip_rt_gc_timeout;
940 #if RT_CACHE_DEBUG >= 2
941 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
942 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
943 #endif
944 out:	return 0;
945 }
946 
947 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
948 {
949 	struct rtable	*rth, **rthp;
950 	unsigned long	now;
951 	struct rtable *cand, **candp;
952 	u32 		min_score;
953 	int		chain_length;
954 	int attempts = !in_softirq();
955 
956 restart:
957 	chain_length = 0;
958 	min_score = ~(u32)0;
959 	cand = NULL;
960 	candp = NULL;
961 	now = jiffies;
962 
963 	rthp = &rt_hash_table[hash].chain;
964 
965 	spin_lock_bh(rt_hash_lock_addr(hash));
966 	while ((rth = *rthp) != NULL) {
967 		if (rth->rt_genid != atomic_read(&rt_genid)) {
968 			*rthp = rth->u.dst.rt_next;
969 			rt_free(rth);
970 			continue;
971 		}
972 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
973 			/* Put it first */
974 			*rthp = rth->u.dst.rt_next;
975 			/*
976 			 * Since lookup is lockfree, the deletion
977 			 * must be visible to another weakly ordered CPU before
978 			 * the insertion at the start of the hash chain.
979 			 */
980 			rcu_assign_pointer(rth->u.dst.rt_next,
981 					   rt_hash_table[hash].chain);
982 			/*
983 			 * Since lookup is lockfree, the update writes
984 			 * must be ordered for consistency on SMP.
985 			 */
986 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
987 
988 			dst_use(&rth->u.dst, now);
989 			spin_unlock_bh(rt_hash_lock_addr(hash));
990 
991 			rt_drop(rt);
992 			*rp = rth;
993 			return 0;
994 		}
995 
996 		if (!atomic_read(&rth->u.dst.__refcnt)) {
997 			u32 score = rt_score(rth);
998 
999 			if (score <= min_score) {
1000 				cand = rth;
1001 				candp = rthp;
1002 				min_score = score;
1003 			}
1004 		}
1005 
1006 		chain_length++;
1007 
1008 		rthp = &rth->u.dst.rt_next;
1009 	}
1010 
1011 	if (cand) {
1012 		/* ip_rt_gc_elasticity used to be the average chain length;
1013 		 * when it is exceeded, gc becomes really aggressive.
1014 		 *
1015 		 * The second limit is less certain. At the moment it allows
1016 		 * only 2 entries per bucket. We will see.
1017 		 */
1018 		if (chain_length > ip_rt_gc_elasticity) {
1019 			*candp = cand->u.dst.rt_next;
1020 			rt_free(cand);
1021 		}
1022 	}
1023 
1024 	/* Try to bind the route to an ARP neighbour only if it is an output
1025 	   route or on the unicast forwarding path.
1026 	 */
1027 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1028 		int err = arp_bind_neighbour(&rt->u.dst);
1029 		if (err) {
1030 			spin_unlock_bh(rt_hash_lock_addr(hash));
1031 
1032 			if (err != -ENOBUFS) {
1033 				rt_drop(rt);
1034 				return err;
1035 			}
1036 
1037 			/* Neighbour tables are full and nothing
1038 			   can be released. Try to shrink the route cache;
1039 			   most likely it holds some neighbour records.
1040 			 */
1041 			if (attempts-- > 0) {
1042 				int saved_elasticity = ip_rt_gc_elasticity;
1043 				int saved_int = ip_rt_gc_min_interval;
1044 				ip_rt_gc_elasticity	= 1;
1045 				ip_rt_gc_min_interval	= 0;
1046 				rt_garbage_collect(&ipv4_dst_ops);
1047 				ip_rt_gc_min_interval	= saved_int;
1048 				ip_rt_gc_elasticity	= saved_elasticity;
1049 				goto restart;
1050 			}
1051 
1052 			if (net_ratelimit())
1053 				printk(KERN_WARNING "Neighbour table overflow.\n");
1054 			rt_drop(rt);
1055 			return -ENOBUFS;
1056 		}
1057 	}
1058 
1059 	rt->u.dst.rt_next = rt_hash_table[hash].chain;
1060 #if RT_CACHE_DEBUG >= 2
1061 	if (rt->u.dst.rt_next) {
1062 		struct rtable *trt;
1063 		printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
1064 		       NIPQUAD(rt->rt_dst));
1065 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1066 			printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
1067 		printk("\n");
1068 	}
1069 #endif
1070 	rt_hash_table[hash].chain = rt;
1071 	spin_unlock_bh(rt_hash_lock_addr(hash));
1072 	*rp = rt;
1073 	return 0;
1074 }
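/*
 * Typical usage, as in ip_route_input_mc() and ip_mkroute_input() below: the
 * caller computes hash = rt_hash(daddr, saddr, iif or oif) for a freshly
 * built rtable; on return *rp is either that entry or an already-cached
 * equivalent, in which case the new one has been dropped.
 */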
1075 
1076 void rt_bind_peer(struct rtable *rt, int create)
1077 {
1078 	static DEFINE_SPINLOCK(rt_peer_lock);
1079 	struct inet_peer *peer;
1080 
1081 	peer = inet_getpeer(rt->rt_dst, create);
1082 
1083 	spin_lock_bh(&rt_peer_lock);
1084 	if (rt->peer == NULL) {
1085 		rt->peer = peer;
1086 		peer = NULL;
1087 	}
1088 	spin_unlock_bh(&rt_peer_lock);
1089 	if (peer)
1090 		inet_putpeer(peer);
1091 }
1092 
1093 /*
1094  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1095  * we can still generate some output.
1096  * Random ID selection looks a bit dangerous because we have no chance of
1097  * selecting an ID that stays unique for a reasonable period of time.
1098  * But a broken packet identifier may be better than no packet at all.
1099  */
1100 static void ip_select_fb_ident(struct iphdr *iph)
1101 {
1102 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1103 	static u32 ip_fallback_id;
1104 	u32 salt;
1105 
1106 	spin_lock_bh(&ip_fb_id_lock);
1107 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1108 	iph->id = htons(salt & 0xFFFF);
1109 	ip_fallback_id = salt;
1110 	spin_unlock_bh(&ip_fb_id_lock);
1111 }
1112 
1113 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1114 {
1115 	struct rtable *rt = (struct rtable *) dst;
1116 
1117 	if (rt) {
1118 		if (rt->peer == NULL)
1119 			rt_bind_peer(rt, 1);
1120 
1121 		/* If a peer is attached to the destination, it is never detached,
1122 		   so we do not need to grab a lock to dereference it.
1123 		 */
1124 		if (rt->peer) {
1125 			iph->id = htons(inet_getid(rt->peer, more));
1126 			return;
1127 		}
1128 	} else
1129 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1130 		       __builtin_return_address(0));
1131 
1132 	ip_select_fb_ident(iph);
1133 }
1134 
1135 static void rt_del(unsigned hash, struct rtable *rt)
1136 {
1137 	struct rtable **rthp, *aux;
1138 
1139 	rthp = &rt_hash_table[hash].chain;
1140 	spin_lock_bh(rt_hash_lock_addr(hash));
1141 	ip_rt_put(rt);
1142 	while ((aux = *rthp) != NULL) {
1143 		if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1144 			*rthp = aux->u.dst.rt_next;
1145 			rt_free(aux);
1146 			continue;
1147 		}
1148 		rthp = &aux->u.dst.rt_next;
1149 	}
1150 	spin_unlock_bh(rt_hash_lock_addr(hash));
1151 }
1152 
1153 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1154 		    __be32 saddr, struct net_device *dev)
1155 {
1156 	int i, k;
1157 	struct in_device *in_dev = in_dev_get(dev);
1158 	struct rtable *rth, **rthp;
1159 	__be32  skeys[2] = { saddr, 0 };
1160 	int  ikeys[2] = { dev->ifindex, 0 };
1161 	struct netevent_redirect netevent;
1162 	struct net *net;
1163 
1164 	if (!in_dev)
1165 		return;
1166 
1167 	net = dev_net(dev);
1168 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1169 	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1170 	    || ipv4_is_zeronet(new_gw))
1171 		goto reject_redirect;
1172 
1173 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1174 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1175 			goto reject_redirect;
1176 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1177 			goto reject_redirect;
1178 	} else {
1179 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1180 			goto reject_redirect;
1181 	}
1182 
1183 	for (i = 0; i < 2; i++) {
1184 		for (k = 0; k < 2; k++) {
1185 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1186 
1187 			rthp=&rt_hash_table[hash].chain;
1188 
1189 			rcu_read_lock();
1190 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1191 				struct rtable *rt;
1192 
1193 				if (rth->fl.fl4_dst != daddr ||
1194 				    rth->fl.fl4_src != skeys[i] ||
1195 				    rth->fl.oif != ikeys[k] ||
1196 				    rth->fl.iif != 0 ||
1197 				    rth->rt_genid != atomic_read(&rt_genid) ||
1198 				    !net_eq(dev_net(rth->u.dst.dev), net)) {
1199 					rthp = &rth->u.dst.rt_next;
1200 					continue;
1201 				}
1202 
1203 				if (rth->rt_dst != daddr ||
1204 				    rth->rt_src != saddr ||
1205 				    rth->u.dst.error ||
1206 				    rth->rt_gateway != old_gw ||
1207 				    rth->u.dst.dev != dev)
1208 					break;
1209 
1210 				dst_hold(&rth->u.dst);
1211 				rcu_read_unlock();
1212 
1213 				rt = dst_alloc(&ipv4_dst_ops);
1214 				if (rt == NULL) {
1215 					ip_rt_put(rth);
1216 					in_dev_put(in_dev);
1217 					return;
1218 				}
1219 
1220 				/* Copy all the information. */
1221 				*rt = *rth;
1222 				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1223 				rt->u.dst.__use		= 1;
1224 				atomic_set(&rt->u.dst.__refcnt, 1);
1225 				rt->u.dst.child		= NULL;
1226 				if (rt->u.dst.dev)
1227 					dev_hold(rt->u.dst.dev);
1228 				if (rt->idev)
1229 					in_dev_hold(rt->idev);
1230 				rt->u.dst.obsolete	= 0;
1231 				rt->u.dst.lastuse	= jiffies;
1232 				rt->u.dst.path		= &rt->u.dst;
1233 				rt->u.dst.neighbour	= NULL;
1234 				rt->u.dst.hh		= NULL;
1235 				rt->u.dst.xfrm		= NULL;
1236 				rt->rt_genid		= atomic_read(&rt_genid);
1237 				rt->rt_flags		|= RTCF_REDIRECTED;
1238 
1239 				/* Gateway is different ... */
1240 				rt->rt_gateway		= new_gw;
1241 
1242 				/* Redirect received -> path was valid */
1243 				dst_confirm(&rth->u.dst);
1244 
1245 				if (rt->peer)
1246 					atomic_inc(&rt->peer->refcnt);
1247 
1248 				if (arp_bind_neighbour(&rt->u.dst) ||
1249 				    !(rt->u.dst.neighbour->nud_state &
1250 					    NUD_VALID)) {
1251 					if (rt->u.dst.neighbour)
1252 						neigh_event_send(rt->u.dst.neighbour, NULL);
1253 					ip_rt_put(rth);
1254 					rt_drop(rt);
1255 					goto do_next;
1256 				}
1257 
1258 				netevent.old = &rth->u.dst;
1259 				netevent.new = &rt->u.dst;
1260 				call_netevent_notifiers(NETEVENT_REDIRECT,
1261 							&netevent);
1262 
1263 				rt_del(hash, rth);
1264 				if (!rt_intern_hash(hash, rt, &rt))
1265 					ip_rt_put(rt);
1266 				goto do_next;
1267 			}
1268 			rcu_read_unlock();
1269 		do_next:
1270 			;
1271 		}
1272 	}
1273 	in_dev_put(in_dev);
1274 	return;
1275 
1276 reject_redirect:
1277 #ifdef CONFIG_IP_ROUTE_VERBOSE
1278 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1279 		printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
1280 			NIPQUAD_FMT " ignored.\n"
1281 			"  Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
1282 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1283 		       NIPQUAD(saddr), NIPQUAD(daddr));
1284 #endif
1285 	in_dev_put(in_dev);
1286 }
1287 
1288 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1289 {
1290 	struct rtable *rt = (struct rtable *)dst;
1291 	struct dst_entry *ret = dst;
1292 
1293 	if (rt) {
1294 		if (dst->obsolete) {
1295 			ip_rt_put(rt);
1296 			ret = NULL;
1297 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1298 			   rt->u.dst.expires) {
1299 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1300 						rt->fl.oif);
1301 #if RT_CACHE_DEBUG >= 1
1302 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1303 					  NIPQUAD_FMT "/%02x dropped\n",
1304 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1305 #endif
1306 			rt_del(hash, rt);
1307 			ret = NULL;
1308 		}
1309 	}
1310 	return ret;
1311 }
1312 
1313 /*
1314  * Algorithm:
1315  *	1. The first ip_rt_redirect_number redirects are sent
1316  *	   with exponential backoff, then we stop sending them at all,
1317  *	   assuming that the host ignores our redirects.
1318  *	2. If we did not see packets requiring redirects
1319  *	   during ip_rt_redirect_silence, we assume that the host
1320  *	   forgot the redirected route and start sending redirects again.
1321  *
1322  * This algorithm is much cheaper and more intelligent than dumb load limiting
1323  * in icmp.c.
1324  *
1325  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1326  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1327  */
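/*
 * With the defaults above (ip_rt_redirect_number = 9, ip_rt_redirect_load =
 * HZ/50), the wait after the k-th redirect grows as (HZ/50) << k jiffies;
 * once 9 redirects have been ignored we stay silent until about 20 seconds
 * (ip_rt_redirect_silence = (HZ/50) << 10) pass without redirect-worthy
 * traffic.
 */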
1328 
1329 void ip_rt_send_redirect(struct sk_buff *skb)
1330 {
1331 	struct rtable *rt = skb->rtable;
1332 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1333 
1334 	if (!in_dev)
1335 		return;
1336 
1337 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1338 		goto out;
1339 
1340 	/* No redirected packets during ip_rt_redirect_silence;
1341 	 * reset the algorithm.
1342 	 */
1343 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1344 		rt->u.dst.rate_tokens = 0;
1345 
1346 	/* Too many ignored redirects; do not send anything.
1347 	 * Set u.dst.rate_last to the time of the last seen redirected packet.
1348 	 */
1349 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1350 		rt->u.dst.rate_last = jiffies;
1351 		goto out;
1352 	}
1353 
1354 	/* Check for load limit; set rate_last to the latest sent
1355 	 * redirect.
1356 	 */
1357 	if (rt->u.dst.rate_tokens == 0 ||
1358 	    time_after(jiffies,
1359 		       (rt->u.dst.rate_last +
1360 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1361 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1362 		rt->u.dst.rate_last = jiffies;
1363 		++rt->u.dst.rate_tokens;
1364 #ifdef CONFIG_IP_ROUTE_VERBOSE
1365 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1366 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1367 		    net_ratelimit())
1368 			printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1369 				"redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
1370 				NIPQUAD(rt->rt_src), rt->rt_iif,
1371 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1372 #endif
1373 	}
1374 out:
1375 	in_dev_put(in_dev);
1376 }
1377 
1378 static int ip_error(struct sk_buff *skb)
1379 {
1380 	struct rtable *rt = skb->rtable;
1381 	unsigned long now;
1382 	int code;
1383 
1384 	switch (rt->u.dst.error) {
1385 		case EINVAL:
1386 		default:
1387 			goto out;
1388 		case EHOSTUNREACH:
1389 			code = ICMP_HOST_UNREACH;
1390 			break;
1391 		case ENETUNREACH:
1392 			code = ICMP_NET_UNREACH;
1393 			IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1394 			break;
1395 		case EACCES:
1396 			code = ICMP_PKT_FILTERED;
1397 			break;
1398 	}
1399 
1400 	now = jiffies;
1401 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1402 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1403 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1404 	rt->u.dst.rate_last = now;
1405 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1406 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1407 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1408 	}
1409 
1410 out:	kfree_skb(skb);
1411 	return 0;
1412 }
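/*
 * The rate limiting above is a small token bucket: rate_tokens earns one
 * token per elapsed jiffy, capped at ip_rt_error_burst (5*HZ), and each
 * ICMP_DEST_UNREACH costs ip_rt_error_cost (HZ) tokens, so a cached route
 * sustains roughly one such ICMP per second with a burst of about five.
 */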
1413 
1414 /*
1415  *	The last two values are not from the RFC but
1416  *	are needed for AMPRnet AX.25 paths.
1417  */
1418 
1419 static const unsigned short mtu_plateau[] =
1420 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1421 
1422 static inline unsigned short guess_mtu(unsigned short old_mtu)
1423 {
1424 	int i;
1425 
1426 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1427 		if (old_mtu > mtu_plateau[i])
1428 			return mtu_plateau[i];
1429 	return 68;
1430 }
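/*
 * Example: a Fragmentation Needed message carrying no MTU (old BSD stacks)
 * for a 1500-byte datagram makes guess_mtu(1500) return 1492, the largest
 * plateau below the old size; an old size of 128 or less falls through to
 * the minimum of 68.
 */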
1431 
1432 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1433 				 unsigned short new_mtu)
1434 {
1435 	int i;
1436 	unsigned short old_mtu = ntohs(iph->tot_len);
1437 	struct rtable *rth;
1438 	__be32  skeys[2] = { iph->saddr, 0, };
1439 	__be32  daddr = iph->daddr;
1440 	unsigned short est_mtu = 0;
1441 
1442 	if (ipv4_config.no_pmtu_disc)
1443 		return 0;
1444 
1445 	for (i = 0; i < 2; i++) {
1446 		unsigned hash = rt_hash(daddr, skeys[i], 0);
1447 
1448 		rcu_read_lock();
1449 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1450 		     rth = rcu_dereference(rth->u.dst.rt_next)) {
1451 			if (rth->fl.fl4_dst == daddr &&
1452 			    rth->fl.fl4_src == skeys[i] &&
1453 			    rth->rt_dst  == daddr &&
1454 			    rth->rt_src  == iph->saddr &&
1455 			    rth->fl.iif == 0 &&
1456 			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
1457 			    net_eq(dev_net(rth->u.dst.dev), net) &&
1458 			    rth->rt_genid == atomic_read(&rt_genid)) {
1459 				unsigned short mtu = new_mtu;
1460 
1461 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1462 
1463 					/* BSD 4.2 compatibility hack :-( */
1464 					if (mtu == 0 &&
1465 					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1466 					    old_mtu >= 68 + (iph->ihl << 2))
1467 						old_mtu -= iph->ihl << 2;
1468 
1469 					mtu = guess_mtu(old_mtu);
1470 				}
1471 				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1472 					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1473 						dst_confirm(&rth->u.dst);
1474 						if (mtu < ip_rt_min_pmtu) {
1475 							mtu = ip_rt_min_pmtu;
1476 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1477 								(1 << RTAX_MTU);
1478 						}
1479 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1480 						dst_set_expires(&rth->u.dst,
1481 							ip_rt_mtu_expires);
1482 					}
1483 					est_mtu = mtu;
1484 				}
1485 			}
1486 		}
1487 		rcu_read_unlock();
1488 	}
1489 	return est_mtu ? : new_mtu;
1490 }
1491 
1492 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1493 {
1494 	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1495 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1496 		if (mtu < ip_rt_min_pmtu) {
1497 			mtu = ip_rt_min_pmtu;
1498 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1499 		}
1500 		dst->metrics[RTAX_MTU-1] = mtu;
1501 		dst_set_expires(dst, ip_rt_mtu_expires);
1502 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1503 	}
1504 }
1505 
1506 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1507 {
1508 	return NULL;
1509 }
1510 
1511 static void ipv4_dst_destroy(struct dst_entry *dst)
1512 {
1513 	struct rtable *rt = (struct rtable *) dst;
1514 	struct inet_peer *peer = rt->peer;
1515 	struct in_device *idev = rt->idev;
1516 
1517 	if (peer) {
1518 		rt->peer = NULL;
1519 		inet_putpeer(peer);
1520 	}
1521 
1522 	if (idev) {
1523 		rt->idev = NULL;
1524 		in_dev_put(idev);
1525 	}
1526 }
1527 
1528 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1529 			    int how)
1530 {
1531 	struct rtable *rt = (struct rtable *) dst;
1532 	struct in_device *idev = rt->idev;
1533 	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1534 		struct in_device *loopback_idev =
1535 			in_dev_get(dev_net(dev)->loopback_dev);
1536 		if (loopback_idev) {
1537 			rt->idev = loopback_idev;
1538 			in_dev_put(idev);
1539 		}
1540 	}
1541 }
1542 
1543 static void ipv4_link_failure(struct sk_buff *skb)
1544 {
1545 	struct rtable *rt;
1546 
1547 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1548 
1549 	rt = skb->rtable;
1550 	if (rt)
1551 		dst_set_expires(&rt->u.dst, 0);
1552 }
1553 
1554 static int ip_rt_bug(struct sk_buff *skb)
1555 {
1556 	printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
1557 		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1558 		skb->dev ? skb->dev->name : "?");
1559 	kfree_skb(skb);
1560 	return 0;
1561 }
1562 
1563 /*
1564    We do not cache the source address of the outgoing interface,
1565    because it is used only by the IP RR, TS and SRR options,
1566    so it is out of the fast path.
1567 
1568    BTW remember: "addr" is allowed to be unaligned
1569    in IP options!
1570  */
1571 
1572 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1573 {
1574 	__be32 src;
1575 	struct fib_result res;
1576 
1577 	if (rt->fl.iif == 0)
1578 		src = rt->rt_src;
1579 	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1580 		src = FIB_RES_PREFSRC(res);
1581 		fib_res_put(&res);
1582 	} else
1583 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1584 					RT_SCOPE_UNIVERSE);
1585 	memcpy(addr, &src, 4);
1586 }
1587 
1588 #ifdef CONFIG_NET_CLS_ROUTE
1589 static void set_class_tag(struct rtable *rt, u32 tag)
1590 {
1591 	if (!(rt->u.dst.tclassid & 0xFFFF))
1592 		rt->u.dst.tclassid |= tag & 0xFFFF;
1593 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1594 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1595 }
1596 #endif
1597 
1598 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1599 {
1600 	struct fib_info *fi = res->fi;
1601 
1602 	if (fi) {
1603 		if (FIB_RES_GW(*res) &&
1604 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1605 			rt->rt_gateway = FIB_RES_GW(*res);
1606 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1607 		       sizeof(rt->u.dst.metrics));
1608 		if (fi->fib_mtu == 0) {
1609 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1610 			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1611 			    rt->rt_gateway != rt->rt_dst &&
1612 			    rt->u.dst.dev->mtu > 576)
1613 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1614 		}
1615 #ifdef CONFIG_NET_CLS_ROUTE
1616 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1617 #endif
1618 	} else
1619 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1620 
1621 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1622 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1623 	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1624 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1625 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1626 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1627 				       ip_rt_min_advmss);
1628 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1629 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1630 
1631 #ifdef CONFIG_NET_CLS_ROUTE
1632 #ifdef CONFIG_IP_MULTIPLE_TABLES
1633 	set_class_tag(rt, fib_rules_tclass(res));
1634 #endif
1635 	set_class_tag(rt, itag);
1636 #endif
1637 	rt->rt_type = res->type;
1638 }
1639 
1640 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1641 				u8 tos, struct net_device *dev, int our)
1642 {
1643 	unsigned hash;
1644 	struct rtable *rth;
1645 	__be32 spec_dst;
1646 	struct in_device *in_dev = in_dev_get(dev);
1647 	u32 itag = 0;
1648 
1649 	/* Primary sanity checks. */
1650 
1651 	if (in_dev == NULL)
1652 		return -EINVAL;
1653 
1654 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1655 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1656 		goto e_inval;
1657 
1658 	if (ipv4_is_zeronet(saddr)) {
1659 		if (!ipv4_is_local_multicast(daddr))
1660 			goto e_inval;
1661 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1662 	} else if (fib_validate_source(saddr, 0, tos, 0,
1663 					dev, &spec_dst, &itag) < 0)
1664 		goto e_inval;
1665 
1666 	rth = dst_alloc(&ipv4_dst_ops);
1667 	if (!rth)
1668 		goto e_nobufs;
1669 
1670 	rth->u.dst.output= ip_rt_bug;
1671 
1672 	atomic_set(&rth->u.dst.__refcnt, 1);
1673 	rth->u.dst.flags= DST_HOST;
1674 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1675 		rth->u.dst.flags |= DST_NOPOLICY;
1676 	rth->fl.fl4_dst	= daddr;
1677 	rth->rt_dst	= daddr;
1678 	rth->fl.fl4_tos	= tos;
1679 	rth->fl.mark    = skb->mark;
1680 	rth->fl.fl4_src	= saddr;
1681 	rth->rt_src	= saddr;
1682 #ifdef CONFIG_NET_CLS_ROUTE
1683 	rth->u.dst.tclassid = itag;
1684 #endif
1685 	rth->rt_iif	=
1686 	rth->fl.iif	= dev->ifindex;
1687 	rth->u.dst.dev	= init_net.loopback_dev;
1688 	dev_hold(rth->u.dst.dev);
1689 	rth->idev	= in_dev_get(rth->u.dst.dev);
1690 	rth->fl.oif	= 0;
1691 	rth->rt_gateway	= daddr;
1692 	rth->rt_spec_dst= spec_dst;
1693 	rth->rt_genid	= atomic_read(&rt_genid);
1694 	rth->rt_flags	= RTCF_MULTICAST;
1695 	rth->rt_type	= RTN_MULTICAST;
1696 	if (our) {
1697 		rth->u.dst.input= ip_local_deliver;
1698 		rth->rt_flags |= RTCF_LOCAL;
1699 	}
1700 
1701 #ifdef CONFIG_IP_MROUTE
1702 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1703 		rth->u.dst.input = ip_mr_input;
1704 #endif
1705 	RT_CACHE_STAT_INC(in_slow_mc);
1706 
1707 	in_dev_put(in_dev);
1708 	hash = rt_hash(daddr, saddr, dev->ifindex);
1709 	return rt_intern_hash(hash, rth, &skb->rtable);
1710 
1711 e_nobufs:
1712 	in_dev_put(in_dev);
1713 	return -ENOBUFS;
1714 
1715 e_inval:
1716 	in_dev_put(in_dev);
1717 	return -EINVAL;
1718 }
1719 
1720 
1721 static void ip_handle_martian_source(struct net_device *dev,
1722 				     struct in_device *in_dev,
1723 				     struct sk_buff *skb,
1724 				     __be32 daddr,
1725 				     __be32 saddr)
1726 {
1727 	RT_CACHE_STAT_INC(in_martian_src);
1728 #ifdef CONFIG_IP_ROUTE_VERBOSE
1729 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1730 		/*
1731 		 *	RFC 1812 recommendation: if the source is martian,
1732 		 *	the only hint is the MAC header.
1733 		 */
1734 		printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1735 			NIPQUAD_FMT", on dev %s\n",
1736 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1737 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1738 			int i;
1739 			const unsigned char *p = skb_mac_header(skb);
1740 			printk(KERN_WARNING "ll header: ");
1741 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1742 				printk("%02x", *p);
1743 				if (i < (dev->hard_header_len - 1))
1744 					printk(":");
1745 			}
1746 			printk("\n");
1747 		}
1748 	}
1749 #endif
1750 }
1751 
1752 static int __mkroute_input(struct sk_buff *skb,
1753 			   struct fib_result *res,
1754 			   struct in_device *in_dev,
1755 			   __be32 daddr, __be32 saddr, u32 tos,
1756 			   struct rtable **result)
1757 {
1758 
1759 	struct rtable *rth;
1760 	int err;
1761 	struct in_device *out_dev;
1762 	unsigned flags = 0;
1763 	__be32 spec_dst;
1764 	u32 itag;
1765 
1766 	/* get a working reference to the output device */
1767 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1768 	if (out_dev == NULL) {
1769 		if (net_ratelimit())
1770 			printk(KERN_CRIT "Bug in ip_route_input" \
1771 			       "_slow(). Please, report\n");
1772 		return -EINVAL;
1773 	}
1774 
1775 
1776 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1777 				  in_dev->dev, &spec_dst, &itag);
1778 	if (err < 0) {
1779 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1780 					 saddr);
1781 
1782 		err = -EINVAL;
1783 		goto cleanup;
1784 	}
1785 
1786 	if (err)
1787 		flags |= RTCF_DIRECTSRC;
1788 
1789 	if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1790 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1791 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1792 		flags |= RTCF_DOREDIRECT;
1793 
1794 	if (skb->protocol != htons(ETH_P_IP)) {
1795 		/* Not IP (i.e. ARP). Do not create a route if it is
1796 		 * invalid for proxy ARP. DNAT routes are always valid.
1797 		 */
1798 		if (out_dev == in_dev) {
1799 			err = -EINVAL;
1800 			goto cleanup;
1801 		}
1802 	}
1803 
1804 
1805 	rth = dst_alloc(&ipv4_dst_ops);
1806 	if (!rth) {
1807 		err = -ENOBUFS;
1808 		goto cleanup;
1809 	}
1810 
1811 	atomic_set(&rth->u.dst.__refcnt, 1);
1812 	rth->u.dst.flags= DST_HOST;
1813 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1814 		rth->u.dst.flags |= DST_NOPOLICY;
1815 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1816 		rth->u.dst.flags |= DST_NOXFRM;
1817 	rth->fl.fl4_dst	= daddr;
1818 	rth->rt_dst	= daddr;
1819 	rth->fl.fl4_tos	= tos;
1820 	rth->fl.mark    = skb->mark;
1821 	rth->fl.fl4_src	= saddr;
1822 	rth->rt_src	= saddr;
1823 	rth->rt_gateway	= daddr;
1824 	rth->rt_iif 	=
1825 		rth->fl.iif	= in_dev->dev->ifindex;
1826 	rth->u.dst.dev	= (out_dev)->dev;
1827 	dev_hold(rth->u.dst.dev);
1828 	rth->idev	= in_dev_get(rth->u.dst.dev);
1829 	rth->fl.oif 	= 0;
1830 	rth->rt_spec_dst= spec_dst;
1831 
1832 	rth->u.dst.input = ip_forward;
1833 	rth->u.dst.output = ip_output;
1834 	rth->rt_genid = atomic_read(&rt_genid);
1835 
1836 	rt_set_nexthop(rth, res, itag);
1837 
1838 	rth->rt_flags = flags;
1839 
1840 	*result = rth;
1841 	err = 0;
1842  cleanup:
1843 	/* release the working reference to the output device */
1844 	in_dev_put(out_dev);
1845 	return err;
1846 }
1847 
1848 static int ip_mkroute_input(struct sk_buff *skb,
1849 			    struct fib_result *res,
1850 			    const struct flowi *fl,
1851 			    struct in_device *in_dev,
1852 			    __be32 daddr, __be32 saddr, u32 tos)
1853 {
1854 	struct rtable* rth = NULL;
1855 	int err;
1856 	unsigned hash;
1857 
1858 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1859 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1860 		fib_select_multipath(fl, res);
1861 #endif
1862 
1863 	/* create a routing cache entry */
1864 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1865 	if (err)
1866 		return err;
1867 
1868 	/* put it into the cache */
1869 	hash = rt_hash(daddr, saddr, fl->iif);
1870 	return rt_intern_hash(hash, rth, &skb->rtable);
1871 }
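/*
 * ip_mkroute_input() is the tail of the forwarding slow path: after the FIB
 * lookup (and optional multipath selection), it builds the cache entry via
 * __mkroute_input() and installs it with rt_intern_hash(), keyed on
 * (daddr, saddr, fl->iif).
 */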
1872 
1873 /*
1874  *	NOTE. We drop all packets that have a local source
1875  *	address, because every properly looped-back packet
1876  *	must already have the correct destination attached by the output routine.
1877  *
1878  *	This approach solves two big problems:
1879  *	1. Non-simplex devices are handled properly.
1880  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1881  */
1882 
1883 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1884 			       u8 tos, struct net_device *dev)
1885 {
1886 	struct fib_result res;
1887 	struct in_device *in_dev = in_dev_get(dev);
1888 	struct flowi fl = { .nl_u = { .ip4_u =
1889 				      { .daddr = daddr,
1890 					.saddr = saddr,
1891 					.tos = tos,
1892 					.scope = RT_SCOPE_UNIVERSE,
1893 				      } },
1894 			    .mark = skb->mark,
1895 			    .iif = dev->ifindex };
1896 	unsigned	flags = 0;
1897 	u32		itag = 0;
1898 	struct rtable * rth;
1899 	unsigned	hash;
1900 	__be32		spec_dst;
1901 	int		err = -EINVAL;
1902 	int		free_res = 0;
1903 	struct net    * net = dev_net(dev);
1904 
1905 	/* IP on this device is disabled. */
1906 
1907 	if (!in_dev)
1908 		goto out;
1909 
1910 	/* Check for the most weird martians, which cannot be detected
1911 	   by fib_lookup.
1912 	 */
1913 
1914 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1915 	    ipv4_is_loopback(saddr))
1916 		goto martian_source;
1917 
1918 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1919 		goto brd_input;
1920 
1921 	/* Accept zero addresses only to limited broadcast;
1922 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1923 	 */
1924 	if (ipv4_is_zeronet(saddr))
1925 		goto martian_source;
1926 
1927 	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1928 	    ipv4_is_loopback(daddr))
1929 		goto martian_destination;
1930 
1931 	/*
1932 	 *	Now we are ready to route packet.
1933 	 */
1934 	if ((err = fib_lookup(net, &fl, &res)) != 0) {
1935 		if (!IN_DEV_FORWARD(in_dev))
1936 			goto e_hostunreach;
1937 		goto no_route;
1938 	}
1939 	free_res = 1;
1940 
1941 	RT_CACHE_STAT_INC(in_slow_tot);
1942 
1943 	if (res.type == RTN_BROADCAST)
1944 		goto brd_input;
1945 
1946 	if (res.type == RTN_LOCAL) {
1947 		int result;
1948 		result = fib_validate_source(saddr, daddr, tos,
1949 					     net->loopback_dev->ifindex,
1950 					     dev, &spec_dst, &itag);
1951 		if (result < 0)
1952 			goto martian_source;
1953 		if (result)
1954 			flags |= RTCF_DIRECTSRC;
1955 		spec_dst = daddr;
1956 		goto local_input;
1957 	}
1958 
1959 	if (!IN_DEV_FORWARD(in_dev))
1960 		goto e_hostunreach;
1961 	if (res.type != RTN_UNICAST)
1962 		goto martian_destination;
1963 
1964 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1965 done:
1966 	in_dev_put(in_dev);
1967 	if (free_res)
1968 		fib_res_put(&res);
1969 out:	return err;
1970 
1971 brd_input:
1972 	if (skb->protocol != htons(ETH_P_IP))
1973 		goto e_inval;
1974 
1975 	if (ipv4_is_zeronet(saddr))
1976 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1977 	else {
1978 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1979 					  &itag);
1980 		if (err < 0)
1981 			goto martian_source;
1982 		if (err)
1983 			flags |= RTCF_DIRECTSRC;
1984 	}
1985 	flags |= RTCF_BROADCAST;
1986 	res.type = RTN_BROADCAST;
1987 	RT_CACHE_STAT_INC(in_brd);
1988 
1989 local_input:
1990 	rth = dst_alloc(&ipv4_dst_ops);
1991 	if (!rth)
1992 		goto e_nobufs;
1993 
1994 	rth->u.dst.output= ip_rt_bug;
1995 	rth->rt_genid = atomic_read(&rt_genid);
1996 
1997 	atomic_set(&rth->u.dst.__refcnt, 1);
1998 	rth->u.dst.flags= DST_HOST;
1999 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2000 		rth->u.dst.flags |= DST_NOPOLICY;
2001 	rth->fl.fl4_dst	= daddr;
2002 	rth->rt_dst	= daddr;
2003 	rth->fl.fl4_tos	= tos;
2004 	rth->fl.mark    = skb->mark;
2005 	rth->fl.fl4_src	= saddr;
2006 	rth->rt_src	= saddr;
2007 #ifdef CONFIG_NET_CLS_ROUTE
2008 	rth->u.dst.tclassid = itag;
2009 #endif
2010 	rth->rt_iif	=
2011 	rth->fl.iif	= dev->ifindex;
2012 	rth->u.dst.dev	= net->loopback_dev;
2013 	dev_hold(rth->u.dst.dev);
2014 	rth->idev	= in_dev_get(rth->u.dst.dev);
2015 	rth->rt_gateway	= daddr;
2016 	rth->rt_spec_dst= spec_dst;
2017 	rth->u.dst.input= ip_local_deliver;
2018 	rth->rt_flags 	= flags|RTCF_LOCAL;
2019 	if (res.type == RTN_UNREACHABLE) {
2020 		rth->u.dst.input= ip_error;
2021 		rth->u.dst.error= -err;
2022 		rth->rt_flags 	&= ~RTCF_LOCAL;
2023 	}
2024 	rth->rt_type	= res.type;
2025 	hash = rt_hash(daddr, saddr, fl.iif);
2026 	err = rt_intern_hash(hash, rth, &skb->rtable);
2027 	goto done;
2028 
2029 no_route:
2030 	RT_CACHE_STAT_INC(in_no_route);
2031 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2032 	res.type = RTN_UNREACHABLE;
2033 	if (err == -ESRCH)
2034 		err = -ENETUNREACH;
2035 	goto local_input;
2036 
2037 	/*
2038 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2039 	 */
2040 martian_destination:
2041 	RT_CACHE_STAT_INC(in_martian_dst);
2042 #ifdef CONFIG_IP_ROUTE_VERBOSE
2043 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2044 		printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2045 			NIPQUAD_FMT ", dev %s\n",
2046 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2047 #endif
2048 
2049 e_hostunreach:
2050 	err = -EHOSTUNREACH;
2051 	goto done;
2052 
2053 e_inval:
2054 	err = -EINVAL;
2055 	goto done;
2056 
2057 e_nobufs:
2058 	err = -ENOBUFS;
2059 	goto done;
2060 
2061 martian_source:
2062 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2063 	goto e_inval;
2064 }
2065 
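/*
 * Input route lookup fast path: probe the route cache first.  On a miss,
 * multicast destinations are screened against local group membership and
 * handed to ip_route_input_mc(); everything else falls through to
 * ip_route_input_slow().
 */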
2066 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2067 		   u8 tos, struct net_device *dev)
2068 {
2069 	struct rtable * rth;
2070 	unsigned	hash;
2071 	int iif = dev->ifindex;
2072 	struct net *net;
2073 
2074 	net = dev_net(dev);
2075 	tos &= IPTOS_RT_MASK;
2076 	hash = rt_hash(daddr, saddr, iif);
2077 
2078 	rcu_read_lock();
2079 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2080 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
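		/*
		 * Compare the whole flow key in one go: XOR each cached
		 * field with the packet's and OR the results; the sum is
		 * zero only when daddr, saddr, iif and tos all match and
		 * the entry's oif is 0 (i.e. it is an input route).  The
		 * mark, namespace and cache generation are checked
		 * separately below.
		 */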
2081 		if (((rth->fl.fl4_dst ^ daddr) |
2082 		     (rth->fl.fl4_src ^ saddr) |
2083 		     (rth->fl.iif ^ iif) |
2084 		     rth->fl.oif |
2085 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
2086 		    rth->fl.mark == skb->mark &&
2087 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2088 		    rth->rt_genid == atomic_read(&rt_genid)) {
2089 			dst_use(&rth->u.dst, jiffies);
2090 			RT_CACHE_STAT_INC(in_hit);
2091 			rcu_read_unlock();
2092 			skb->rtable = rth;
2093 			return 0;
2094 		}
2095 		RT_CACHE_STAT_INC(in_hlist_search);
2096 	}
2097 	rcu_read_unlock();
2098 
2099 	/* Multicast recognition logic was moved from the route cache to here.
2100 	   The problem was that too many Ethernet cards have broken/missing
2101 	   hardware multicast filters :-( As a result, a host on a multicast
2102 	   network acquires a lot of useless route cache entries, e.g. for
2103 	   SDR messages from all over the world. Now we try to get rid of them.
2104 	   Really, provided the software IP multicast filter is organized
2105 	   reasonably (at least, hashed), this does not result in a slowdown
2106 	   compared with route cache reject entries.
2107 	   Note that multicast routers are not affected, because a
2108 	   route cache entry is created for them eventually.
2109 	 */
2110 	if (ipv4_is_multicast(daddr)) {
2111 		struct in_device *in_dev;
2112 
2113 		rcu_read_lock();
2114 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2115 			int our = ip_check_mc(in_dev, daddr, saddr,
2116 				ip_hdr(skb)->protocol);
2117 			if (our
2118 #ifdef CONFIG_IP_MROUTE
2119 			    || (!ipv4_is_local_multicast(daddr) &&
2120 				IN_DEV_MFORWARD(in_dev))
2121 #endif
2122 			    ) {
2123 				rcu_read_unlock();
2124 				return ip_route_input_mc(skb, daddr, saddr,
2125 							 tos, dev, our);
2126 			}
2127 		}
2128 		rcu_read_unlock();
2129 		return -EINVAL;
2130 	}
2131 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2132 }
2133 
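/*
 * Build an output routing cache entry from an already resolved fib_result:
 * classify broadcast/multicast destinations, take a working reference on
 * the output device's inet_device, then allocate and fill the rtable (flow
 * key, dst input/output handlers, next hop) for ip_mkroute_output() to
 * insert into the cache.
 */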
2134 static int __mkroute_output(struct rtable **result,
2135 			    struct fib_result *res,
2136 			    const struct flowi *fl,
2137 			    const struct flowi *oldflp,
2138 			    struct net_device *dev_out,
2139 			    unsigned flags)
2140 {
2141 	struct rtable *rth;
2142 	struct in_device *in_dev;
2143 	u32 tos = RT_FL_TOS(oldflp);
2144 	int err = 0;
2145 
2146 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2147 		return -EINVAL;
2148 
2149 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2150 		res->type = RTN_BROADCAST;
2151 	else if (ipv4_is_multicast(fl->fl4_dst))
2152 		res->type = RTN_MULTICAST;
2153 	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2154 		return -EINVAL;
2155 
2156 	if (dev_out->flags & IFF_LOOPBACK)
2157 		flags |= RTCF_LOCAL;
2158 
2159 	/* get a working reference to the inet device */
2160 	in_dev = in_dev_get(dev_out);
2161 	if (!in_dev)
2162 		return -EINVAL;
2163 
2164 	if (res->type == RTN_BROADCAST) {
2165 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2166 		if (res->fi) {
2167 			fib_info_put(res->fi);
2168 			res->fi = NULL;
2169 		}
2170 	} else if (res->type == RTN_MULTICAST) {
2171 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2172 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2173 				 oldflp->proto))
2174 			flags &= ~RTCF_LOCAL;
2175 		/* If a multicast route does not exist, use the
2176 		   default one, but do not use a gateway in this case.
2177 		   Yes, it is a hack.
2178 		 */
2179 		if (res->fi && res->prefixlen < 4) {
2180 			fib_info_put(res->fi);
2181 			res->fi = NULL;
2182 		}
2183 	}
2184 
2185 
2186 	rth = dst_alloc(&ipv4_dst_ops);
2187 	if (!rth) {
2188 		err = -ENOBUFS;
2189 		goto cleanup;
2190 	}
2191 
2192 	atomic_set(&rth->u.dst.__refcnt, 1);
2193 	rth->u.dst.flags= DST_HOST;
2194 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2195 		rth->u.dst.flags |= DST_NOXFRM;
2196 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2197 		rth->u.dst.flags |= DST_NOPOLICY;
2198 
2199 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2200 	rth->fl.fl4_tos	= tos;
2201 	rth->fl.fl4_src	= oldflp->fl4_src;
2202 	rth->fl.oif	= oldflp->oif;
2203 	rth->fl.mark    = oldflp->mark;
2204 	rth->rt_dst	= fl->fl4_dst;
2205 	rth->rt_src	= fl->fl4_src;
2206 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2207 	/* get references to the devices that are to be held by the routing
2208 	   cache entry */
2209 	rth->u.dst.dev	= dev_out;
2210 	dev_hold(dev_out);
2211 	rth->idev	= in_dev_get(dev_out);
2212 	rth->rt_gateway = fl->fl4_dst;
2213 	rth->rt_spec_dst= fl->fl4_src;
2214 
2215 	rth->u.dst.output=ip_output;
2216 	rth->rt_genid = atomic_read(&rt_genid);
2217 
2218 	RT_CACHE_STAT_INC(out_slow_tot);
2219 
2220 	if (flags & RTCF_LOCAL) {
2221 		rth->u.dst.input = ip_local_deliver;
2222 		rth->rt_spec_dst = fl->fl4_dst;
2223 	}
2224 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2225 		rth->rt_spec_dst = fl->fl4_src;
2226 		if (flags & RTCF_LOCAL &&
2227 		    !(dev_out->flags & IFF_LOOPBACK)) {
2228 			rth->u.dst.output = ip_mc_output;
2229 			RT_CACHE_STAT_INC(out_slow_mc);
2230 		}
2231 #ifdef CONFIG_IP_MROUTE
2232 		if (res->type == RTN_MULTICAST) {
2233 			if (IN_DEV_MFORWARD(in_dev) &&
2234 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2235 				rth->u.dst.input = ip_mr_input;
2236 				rth->u.dst.output = ip_mc_output;
2237 			}
2238 		}
2239 #endif
2240 	}
2241 
2242 	rt_set_nexthop(rth, res, 0);
2243 
2244 	rth->rt_flags = flags;
2245 
2246 	*result = rth;
2247  cleanup:
2248 	/* release the working reference to the inet device */
2249 	in_dev_put(in_dev);
2250 
2251 	return err;
2252 }
2253 
2254 static int ip_mkroute_output(struct rtable **rp,
2255 			     struct fib_result *res,
2256 			     const struct flowi *fl,
2257 			     const struct flowi *oldflp,
2258 			     struct net_device *dev_out,
2259 			     unsigned flags)
2260 {
2261 	struct rtable *rth = NULL;
2262 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2263 	unsigned hash;
2264 	if (err == 0) {
2265 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2266 		err = rt_intern_hash(hash, rth, rp);
2267 	}
2268 
2269 	return err;
2270 }
2271 
2272 /*
2273  * Major route resolver routine.
2274  */
2275 
2276 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2277 				const struct flowi *oldflp)
2278 {
2279 	u32 tos	= RT_FL_TOS(oldflp);
2280 	struct flowi fl = { .nl_u = { .ip4_u =
2281 				      { .daddr = oldflp->fl4_dst,
2282 					.saddr = oldflp->fl4_src,
2283 					.tos = tos & IPTOS_RT_MASK,
2284 					.scope = ((tos & RTO_ONLINK) ?
2285 						  RT_SCOPE_LINK :
2286 						  RT_SCOPE_UNIVERSE),
2287 				      } },
2288 			    .mark = oldflp->mark,
2289 			    .iif = net->loopback_dev->ifindex,
2290 			    .oif = oldflp->oif };
2291 	struct fib_result res;
2292 	unsigned flags = 0;
2293 	struct net_device *dev_out = NULL;
2294 	int free_res = 0;
2295 	int err;
2296 
2297 
2298 	res.fi		= NULL;
2299 #ifdef CONFIG_IP_MULTIPLE_TABLES
2300 	res.r		= NULL;
2301 #endif
2302 
2303 	if (oldflp->fl4_src) {
2304 		err = -EINVAL;
2305 		if (ipv4_is_multicast(oldflp->fl4_src) ||
2306 		    ipv4_is_lbcast(oldflp->fl4_src) ||
2307 		    ipv4_is_zeronet(oldflp->fl4_src))
2308 			goto out;
2309 
2310 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2311 		dev_out = ip_dev_find(net, oldflp->fl4_src);
2312 		if (dev_out == NULL)
2313 			goto out;
2314 
2315 		/* I removed check for oif == dev_out->oif here.
2316 		   It was wrong for two reasons:
2317 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2318 		      is assigned to multiple interfaces.
2319 		   2. Moreover, we are allowed to send packets with the saddr
2320 		      of another iface. --ANK
2321 		 */
2322 
2323 		if (oldflp->oif == 0
2324 		    && (ipv4_is_multicast(oldflp->fl4_dst) ||
2325 			oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2326 			/* Special hack: the user can direct multicasts
2327 			   and limited broadcast via the necessary interface
2328 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2329 			   This hack is not just for fun, it allows
2330 			   vic, vat and friends to work.
2331 			   They bind the socket to loopback, set ttl to zero
2332 			   and expect that it will work.
2333 			   From the viewpoint of the routing cache they are broken,
2334 			   because we are not allowed to build a multicast path
2335 			   with a loopback source addr (the routing cache
2336 			   cannot know that ttl is zero, so the packet
2337 			   will not leave this host and the route is valid).
2338 			   Luckily, this hack is a good workaround.
2339 			 */
2340 
2341 			fl.oif = dev_out->ifindex;
2342 			goto make_route;
2343 		}
2344 		if (dev_out)
2345 			dev_put(dev_out);
2346 		dev_out = NULL;
2347 	}
2348 
2349 
2350 	if (oldflp->oif) {
2351 		dev_out = dev_get_by_index(net, oldflp->oif);
2352 		err = -ENODEV;
2353 		if (dev_out == NULL)
2354 			goto out;
2355 
2356 		/* RACE: Check return value of inet_select_addr instead. */
2357 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2358 			dev_put(dev_out);
2359 			goto out;	/* Wrong error code */
2360 		}
2361 
2362 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2363 		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2364 			if (!fl.fl4_src)
2365 				fl.fl4_src = inet_select_addr(dev_out, 0,
2366 							      RT_SCOPE_LINK);
2367 			goto make_route;
2368 		}
2369 		if (!fl.fl4_src) {
2370 			if (ipv4_is_multicast(oldflp->fl4_dst))
2371 				fl.fl4_src = inet_select_addr(dev_out, 0,
2372 							      fl.fl4_scope);
2373 			else if (!oldflp->fl4_dst)
2374 				fl.fl4_src = inet_select_addr(dev_out, 0,
2375 							      RT_SCOPE_HOST);
2376 		}
2377 	}
2378 
2379 	if (!fl.fl4_dst) {
2380 		fl.fl4_dst = fl.fl4_src;
2381 		if (!fl.fl4_dst)
2382 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2383 		if (dev_out)
2384 			dev_put(dev_out);
2385 		dev_out = net->loopback_dev;
2386 		dev_hold(dev_out);
2387 		fl.oif = net->loopback_dev->ifindex;
2388 		res.type = RTN_LOCAL;
2389 		flags |= RTCF_LOCAL;
2390 		goto make_route;
2391 	}
2392 
2393 	if (fib_lookup(net, &fl, &res)) {
2394 		res.fi = NULL;
2395 		if (oldflp->oif) {
2396 			/* Apparently, the routing tables are wrong. Assume
2397 			   that the destination is on-link.
2398 
2399 			   WHY? DW.
2400 			   Because we are allowed to send to an iface
2401 			   even if it has NO routes and NO assigned
2402 			   addresses. When oif is specified, the routing
2403 			   tables are looked up with only one purpose:
2404 			   to catch whether the destination is gatewayed rather
2405 			   than direct. Moreover, if MSG_DONTROUTE is set,
2406 			   we send the packet, ignoring both routing tables
2407 			   and ifaddr state. --ANK
2408 
2409 
2410 			   We could do this even if oif is unknown,
2411 			   likely IPv6, but we do not.
2412 			 */
2413 
2414 			if (fl.fl4_src == 0)
2415 				fl.fl4_src = inet_select_addr(dev_out, 0,
2416 							      RT_SCOPE_LINK);
2417 			res.type = RTN_UNICAST;
2418 			goto make_route;
2419 		}
2420 		if (dev_out)
2421 			dev_put(dev_out);
2422 		err = -ENETUNREACH;
2423 		goto out;
2424 	}
2425 	free_res = 1;
2426 
2427 	if (res.type == RTN_LOCAL) {
2428 		if (!fl.fl4_src)
2429 			fl.fl4_src = fl.fl4_dst;
2430 		if (dev_out)
2431 			dev_put(dev_out);
2432 		dev_out = net->loopback_dev;
2433 		dev_hold(dev_out);
2434 		fl.oif = dev_out->ifindex;
2435 		if (res.fi)
2436 			fib_info_put(res.fi);
2437 		res.fi = NULL;
2438 		flags |= RTCF_LOCAL;
2439 		goto make_route;
2440 	}
2441 
2442 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2443 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2444 		fib_select_multipath(&fl, &res);
2445 	else
2446 #endif
2447 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2448 		fib_select_default(net, &fl, &res);
2449 
2450 	if (!fl.fl4_src)
2451 		fl.fl4_src = FIB_RES_PREFSRC(res);
2452 
2453 	if (dev_out)
2454 		dev_put(dev_out);
2455 	dev_out = FIB_RES_DEV(res);
2456 	dev_hold(dev_out);
2457 	fl.oif = dev_out->ifindex;
2458 
2459 
2460 make_route:
2461 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2462 
2463 
2464 	if (free_res)
2465 		fib_res_put(&res);
2466 	if (dev_out)
2467 		dev_put(dev_out);
2468 out:	return err;
2469 }
2470 
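/*
 * Output route lookup fast path: scan the cache chain under
 * rcu_read_lock_bh(), requiring an exact match on the flow key (with a
 * zero iif) and on the current cache generation, and fall back to
 * ip_route_output_slow() on a miss.
 */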
2471 int __ip_route_output_key(struct net *net, struct rtable **rp,
2472 			  const struct flowi *flp)
2473 {
2474 	unsigned hash;
2475 	struct rtable *rth;
2476 
2477 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2478 
2479 	rcu_read_lock_bh();
2480 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2481 		rth = rcu_dereference(rth->u.dst.rt_next)) {
2482 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2483 		    rth->fl.fl4_src == flp->fl4_src &&
2484 		    rth->fl.iif == 0 &&
2485 		    rth->fl.oif == flp->oif &&
2486 		    rth->fl.mark == flp->mark &&
2487 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2488 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2489 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2490 		    rth->rt_genid == atomic_read(&rt_genid)) {
2491 			dst_use(&rth->u.dst, jiffies);
2492 			RT_CACHE_STAT_INC(out_hit);
2493 			rcu_read_unlock_bh();
2494 			*rp = rth;
2495 			return 0;
2496 		}
2497 		RT_CACHE_STAT_INC(out_hlist_search);
2498 	}
2499 	rcu_read_unlock_bh();
2500 
2501 	return ip_route_output_slow(net, rp, flp);
2502 }
2503 
2504 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2505 
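/*
 * Blackhole routes only exist to swallow packets, so PMTU updates on them
 * are deliberately ignored.
 */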
2506 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2507 {
2508 }
2509 
2510 static struct dst_ops ipv4_dst_blackhole_ops = {
2511 	.family			=	AF_INET,
2512 	.protocol		=	__constant_htons(ETH_P_IP),
2513 	.destroy		=	ipv4_dst_destroy,
2514 	.check			=	ipv4_dst_check,
2515 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2516 	.entry_size		=	sizeof(struct rtable),
2517 	.entries		=	ATOMIC_INIT(0),
2518 };
2519 
2520 
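/*
 * Clone an existing route into a "blackhole" copy whose input and output
 * handlers are dst_discard, giving the caller a valid dst that silently
 * drops packets.  ip_route_output_flow() substitutes this when
 * __xfrm_lookup() returns -EREMOTE, i.e. a non-blocking lookup whose xfrm
 * state is not resolved yet.
 */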
2521 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
2522 {
2523 	struct rtable *ort = *rp;
2524 	struct rtable *rt = (struct rtable *)
2525 		dst_alloc(&ipv4_dst_blackhole_ops);
2526 
2527 	if (rt) {
2528 		struct dst_entry *new = &rt->u.dst;
2529 
2530 		atomic_set(&new->__refcnt, 1);
2531 		new->__use = 1;
2532 		new->input = dst_discard;
2533 		new->output = dst_discard;
2534 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2535 
2536 		new->dev = ort->u.dst.dev;
2537 		if (new->dev)
2538 			dev_hold(new->dev);
2539 
2540 		rt->fl = ort->fl;
2541 
2542 		rt->idev = ort->idev;
2543 		if (rt->idev)
2544 			in_dev_hold(rt->idev);
2545 		rt->rt_genid = atomic_read(&rt_genid);
2546 		rt->rt_flags = ort->rt_flags;
2547 		rt->rt_type = ort->rt_type;
2548 		rt->rt_dst = ort->rt_dst;
2549 		rt->rt_src = ort->rt_src;
2550 		rt->rt_iif = ort->rt_iif;
2551 		rt->rt_gateway = ort->rt_gateway;
2552 		rt->rt_spec_dst = ort->rt_spec_dst;
2553 		rt->peer = ort->peer;
2554 		if (rt->peer)
2555 			atomic_inc(&rt->peer->refcnt);
2556 
2557 		dst_free(new);
2558 	}
2559 
2560 	dst_release(&(*rp)->u.dst);
2561 	*rp = rt;
2562 	return (rt ? 0 : -ENOMEM);
2563 }
2564 
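/*
 * Resolve an output route and, if the flow specifies a transport protocol,
 * fill in any missing flow addresses from the result and pass it through
 * the xfrm policy lookup (waiting for resolution only when the caller
 * asked for it via flags).
 */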
2565 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2566 			 struct sock *sk, int flags)
2567 {
2568 	int err;
2569 
2570 	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2571 		return err;
2572 
2573 	if (flp->proto) {
2574 		if (!flp->fl4_src)
2575 			flp->fl4_src = (*rp)->rt_src;
2576 		if (!flp->fl4_dst)
2577 			flp->fl4_dst = (*rp)->rt_dst;
2578 		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2579 				    flags ? XFRM_LOOKUP_WAIT : 0);
2580 		if (err == -EREMOTE)
2581 			err = ipv4_dst_blackhole(rp, flp);
2582 
2583 		return err;
2584 	}
2585 
2586 	return 0;
2587 }
2588 
2589 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2590 
2591 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2592 {
2593 	return ip_route_output_flow(net, rp, flp, NULL, 0);
2594 }
2595 
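/*
 * Encode a cached route as an RTM_NEWROUTE netlink message: flow
 * selectors, resolved addresses and device, metrics, and cache state
 * (expiry, error, inet_peer id/timestamp information).
 */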
2596 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2597 			int nowait, unsigned int flags)
2598 {
2599 	struct rtable *rt = skb->rtable;
2600 	struct rtmsg *r;
2601 	struct nlmsghdr *nlh;
2602 	long expires;
2603 	u32 id = 0, ts = 0, tsage = 0, error;
2604 
2605 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2606 	if (nlh == NULL)
2607 		return -EMSGSIZE;
2608 
2609 	r = nlmsg_data(nlh);
2610 	r->rtm_family	 = AF_INET;
2611 	r->rtm_dst_len	= 32;
2612 	r->rtm_src_len	= 0;
2613 	r->rtm_tos	= rt->fl.fl4_tos;
2614 	r->rtm_table	= RT_TABLE_MAIN;
2615 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2616 	r->rtm_type	= rt->rt_type;
2617 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2618 	r->rtm_protocol = RTPROT_UNSPEC;
2619 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2620 	if (rt->rt_flags & RTCF_NOTIFY)
2621 		r->rtm_flags |= RTM_F_NOTIFY;
2622 
2623 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2624 
2625 	if (rt->fl.fl4_src) {
2626 		r->rtm_src_len = 32;
2627 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2628 	}
2629 	if (rt->u.dst.dev)
2630 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2631 #ifdef CONFIG_NET_CLS_ROUTE
2632 	if (rt->u.dst.tclassid)
2633 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2634 #endif
2635 	if (rt->fl.iif)
2636 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2637 	else if (rt->rt_src != rt->fl.fl4_src)
2638 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2639 
2640 	if (rt->rt_dst != rt->rt_gateway)
2641 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2642 
2643 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2644 		goto nla_put_failure;
2645 
2646 	error = rt->u.dst.error;
2647 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2648 	if (rt->peer) {
2649 		id = rt->peer->ip_id_count;
2650 		if (rt->peer->tcp_ts_stamp) {
2651 			ts = rt->peer->tcp_ts;
2652 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2653 		}
2654 	}
2655 
2656 	if (rt->fl.iif) {
2657 #ifdef CONFIG_IP_MROUTE
2658 		__be32 dst = rt->rt_dst;
2659 
2660 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2661 		    IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2662 			int err = ipmr_get_route(skb, r, nowait);
2663 			if (err <= 0) {
2664 				if (!nowait) {
2665 					if (err == 0)
2666 						return 0;
2667 					goto nla_put_failure;
2668 				} else {
2669 					if (err == -EMSGSIZE)
2670 						goto nla_put_failure;
2671 					error = err;
2672 				}
2673 			}
2674 		} else
2675 #endif
2676 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2677 	}
2678 
2679 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2680 			       expires, error) < 0)
2681 		goto nla_put_failure;
2682 
2683 	return nlmsg_end(skb, nlh);
2684 
2685 nla_put_failure:
2686 	nlmsg_cancel(skb, nlh);
2687 	return -EMSGSIZE;
2688 }
2689 
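/*
 * Handle an RTM_GETROUTE request (as issued by e.g. "ip route get"):
 * build a dummy skb, resolve the route through the input path when
 * RTA_IIF is supplied or through the output path otherwise, and unicast
 * the result back to the requester as an RTM_NEWROUTE message.
 */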
2690 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2691 {
2692 	struct net *net = sock_net(in_skb->sk);
2693 	struct rtmsg *rtm;
2694 	struct nlattr *tb[RTA_MAX+1];
2695 	struct rtable *rt = NULL;
2696 	__be32 dst = 0;
2697 	__be32 src = 0;
2698 	u32 iif;
2699 	int err;
2700 	struct sk_buff *skb;
2701 
2702 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2703 	if (err < 0)
2704 		goto errout;
2705 
2706 	rtm = nlmsg_data(nlh);
2707 
2708 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2709 	if (skb == NULL) {
2710 		err = -ENOBUFS;
2711 		goto errout;
2712 	}
2713 
2714 	/* Reserve room for dummy headers; this skb can pass
2715 	   through a good chunk of the routing engine.
2716 	 */
2717 	skb_reset_mac_header(skb);
2718 	skb_reset_network_header(skb);
2719 
2720 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2721 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2722 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2723 
2724 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2725 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2726 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2727 
2728 	if (iif) {
2729 		struct net_device *dev;
2730 
2731 		dev = __dev_get_by_index(net, iif);
2732 		if (dev == NULL) {
2733 			err = -ENODEV;
2734 			goto errout_free;
2735 		}
2736 
2737 		skb->protocol	= htons(ETH_P_IP);
2738 		skb->dev	= dev;
2739 		local_bh_disable();
2740 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2741 		local_bh_enable();
2742 
2743 		rt = skb->rtable;
2744 		if (err == 0 && rt->u.dst.error)
2745 			err = -rt->u.dst.error;
2746 	} else {
2747 		struct flowi fl = {
2748 			.nl_u = {
2749 				.ip4_u = {
2750 					.daddr = dst,
2751 					.saddr = src,
2752 					.tos = rtm->rtm_tos,
2753 				},
2754 			},
2755 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2756 		};
2757 		err = ip_route_output_key(net, &rt, &fl);
2758 	}
2759 
2760 	if (err)
2761 		goto errout_free;
2762 
2763 	skb->rtable = rt;
2764 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2765 		rt->rt_flags |= RTCF_NOTIFY;
2766 
2767 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2768 			   RTM_NEWROUTE, 0, 0);
2769 	if (err <= 0)
2770 		goto errout_free;
2771 
2772 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2773 errout:
2774 	return err;
2775 
2776 errout_free:
2777 	kfree_skb(skb);
2778 	goto errout;
2779 }
2780 
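/*
 * Dump all generation-current route cache entries as RTM_NEWROUTE
 * messages (e.g. for "ip route show cache"), walking each hash chain
 * under rcu_read_lock_bh() and resuming from cb->args[] across skbs.
 */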
2781 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2782 {
2783 	struct rtable *rt;
2784 	int h, s_h;
2785 	int idx, s_idx;
2786 	struct net *net;
2787 
2788 	net = sock_net(skb->sk);
2789 
2790 	s_h = cb->args[0];
2791 	if (s_h < 0)
2792 		s_h = 0;
2793 	s_idx = idx = cb->args[1];
2794 	for (h = s_h; h <= rt_hash_mask; h++) {
2795 		rcu_read_lock_bh();
2796 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2797 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2798 			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2799 				continue;
2800 			if (rt->rt_genid != atomic_read(&rt_genid))
2801 				continue;
2802 			skb->dst = dst_clone(&rt->u.dst);
2803 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2804 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2805 					 1, NLM_F_MULTI) <= 0) {
2806 				dst_release(xchg(&skb->dst, NULL));
2807 				rcu_read_unlock_bh();
2808 				goto done;
2809 			}
2810 			dst_release(xchg(&skb->dst, NULL));
2811 		}
2812 		rcu_read_unlock_bh();
2813 		s_idx = 0;
2814 	}
2815 
2816 done:
2817 	cb->args[0] = h;
2818 	cb->args[1] = idx;
2819 	return skb->len;
2820 }
2821 
2822 void ip_rt_multicast_event(struct in_device *in_dev)
2823 {
2824 	rt_cache_flush(0);
2825 }
2826 
2827 #ifdef CONFIG_SYSCTL
2828 static int flush_delay;
2829 
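/*
 * /proc/sys/net/ipv4/route/flush is write-only: writing an integer delay,
 * e.g. "echo 0 > /proc/sys/net/ipv4/route/flush", flushes the routing
 * cache via rt_cache_flush(); the handler rejects anything but writes.
 */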
2830 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2831 					struct file *filp, void __user *buffer,
2832 					size_t *lenp, loff_t *ppos)
2833 {
2834 	if (write) {
2835 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2836 		rt_cache_flush(flush_delay);
2837 		return 0;
2838 	}
2839 
2840 	return -EINVAL;
2841 }
2842 
2843 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2844 						int __user *name,
2845 						int nlen,
2846 						void __user *oldval,
2847 						size_t __user *oldlenp,
2848 						void __user *newval,
2849 						size_t newlen)
2850 {
2851 	int delay;
2852 	if (newlen != sizeof(int))
2853 		return -EINVAL;
2854 	if (get_user(delay, (int __user *)newval))
2855 		return -EFAULT;
2856 	rt_cache_flush(delay);
2857 	return 0;
2858 }
2859 
2860 ctl_table ipv4_route_table[] = {
2861 	{
2862 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2863 		.procname	= "flush",
2864 		.data		= &flush_delay,
2865 		.maxlen		= sizeof(int),
2866 		.mode		= 0200,
2867 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2868 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2869 	},
2870 	{
2871 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2872 		.procname	= "gc_thresh",
2873 		.data		= &ipv4_dst_ops.gc_thresh,
2874 		.maxlen		= sizeof(int),
2875 		.mode		= 0644,
2876 		.proc_handler	= &proc_dointvec,
2877 	},
2878 	{
2879 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2880 		.procname	= "max_size",
2881 		.data		= &ip_rt_max_size,
2882 		.maxlen		= sizeof(int),
2883 		.mode		= 0644,
2884 		.proc_handler	= &proc_dointvec,
2885 	},
2886 	{
2887 		/*  Deprecated. Use gc_min_interval_ms */
2888 
2889 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2890 		.procname	= "gc_min_interval",
2891 		.data		= &ip_rt_gc_min_interval,
2892 		.maxlen		= sizeof(int),
2893 		.mode		= 0644,
2894 		.proc_handler	= &proc_dointvec_jiffies,
2895 		.strategy	= &sysctl_jiffies,
2896 	},
2897 	{
2898 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2899 		.procname	= "gc_min_interval_ms",
2900 		.data		= &ip_rt_gc_min_interval,
2901 		.maxlen		= sizeof(int),
2902 		.mode		= 0644,
2903 		.proc_handler	= &proc_dointvec_ms_jiffies,
2904 		.strategy	= &sysctl_ms_jiffies,
2905 	},
2906 	{
2907 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2908 		.procname	= "gc_timeout",
2909 		.data		= &ip_rt_gc_timeout,
2910 		.maxlen		= sizeof(int),
2911 		.mode		= 0644,
2912 		.proc_handler	= &proc_dointvec_jiffies,
2913 		.strategy	= &sysctl_jiffies,
2914 	},
2915 	{
2916 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2917 		.procname	= "gc_interval",
2918 		.data		= &ip_rt_gc_interval,
2919 		.maxlen		= sizeof(int),
2920 		.mode		= 0644,
2921 		.proc_handler	= &proc_dointvec_jiffies,
2922 		.strategy	= &sysctl_jiffies,
2923 	},
2924 	{
2925 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2926 		.procname	= "redirect_load",
2927 		.data		= &ip_rt_redirect_load,
2928 		.maxlen		= sizeof(int),
2929 		.mode		= 0644,
2930 		.proc_handler	= &proc_dointvec,
2931 	},
2932 	{
2933 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2934 		.procname	= "redirect_number",
2935 		.data		= &ip_rt_redirect_number,
2936 		.maxlen		= sizeof(int),
2937 		.mode		= 0644,
2938 		.proc_handler	= &proc_dointvec,
2939 	},
2940 	{
2941 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2942 		.procname	= "redirect_silence",
2943 		.data		= &ip_rt_redirect_silence,
2944 		.maxlen		= sizeof(int),
2945 		.mode		= 0644,
2946 		.proc_handler	= &proc_dointvec,
2947 	},
2948 	{
2949 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
2950 		.procname	= "error_cost",
2951 		.data		= &ip_rt_error_cost,
2952 		.maxlen		= sizeof(int),
2953 		.mode		= 0644,
2954 		.proc_handler	= &proc_dointvec,
2955 	},
2956 	{
2957 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
2958 		.procname	= "error_burst",
2959 		.data		= &ip_rt_error_burst,
2960 		.maxlen		= sizeof(int),
2961 		.mode		= 0644,
2962 		.proc_handler	= &proc_dointvec,
2963 	},
2964 	{
2965 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
2966 		.procname	= "gc_elasticity",
2967 		.data		= &ip_rt_gc_elasticity,
2968 		.maxlen		= sizeof(int),
2969 		.mode		= 0644,
2970 		.proc_handler	= &proc_dointvec,
2971 	},
2972 	{
2973 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
2974 		.procname	= "mtu_expires",
2975 		.data		= &ip_rt_mtu_expires,
2976 		.maxlen		= sizeof(int),
2977 		.mode		= 0644,
2978 		.proc_handler	= &proc_dointvec_jiffies,
2979 		.strategy	= &sysctl_jiffies,
2980 	},
2981 	{
2982 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
2983 		.procname	= "min_pmtu",
2984 		.data		= &ip_rt_min_pmtu,
2985 		.maxlen		= sizeof(int),
2986 		.mode		= 0644,
2987 		.proc_handler	= &proc_dointvec,
2988 	},
2989 	{
2990 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
2991 		.procname	= "min_adv_mss",
2992 		.data		= &ip_rt_min_advmss,
2993 		.maxlen		= sizeof(int),
2994 		.mode		= 0644,
2995 		.proc_handler	= &proc_dointvec,
2996 	},
2997 	{
2998 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
2999 		.procname	= "secret_interval",
3000 		.data		= &ip_rt_secret_interval,
3001 		.maxlen		= sizeof(int),
3002 		.mode		= 0644,
3003 		.proc_handler	= &proc_dointvec_jiffies,
3004 		.strategy	= &sysctl_jiffies,
3005 	},
3006 	{ .ctl_name = 0 }
3007 };
3008 #endif
3009 
3010 #ifdef CONFIG_NET_CLS_ROUTE
3011 struct ip_rt_acct *ip_rt_acct __read_mostly;
3012 #endif /* CONFIG_NET_CLS_ROUTE */
3013 
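/*
 * "rhash_entries=N" on the kernel command line overrides the
 * automatically chosen number of route cache hash buckets handed to
 * alloc_large_system_hash() in ip_rt_init().
 */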
3014 static __initdata unsigned long rhash_entries;
3015 static int __init set_rhash_entries(char *str)
3016 {
3017 	if (!str)
3018 		return 0;
3019 	rhash_entries = simple_strtoul(str, &str, 0);
3020 	return 1;
3021 }
3022 __setup("rhash_entries=", set_rhash_entries);
3023 
3024 int __init ip_rt_init(void)
3025 {
3026 	int rc = 0;
3027 
3028 	atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3029 			     (jiffies ^ (jiffies >> 7))));
3030 
3031 #ifdef CONFIG_NET_CLS_ROUTE
3032 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3033 	if (!ip_rt_acct)
3034 		panic("IP: failed to allocate ip_rt_acct\n");
3035 #endif
3036 
3037 	ipv4_dst_ops.kmem_cachep =
3038 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3039 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3040 
3041 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3042 
3043 	rt_hash_table = (struct rt_hash_bucket *)
3044 		alloc_large_system_hash("IP route cache",
3045 					sizeof(struct rt_hash_bucket),
3046 					rhash_entries,
3047 					(num_physpages >= 128 * 1024) ?
3048 					15 : 17,
3049 					0,
3050 					&rt_hash_log,
3051 					&rt_hash_mask,
3052 					0);
3053 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3054 	rt_hash_lock_init();
3055 
3056 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3057 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3058 
3059 	devinet_init();
3060 	ip_fib_init();
3061 
3062 	rt_secret_timer.function = rt_secret_rebuild;
3063 	rt_secret_timer.data = 0;
3064 	init_timer_deferrable(&rt_secret_timer);
3065 
3066 	/* All the timers started at system startup tend
3067 	   to synchronize. Perturb them a bit.
3068 	 */
3069 	schedule_delayed_work(&expires_work,
3070 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3071 
3072 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3073 		ip_rt_secret_interval;
3074 	add_timer(&rt_secret_timer);
3075 
3076 	if (ip_rt_proc_init())
3077 		printk(KERN_ERR "Unable to create route proc files\n");
3078 #ifdef CONFIG_XFRM
3079 	xfrm_init();
3080 	xfrm4_init();
3081 #endif
3082 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3083 
3084 	return rc;
3085 }
3086 
3087 EXPORT_SYMBOL(__ip_select_ident);
3088 EXPORT_SYMBOL(ip_route_input);
3089 EXPORT_SYMBOL(ip_route_output_key);
3090