1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *		Alan Cox	:	Verify area fixes.
18  *		Alan Cox	:	cli() protects routing changes
19  *		Rui Oliveira	:	ICMP routing table updates
20  *		(rco@di.uminho.pt)	Routing table insertion and update
21  *		Linus Torvalds	:	Rewrote bits to be sensible
22  *		Alan Cox	:	Added BSD route gw semantics
23  *		Alan Cox	:	Super /proc >4K
24  *		Alan Cox	:	MTU in route table
25  *		Alan Cox	: 	MSS actually. Also added the window
26  *					clamper.
27  *		Sam Lantinga	:	Fixed route matching in rt_del()
28  *		Alan Cox	:	Routing cache support.
29  *		Alan Cox	:	Removed compatibility cruft.
30  *		Alan Cox	:	RTF_REJECT support.
31  *		Alan Cox	:	TCP irtt support.
32  *		Jonathan Naylor	:	Added Metric support.
33  *	Miquel van Smoorenburg	:	BSD API fixes.
34  *	Miquel van Smoorenburg	:	Metrics.
35  *		Alan Cox	:	Use __u32 properly
36  *		Alan Cox	:	Aligned routing errors more closely with BSD;
37  *					our system is still very different.
38  *		Alan Cox	:	Faster /proc handling
39  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40  *					routing caches and better behaviour.
41  *
42  *		Olaf Erb	:	irtt wasn't being copied right.
43  *		Bjorn Ekwall	:	Kerneld route support.
44  *		Alan Cox	:	Multicast fixed (I hope)
45  * 		Pavel Krauz	:	Limited broadcast fixed
46  *		Mike McLagan	:	Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
48  *					route.c and rewritten from scratch.
49  *		Andi Kleen	:	Load-limit warning messages.
50  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54  *		Marc Boucher	:	routing by fwmark
55  *	Robert Olsson		:	Added rt_cache statistics
56  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
57  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
58  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
59  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
60  *
61  *		This program is free software; you can redistribute it and/or
62  *		modify it under the terms of the GNU General Public License
63  *		as published by the Free Software Foundation; either version
64  *		2 of the License, or (at your option) any later version.
65  */
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 
113 #define RT_FL_TOS(oldflp) \
114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
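/*
 * Note: RT_FL_TOS keeps only the TOS bits that are significant for
 * routing (IPTOS_RT_MASK) plus the internal RTO_ONLINK flag, which
 * overlays a low TOS bit and requests an on-link (RT_SCOPE_LINK)
 * route.
 */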
115 
116 #define IP_MAX_MTU	0xFFF0
117 
118 #define RT_GC_TIMEOUT (300*HZ)
119 
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval		= 60 * HZ;
123 static int ip_rt_gc_min_interval	= HZ / 2;
124 static int ip_rt_redirect_number	= 9;
125 static int ip_rt_redirect_load		= HZ / 50;
126 static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost		= HZ;
128 static int ip_rt_error_burst		= 5 * HZ;
129 static int ip_rt_gc_elasticity		= 8;
130 static int ip_rt_mtu_expires		= 10 * 60 * HZ;
131 static int ip_rt_min_pmtu		= 512 + 20 + 20;
132 static int ip_rt_min_advmss		= 256;
133 static int ip_rt_secret_interval	= 10 * 60 * HZ;
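/*
 * The intervals above are in jiffies.  Worked example with the
 * defaults: ip_rt_redirect_silence is (HZ / 50) << 10, i.e. roughly
 * 20 seconds independent of HZ, and ip_rt_min_pmtu is 552 bytes
 * (512 bytes of payload plus 20-byte IP and TCP headers).
 */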
134 
135 #define RTprint(a...)	printk(KERN_DEBUG a)
136 
137 static void rt_worker_func(struct work_struct *work);
138 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
139 static struct timer_list rt_secret_timer;
140 
141 /*
142  *	Interface to generic destination cache.
143  */
144 
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void		 ipv4_dst_destroy(struct dst_entry *dst);
147 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
148 					 struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void		 ipv4_link_failure(struct sk_buff *skb);
151 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153 
154 
155 static struct dst_ops ipv4_dst_ops = {
156 	.family =		AF_INET,
157 	.protocol =		__constant_htons(ETH_P_IP),
158 	.gc =			rt_garbage_collect,
159 	.check =		ipv4_dst_check,
160 	.destroy =		ipv4_dst_destroy,
161 	.ifdown =		ipv4_dst_ifdown,
162 	.negative_advice =	ipv4_negative_advice,
163 	.link_failure =		ipv4_link_failure,
164 	.update_pmtu =		ip_rt_update_pmtu,
165 	.local_out =		ip_local_out,
166 	.entry_size =		sizeof(struct rtable),
167 	.entries =		ATOMIC_INIT(0),
168 };
169 
170 #define ECN_OR_COST(class)	TC_PRIO_##class
171 
172 const __u8 ip_tos2prio[16] = {
173 	TC_PRIO_BESTEFFORT,
174 	ECN_OR_COST(FILLER),
175 	TC_PRIO_BESTEFFORT,
176 	ECN_OR_COST(BESTEFFORT),
177 	TC_PRIO_BULK,
178 	ECN_OR_COST(BULK),
179 	TC_PRIO_BULK,
180 	ECN_OR_COST(BULK),
181 	TC_PRIO_INTERACTIVE,
182 	ECN_OR_COST(INTERACTIVE),
183 	TC_PRIO_INTERACTIVE,
184 	ECN_OR_COST(INTERACTIVE),
185 	TC_PRIO_INTERACTIVE_BULK,
186 	ECN_OR_COST(INTERACTIVE_BULK),
187 	TC_PRIO_INTERACTIVE_BULK,
188 	ECN_OR_COST(INTERACTIVE_BULK)
189 };
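/*
 * The table above is indexed by the four RFC 1349 TOS bits shifted
 * right by one (see rt_tos2priority() in <net/route.h>); for example,
 * IPTOS_LOWDELAY (0x10) selects slot 8 and maps to TC_PRIO_INTERACTIVE.
 */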
190 
191 
192 /*
193  * Route cache.
194  */
195 
196 /* The locking scheme is rather straightforward:
197  *
198  * 1) Read-Copy Update protects the buckets of the central route hash.
199  * 2) Only writers remove entries, and they hold the lock
200  *    as they look at rtable reference counts.
201  * 3) Only readers acquire references to rtable entries;
202  *    they do so with atomic increments and with the
203  *    lock held.
204  */
205 
206 struct rt_hash_bucket {
207 	struct rtable	*chain;
208 };
209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
210 	defined(CONFIG_PROVE_LOCKING)
211 /*
212  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
213  * The size of this table is a power of two and depends on the number of CPUs.
214  * (with lockdep we have quite a big spinlock_t, so keep the size down there)
215  */
216 #ifdef CONFIG_LOCKDEP
217 # define RT_HASH_LOCK_SZ	256
218 #else
219 # if NR_CPUS >= 32
220 #  define RT_HASH_LOCK_SZ	4096
221 # elif NR_CPUS >= 16
222 #  define RT_HASH_LOCK_SZ	2048
223 # elif NR_CPUS >= 8
224 #  define RT_HASH_LOCK_SZ	1024
225 # elif NR_CPUS >= 4
226 #  define RT_HASH_LOCK_SZ	512
227 # else
228 #  define RT_HASH_LOCK_SZ	256
229 # endif
230 #endif
231 
232 static spinlock_t	*rt_hash_locks;
233 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
234 
235 static __init void rt_hash_lock_init(void)
236 {
237 	int i;
238 
239 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
240 			GFP_KERNEL);
241 	if (!rt_hash_locks)
242 		panic("IP: failed to allocate rt_hash_locks\n");
243 
244 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
245 		spin_lock_init(&rt_hash_locks[i]);
246 }
247 #else
248 # define rt_hash_lock_addr(slot) NULL
249 
250 static inline void rt_hash_lock_init(void)
251 {
252 }
253 #endif
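/*
 * A minimal sketch of how these bucket locks pair with RCU in the
 * code below: writers take the per-bucket spinlock, while readers only
 * enter an RCU read-side section (the bucket index "i" here is
 * hypothetical):
 *
 *	spin_lock_bh(rt_hash_lock_addr(i));
 *	... unlink or insert entries on rt_hash_table[i].chain ...
 *	spin_unlock_bh(rt_hash_lock_addr(i));
 */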
254 
255 static struct rt_hash_bucket 	*rt_hash_table;
256 static unsigned			rt_hash_mask;
257 static unsigned int		rt_hash_log;
258 static atomic_t			rt_genid;
259 
260 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
261 #define RT_CACHE_STAT_INC(field) \
262 	(__raw_get_cpu_var(rt_cache_stat).field++)
263 
264 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
265 {
266 	return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
267 		& rt_hash_mask;
268 }
269 
270 #define rt_hash(daddr, saddr, idx) \
271 	rt_hash_code((__force u32)(__be32)(daddr),\
272 		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
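/*
 * rt_genid is folded into the hash as the jhash initval, so bumping it
 * (see rt_cache_invalidate()) instantly makes every existing entry
 * miss on lookup; stale entries are then reaped lazily by the GC and
 * expiry paths, which compare rt_genid explicitly.
 */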
273 
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276 	int bucket;
277 	int genid;
278 };
279 
280 static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st)
281 {
282 	struct rtable *r = NULL;
283 
284 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
285 		rcu_read_lock_bh();
286 		r = rcu_dereference(rt_hash_table[st->bucket].chain);
287 		while (r) {
288 			if (r->rt_genid == st->genid)
289 				return r;
290 			r = rcu_dereference(r->u.dst.rt_next);
291 		}
292 		rcu_read_unlock_bh();
293 	}
294 	return r;
295 }
296 
297 static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st, struct rtable *r)
298 {
299 	r = r->u.dst.rt_next;
300 	while (!r) {
301 		rcu_read_unlock_bh();
302 		if (--st->bucket < 0)
303 			break;
304 		rcu_read_lock_bh();
305 		r = rt_hash_table[st->bucket].chain;
306 	}
307 	return rcu_dereference(r);
308 }
309 
310 static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t pos)
311 {
312 	struct rtable *r = rt_cache_get_first(st);
313 
314 	if (r)
315 		while (pos && (r = rt_cache_get_next(st, r))) {
316 			if (r->rt_genid != st->genid)
317 				continue;
318 			--pos;
319 		}
320 	return pos ? NULL : r;
321 }
322 
323 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
324 {
325 	struct rt_cache_iter_state *st = seq->private;
326 
327 	if (*pos)
328 		return rt_cache_get_idx(st, *pos - 1);
329 	st->genid = atomic_read(&rt_genid);
330 	return SEQ_START_TOKEN;
331 }
332 
333 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
334 {
335 	struct rtable *r;
336 	struct rt_cache_iter_state *st = seq->private;
337 
338 	if (v == SEQ_START_TOKEN)
339 		r = rt_cache_get_first(st);
340 	else
341 		r = rt_cache_get_next(st, v);
342 	++*pos;
343 	return r;
344 }
345 
346 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
347 {
348 	if (v && v != SEQ_START_TOKEN)
349 		rcu_read_unlock_bh();
350 }
351 
352 static int rt_cache_seq_show(struct seq_file *seq, void *v)
353 {
354 	if (v == SEQ_START_TOKEN)
355 		seq_printf(seq, "%-127s\n",
356 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
357 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
358 			   "HHUptod\tSpecDst");
359 	else {
360 		struct rtable *r = v;
361 		char temp[256];
362 
363 		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
364 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
365 			r->u.dst.dev ? r->u.dst.dev->name : "*",
366 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
367 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
368 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
369 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
370 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
371 			dst_metric(&r->u.dst, RTAX_WINDOW),
372 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
373 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
374 			r->fl.fl4_tos,
375 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
376 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
377 				       dev_queue_xmit) : 0,
378 			r->rt_spec_dst);
379 		seq_printf(seq, "%-127s\n", temp);
380 	}
381 	return 0;
382 }
383 
384 static const struct seq_operations rt_cache_seq_ops = {
385 	.start  = rt_cache_seq_start,
386 	.next   = rt_cache_seq_next,
387 	.stop   = rt_cache_seq_stop,
388 	.show   = rt_cache_seq_show,
389 };
390 
391 static int rt_cache_seq_open(struct inode *inode, struct file *file)
392 {
393 	return seq_open_private(file, &rt_cache_seq_ops,
394 			sizeof(struct rt_cache_iter_state));
395 }
396 
397 static const struct file_operations rt_cache_seq_fops = {
398 	.owner	 = THIS_MODULE,
399 	.open	 = rt_cache_seq_open,
400 	.read	 = seq_read,
401 	.llseek	 = seq_lseek,
402 	.release = seq_release_private,
403 };
404 
405 
406 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
407 {
408 	int cpu;
409 
410 	if (*pos == 0)
411 		return SEQ_START_TOKEN;
412 
413 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
414 		if (!cpu_possible(cpu))
415 			continue;
416 		*pos = cpu+1;
417 		return &per_cpu(rt_cache_stat, cpu);
418 	}
419 	return NULL;
420 }
421 
422 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
423 {
424 	int cpu;
425 
426 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
427 		if (!cpu_possible(cpu))
428 			continue;
429 		*pos = cpu+1;
430 		return &per_cpu(rt_cache_stat, cpu);
431 	}
432 	return NULL;
433 
434 }
435 
436 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
437 {
438 
439 }
440 
441 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
442 {
443 	struct rt_cache_stat *st = v;
444 
445 	if (v == SEQ_START_TOKEN) {
446 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
447 		return 0;
448 	}
449 
450 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
451 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
452 		   atomic_read(&ipv4_dst_ops.entries),
453 		   st->in_hit,
454 		   st->in_slow_tot,
455 		   st->in_slow_mc,
456 		   st->in_no_route,
457 		   st->in_brd,
458 		   st->in_martian_dst,
459 		   st->in_martian_src,
460 
461 		   st->out_hit,
462 		   st->out_slow_tot,
463 		   st->out_slow_mc,
464 
465 		   st->gc_total,
466 		   st->gc_ignored,
467 		   st->gc_goal_miss,
468 		   st->gc_dst_overflow,
469 		   st->in_hlist_search,
470 		   st->out_hlist_search
471 		);
472 	return 0;
473 }
474 
475 static const struct seq_operations rt_cpu_seq_ops = {
476 	.start  = rt_cpu_seq_start,
477 	.next   = rt_cpu_seq_next,
478 	.stop   = rt_cpu_seq_stop,
479 	.show   = rt_cpu_seq_show,
480 };
481 
482 
483 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
484 {
485 	return seq_open(file, &rt_cpu_seq_ops);
486 }
487 
488 static const struct file_operations rt_cpu_seq_fops = {
489 	.owner	 = THIS_MODULE,
490 	.open	 = rt_cpu_seq_open,
491 	.read	 = seq_read,
492 	.llseek	 = seq_lseek,
493 	.release = seq_release,
494 };
495 
496 #ifdef CONFIG_NET_CLS_ROUTE
497 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
498 			   int length, int *eof, void *data)
499 {
500 	unsigned int i;
501 
502 	if ((offset & 3) || (length & 3))
503 		return -EIO;
504 
505 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
506 		*eof = 1;
507 		return 0;
508 	}
509 
510 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
511 		length = sizeof(struct ip_rt_acct) * 256 - offset;
512 		*eof = 1;
513 	}
514 
515 	offset /= sizeof(u32);
516 
517 	if (length > 0) {
518 		u32 *dst = (u32 *) buffer;
519 
520 		*start = buffer;
521 		memset(dst, 0, length);
522 
523 		for_each_possible_cpu(i) {
524 			unsigned int j;
525 			u32 *src;
526 
527 			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
528 			for (j = 0; j < length/4; j++)
529 				dst[j] += src[j];
530 		}
531 	}
532 	return length;
533 }
534 #endif
535 
536 static __init int ip_rt_proc_init(struct net *net)
537 {
538 	struct proc_dir_entry *pde;
539 
540 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
541 			&rt_cache_seq_fops);
542 	if (!pde)
543 		goto err1;
544 
545 	pde = create_proc_entry("rt_cache", S_IRUGO, net->proc_net_stat);
546 	if (!pde)
547 		goto err2;
548 
549 	pde->proc_fops = &rt_cpu_seq_fops;
550 
551 #ifdef CONFIG_NET_CLS_ROUTE
552 	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
553 			ip_rt_acct_read, NULL);
554 	if (!pde)
555 		goto err3;
556 #endif
557 	return 0;
558 
559 #ifdef CONFIG_NET_CLS_ROUTE
560 err3:
561 	remove_proc_entry("rt_cache", net->proc_net_stat);
562 #endif
563 err2:
564 	remove_proc_entry("rt_cache", net->proc_net);
565 err1:
566 	return -ENOMEM;
567 }
568 #else
569 static inline int ip_rt_proc_init(struct net *net)
570 {
571 	return 0;
572 }
573 #endif /* CONFIG_PROC_FS */
574 
575 static __inline__ void rt_free(struct rtable *rt)
576 {
577 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
578 }
579 
580 static __inline__ void rt_drop(struct rtable *rt)
581 {
582 	ip_rt_put(rt);
583 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
584 }
585 
586 static __inline__ int rt_fast_clean(struct rtable *rth)
587 {
588 	/* Kill broadcast/multicast entries very aggressively if they
589 	   collide in the hash table with more useful entries */
590 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
591 		rth->fl.iif && rth->u.dst.rt_next;
592 }
593 
594 static __inline__ int rt_valuable(struct rtable *rth)
595 {
596 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
597 		rth->u.dst.expires;
598 }
599 
600 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
601 {
602 	unsigned long age;
603 	int ret = 0;
604 
605 	if (atomic_read(&rth->u.dst.__refcnt))
606 		goto out;
607 
608 	ret = 1;
609 	if (rth->u.dst.expires &&
610 	    time_after_eq(jiffies, rth->u.dst.expires))
611 		goto out;
612 
613 	age = jiffies - rth->u.dst.lastuse;
614 	ret = 0;
615 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
616 	    (age <= tmo2 && rt_valuable(rth)))
617 		goto out;
618 	ret = 1;
619 out:	return ret;
620 }
621 
622 /* Bits of score are:
623  * 31: very valuable
624  * 30: not quite useless
625  * 29..0: usage counter
626  */
627 static inline u32 rt_score(struct rtable *rt)
628 {
629 	u32 score = jiffies - rt->u.dst.lastuse;
630 
631 	score = ~score & ~(3<<30);
632 
633 	if (rt_valuable(rt))
634 		score |= (1<<31);
635 
636 	if (!rt->fl.iif ||
637 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
638 		score |= (1<<30);
639 
640 	return score;
641 }
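/*
 * Worked example: an unreferenced output route (fl.iif == 0) last used
 * "age" jiffies ago scores (1 << 30) | (~age & ~(3 << 30)), so fresher
 * entries score higher; rt_intern_hash() evicts the lowest-scoring
 * entry when a chain grows too long.
 */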
642 
643 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
644 {
645 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
646 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
647 		(fl1->mark ^ fl2->mark) |
648 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
649 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
650 		(fl1->oif ^ fl2->oif) |
651 		(fl1->iif ^ fl2->iif)) == 0;
652 }
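/*
 * compare_keys() ORs together XORs of the flow fields instead of a
 * chain of conditionals; the result is zero only if every field
 * matches.  The u16 load at fl4_tos deliberately covers the adjacent
 * scope byte as well, so one comparison handles both.
 */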
653 
654 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
655 {
656 	return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net;
657 }
658 
659 /*
660  * Perform a full scan of the hash table and free all entries.
661  * Can be called from a softirq or a process.
662  * In the latter case, we want to be rescheduled if necessary.
663  */
664 static void rt_do_flush(int process_context)
665 {
666 	unsigned int i;
667 	struct rtable *rth, *next;
668 
669 	for (i = 0; i <= rt_hash_mask; i++) {
670 		if (process_context && need_resched())
671 			cond_resched();
672 		rth = rt_hash_table[i].chain;
673 		if (!rth)
674 			continue;
675 
676 		spin_lock_bh(rt_hash_lock_addr(i));
677 		rth = rt_hash_table[i].chain;
678 		rt_hash_table[i].chain = NULL;
679 		spin_unlock_bh(rt_hash_lock_addr(i));
680 
681 		for (; rth; rth = next) {
682 			next = rth->u.dst.rt_next;
683 			rt_free(rth);
684 		}
685 	}
686 }
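/*
 * Note the two-phase pattern above: each chain is detached while the
 * bucket lock is held, but the rt_free() calls run after the lock has
 * been dropped, keeping the locked section short.
 */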
687 
688 static void rt_check_expire(void)
689 {
690 	static unsigned int rover;
691 	unsigned int i = rover, goal;
692 	struct rtable *rth, **rthp;
693 	u64 mult;
694 
695 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
696 	if (ip_rt_gc_timeout > 1)
697 		do_div(mult, ip_rt_gc_timeout);
698 	goal = (unsigned int)mult;
699 	if (goal > rt_hash_mask)
700 		goal = rt_hash_mask + 1;
701 	for (; goal > 0; goal--) {
702 		unsigned long tmo = ip_rt_gc_timeout;
703 
704 		i = (i + 1) & rt_hash_mask;
705 		rthp = &rt_hash_table[i].chain;
706 
707 		if (need_resched())
708 			cond_resched();
709 
710 		if (*rthp == NULL)
711 			continue;
712 		spin_lock_bh(rt_hash_lock_addr(i));
713 		while ((rth = *rthp) != NULL) {
714 			if (rth->rt_genid != atomic_read(&rt_genid)) {
715 				*rthp = rth->u.dst.rt_next;
716 				rt_free(rth);
717 				continue;
718 			}
719 			if (rth->u.dst.expires) {
720 				/* Entry is expired even if it is in use */
721 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
722 					tmo >>= 1;
723 					rthp = &rth->u.dst.rt_next;
724 					continue;
725 				}
726 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
727 				tmo >>= 1;
728 				rthp = &rth->u.dst.rt_next;
729 				continue;
730 			}
731 
732 			/* Clean up aged-off entries. */
733 			*rthp = rth->u.dst.rt_next;
734 			rt_free(rth);
735 		}
736 		spin_unlock_bh(rt_hash_lock_addr(i));
737 	}
738 	rover = i;
739 }
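/*
 * The "goal" above works out to (buckets * ip_rt_gc_interval) /
 * ip_rt_gc_timeout chains per run, so with one run every
 * ip_rt_gc_interval the whole table is swept roughly once per
 * ip_rt_gc_timeout.
 */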
740 
741 /*
742  * rt_worker_func() is run in process context.
743  * We call rt_check_expire() to scan part of the hash table.
744  */
745 static void rt_worker_func(struct work_struct *work)
746 {
747 	rt_check_expire();
748 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
749 }
750 
751 /*
752  * Perturbation of rt_genid by a small quantity [1..256].
753  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
754  * many times (2^24) without reusing a recent rt_genid value.
755  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
756  */
757 static void rt_cache_invalidate(void)
758 {
759 	unsigned char shuffle;
760 
761 	get_random_bytes(&shuffle, sizeof(shuffle));
762 	atomic_add(shuffle + 1U, &rt_genid);
763 }
764 
765 /*
766  * delay < 0  : invalidate cache (fast: entries will be deleted later)
767  * delay >= 0 : invalidate & flush cache (can be long)
768  */
769 void rt_cache_flush(int delay)
770 {
771 	rt_cache_invalidate();
772 	if (delay >= 0)
773 		rt_do_flush(!in_softirq());
774 }
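/*
 * Example: rt_cache_flush(-1) only bumps rt_genid and lets the old
 * entries age out, while rt_cache_flush(0) additionally walks every
 * bucket via rt_do_flush() before returning.
 */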
775 
776 /*
777  * We change rt_genid and let gc do the cleanup
778  */
779 static void rt_secret_rebuild(unsigned long dummy)
780 {
781 	rt_cache_invalidate();
782 	mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
783 }
784 
785 /*
786    Short description of GC goals.
787 
788    We want to build an algorithm that keeps the routing cache
789    at an equilibrium point, where the number of aged-off entries
790    stays approximately equal to the number of newly generated ones.
791 
792    The current expiration strength is the variable "expire".
793    We try to adjust it dynamically, so that when the network
794    is idle "expire" is large enough to keep enough warm entries,
795    and when load increases it shrinks to limit the cache size.
796  */
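/*
 * Concretely, with the default ip_rt_gc_elasticity of 8 the collector
 * only starts trimming once the cache holds more than
 * 8 << rt_hash_log entries, i.e. more than eight entries per hash
 * bucket on average.
 */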
797 
798 static int rt_garbage_collect(struct dst_ops *ops)
799 {
800 	static unsigned long expire = RT_GC_TIMEOUT;
801 	static unsigned long last_gc;
802 	static int rover;
803 	static int equilibrium;
804 	struct rtable *rth, **rthp;
805 	unsigned long now = jiffies;
806 	int goal;
807 
808 	/*
809 	 * Garbage collection is pretty expensive,
810 	 * do not run it too frequently.
811 	 */
812 
813 	RT_CACHE_STAT_INC(gc_total);
814 
815 	if (now - last_gc < ip_rt_gc_min_interval &&
816 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
817 		RT_CACHE_STAT_INC(gc_ignored);
818 		goto out;
819 	}
820 
821 	/* Calculate the number of entries we want to expire now. */
822 	goal = atomic_read(&ipv4_dst_ops.entries) -
823 		(ip_rt_gc_elasticity << rt_hash_log);
824 	if (goal <= 0) {
825 		if (equilibrium < ipv4_dst_ops.gc_thresh)
826 			equilibrium = ipv4_dst_ops.gc_thresh;
827 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
828 		if (goal > 0) {
829 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
830 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
831 		}
832 	} else {
833 		/* We are in a dangerous area. Try to reduce the cache really
834 		 * aggressively.
835 		 */
836 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
837 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
838 	}
839 
840 	if (now - last_gc >= ip_rt_gc_min_interval)
841 		last_gc = now;
842 
843 	if (goal <= 0) {
844 		equilibrium += goal;
845 		goto work_done;
846 	}
847 
848 	do {
849 		int i, k;
850 
851 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
852 			unsigned long tmo = expire;
853 
854 			k = (k + 1) & rt_hash_mask;
855 			rthp = &rt_hash_table[k].chain;
856 			spin_lock_bh(rt_hash_lock_addr(k));
857 			while ((rth = *rthp) != NULL) {
858 				if (rth->rt_genid == atomic_read(&rt_genid) &&
859 					!rt_may_expire(rth, tmo, expire)) {
860 					tmo >>= 1;
861 					rthp = &rth->u.dst.rt_next;
862 					continue;
863 				}
864 				*rthp = rth->u.dst.rt_next;
865 				rt_free(rth);
866 				goal--;
867 			}
868 			spin_unlock_bh(rt_hash_lock_addr(k));
869 			if (goal <= 0)
870 				break;
871 		}
872 		rover = k;
873 
874 		if (goal <= 0)
875 			goto work_done;
876 
877 		/* The goal was not achieved. We stop the process if:
878 
879 		   - expire has been reduced to zero (otherwise it is halved),
880 		   - the table is not full,
881 		   - we are called from an interrupt,
882 		   - the jiffies check is just a fallback/debug loop breaker;
883 		     we will not spin here for a long time in any case.
884 		 */
885 
886 		RT_CACHE_STAT_INC(gc_goal_miss);
887 
888 		if (expire == 0)
889 			break;
890 
891 		expire >>= 1;
892 #if RT_CACHE_DEBUG >= 2
893 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
894 				atomic_read(&ipv4_dst_ops.entries), goal, i);
895 #endif
896 
897 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
898 			goto out;
899 	} while (!in_softirq() && time_before_eq(jiffies, now));
900 
901 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
902 		goto out;
903 	if (net_ratelimit())
904 		printk(KERN_WARNING "dst cache overflow\n");
905 	RT_CACHE_STAT_INC(gc_dst_overflow);
906 	return 1;
907 
908 work_done:
909 	expire += ip_rt_gc_min_interval;
910 	if (expire > ip_rt_gc_timeout ||
911 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
912 		expire = ip_rt_gc_timeout;
913 #if RT_CACHE_DEBUG >= 2
914 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
915 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
916 #endif
917 out:	return 0;
918 }
919 
920 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
921 {
922 	struct rtable	*rth, **rthp;
923 	unsigned long	now;
924 	struct rtable *cand, **candp;
925 	u32 		min_score;
926 	int		chain_length;
927 	int attempts = !in_softirq();
928 
929 restart:
930 	chain_length = 0;
931 	min_score = ~(u32)0;
932 	cand = NULL;
933 	candp = NULL;
934 	now = jiffies;
935 
936 	rthp = &rt_hash_table[hash].chain;
937 
938 	spin_lock_bh(rt_hash_lock_addr(hash));
939 	while ((rth = *rthp) != NULL) {
940 		if (rth->rt_genid != atomic_read(&rt_genid)) {
941 			*rthp = rth->u.dst.rt_next;
942 			rt_free(rth);
943 			continue;
944 		}
945 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
946 			/* Put it first */
947 			*rthp = rth->u.dst.rt_next;
948 			/*
949 			 * Since lookup is lock-free, the deletion
950 			 * must be visible to another weakly ordered CPU before
951 			 * the insertion at the start of the hash chain.
952 			 */
953 			rcu_assign_pointer(rth->u.dst.rt_next,
954 					   rt_hash_table[hash].chain);
955 			/*
956 			 * Since lookup is lock-free, the update writes
957 			 * must be ordered for consistency on SMP.
958 			 */
959 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
960 
961 			dst_use(&rth->u.dst, now);
962 			spin_unlock_bh(rt_hash_lock_addr(hash));
963 
964 			rt_drop(rt);
965 			*rp = rth;
966 			return 0;
967 		}
968 
969 		if (!atomic_read(&rth->u.dst.__refcnt)) {
970 			u32 score = rt_score(rth);
971 
972 			if (score <= min_score) {
973 				cand = rth;
974 				candp = rthp;
975 				min_score = score;
976 			}
977 		}
978 
979 		chain_length++;
980 
981 		rthp = &rth->u.dst.rt_next;
982 	}
983 
984 	if (cand) {
985 		/* ip_rt_gc_elasticity used to be the average chain length;
986 		 * when exceeded, GC becomes really aggressive.
987 		 *
988 		 * The second limit is less certain. At the moment it allows
989 		 * only 2 entries per bucket. We will see.
990 		 */
991 		if (chain_length > ip_rt_gc_elasticity) {
992 			*candp = cand->u.dst.rt_next;
993 			rt_free(cand);
994 		}
995 	}
996 
997 	/* Try to bind the route to an ARP neighbour only if it is an
998 	   output route or on the unicast forwarding path.
999 	 */
1000 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1001 		int err = arp_bind_neighbour(&rt->u.dst);
1002 		if (err) {
1003 			spin_unlock_bh(rt_hash_lock_addr(hash));
1004 
1005 			if (err != -ENOBUFS) {
1006 				rt_drop(rt);
1007 				return err;
1008 			}
1009 
1010 			/* Neighbour tables are full and nothing
1011 			   can be released. Try to shrink the route cache;
1012 			   it most likely holds some neighbour records.
1013 			 */
1014 			if (attempts-- > 0) {
1015 				int saved_elasticity = ip_rt_gc_elasticity;
1016 				int saved_int = ip_rt_gc_min_interval;
1017 				ip_rt_gc_elasticity	= 1;
1018 				ip_rt_gc_min_interval	= 0;
1019 				rt_garbage_collect(&ipv4_dst_ops);
1020 				ip_rt_gc_min_interval	= saved_int;
1021 				ip_rt_gc_elasticity	= saved_elasticity;
1022 				goto restart;
1023 			}
1024 
1025 			if (net_ratelimit())
1026 				printk(KERN_WARNING "Neighbour table overflow.\n");
1027 			rt_drop(rt);
1028 			return -ENOBUFS;
1029 		}
1030 	}
1031 
1032 	rt->u.dst.rt_next = rt_hash_table[hash].chain;
1033 #if RT_CACHE_DEBUG >= 2
1034 	if (rt->u.dst.rt_next) {
1035 		struct rtable *trt;
1036 		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1037 		       NIPQUAD(rt->rt_dst));
1038 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1039 			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1040 		printk("\n");
1041 	}
1042 #endif
1043 	rt_hash_table[hash].chain = rt;
1044 	spin_unlock_bh(rt_hash_lock_addr(hash));
1045 	*rp = rt;
1046 	return 0;
1047 }
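/*
 * Usage note: rt_intern_hash() returns 0 with *rp set either to the
 * freshly inserted route or, if an equivalent entry raced in first, to
 * that existing entry (the caller's copy is dropped), so callers can
 * rely on *rp either way.
 */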
1048 
1049 void rt_bind_peer(struct rtable *rt, int create)
1050 {
1051 	static DEFINE_SPINLOCK(rt_peer_lock);
1052 	struct inet_peer *peer;
1053 
1054 	peer = inet_getpeer(rt->rt_dst, create);
1055 
1056 	spin_lock_bh(&rt_peer_lock);
1057 	if (rt->peer == NULL) {
1058 		rt->peer = peer;
1059 		peer = NULL;
1060 	}
1061 	spin_unlock_bh(&rt_peer_lock);
1062 	if (peer)
1063 		inet_putpeer(peer);
1064 }
1065 
1066 /*
1067  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1068  * we can still generate some output.
1069  * Random ID selection looks a bit dangerous because we have no chance of
1070  * selecting an ID that is unique within a reasonable period of time.
1071  * But a broken packet identifier may be better than no packet at all.
1072  */
1073 static void ip_select_fb_ident(struct iphdr *iph)
1074 {
1075 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1076 	static u32 ip_fallback_id;
1077 	u32 salt;
1078 
1079 	spin_lock_bh(&ip_fb_id_lock);
1080 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1081 	iph->id = htons(salt & 0xFFFF);
1082 	ip_fallback_id = salt;
1083 	spin_unlock_bh(&ip_fb_id_lock);
1084 }
1085 
1086 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1087 {
1088 	struct rtable *rt = (struct rtable *) dst;
1089 
1090 	if (rt) {
1091 		if (rt->peer == NULL)
1092 			rt_bind_peer(rt, 1);
1093 
1094 		/* If a peer is attached to the destination, it is never detached,
1095 		   so we need not grab a lock to dereference it.
1096 		 */
1097 		if (rt->peer) {
1098 			iph->id = htons(inet_getid(rt->peer, more));
1099 			return;
1100 		}
1101 	} else
1102 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1103 		       __builtin_return_address(0));
1104 
1105 	ip_select_fb_ident(iph);
1106 }
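/*
 * Sketch of the fallback above: without an inet_peer, the IP ID comes
 * from secure_ip_id() keyed on the destination plus the rolling
 * ip_fallback_id, trading per-destination uniqueness for guaranteed
 * forward progress under memory pressure.
 */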
1107 
1108 static void rt_del(unsigned hash, struct rtable *rt)
1109 {
1110 	struct rtable **rthp, *aux;
1111 
1112 	rthp = &rt_hash_table[hash].chain;
1113 	spin_lock_bh(rt_hash_lock_addr(hash));
1114 	ip_rt_put(rt);
1115 	while ((aux = *rthp) != NULL) {
1116 		if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1117 			*rthp = aux->u.dst.rt_next;
1118 			rt_free(aux);
1119 			continue;
1120 		}
1121 		rthp = &aux->u.dst.rt_next;
1122 	}
1123 	spin_unlock_bh(rt_hash_lock_addr(hash));
1124 }
1125 
1126 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1127 		    __be32 saddr, struct net_device *dev)
1128 {
1129 	int i, k;
1130 	struct in_device *in_dev = in_dev_get(dev);
1131 	struct rtable *rth, **rthp;
1132 	__be32  skeys[2] = { saddr, 0 };
1133 	int  ikeys[2] = { dev->ifindex, 0 };
1134 	struct netevent_redirect netevent;
1135 
1136 	if (!in_dev)
1137 		return;
1138 
1139 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1140 	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1141 	    || ipv4_is_zeronet(new_gw))
1142 		goto reject_redirect;
1143 
1144 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1145 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1146 			goto reject_redirect;
1147 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1148 			goto reject_redirect;
1149 	} else {
1150 		if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST)
1151 			goto reject_redirect;
1152 	}
1153 
1154 	for (i = 0; i < 2; i++) {
1155 		for (k = 0; k < 2; k++) {
1156 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1157 
1158 			rthp = &rt_hash_table[hash].chain;
1159 
1160 			rcu_read_lock();
1161 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1162 				struct rtable *rt;
1163 
1164 				if (rth->fl.fl4_dst != daddr ||
1165 				    rth->fl.fl4_src != skeys[i] ||
1166 				    rth->fl.oif != ikeys[k] ||
1167 				    rth->fl.iif != 0 ||
1168 				    rth->rt_genid != atomic_read(&rt_genid)) {
1169 					rthp = &rth->u.dst.rt_next;
1170 					continue;
1171 				}
1172 
1173 				if (rth->rt_dst != daddr ||
1174 				    rth->rt_src != saddr ||
1175 				    rth->u.dst.error ||
1176 				    rth->rt_gateway != old_gw ||
1177 				    rth->u.dst.dev != dev)
1178 					break;
1179 
1180 				dst_hold(&rth->u.dst);
1181 				rcu_read_unlock();
1182 
1183 				rt = dst_alloc(&ipv4_dst_ops);
1184 				if (rt == NULL) {
1185 					ip_rt_put(rth);
1186 					in_dev_put(in_dev);
1187 					return;
1188 				}
1189 
1190 				/* Copy all the information. */
1191 				*rt = *rth;
1192 				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1193 				rt->u.dst.__use		= 1;
1194 				atomic_set(&rt->u.dst.__refcnt, 1);
1195 				rt->u.dst.child		= NULL;
1196 				if (rt->u.dst.dev)
1197 					dev_hold(rt->u.dst.dev);
1198 				if (rt->idev)
1199 					in_dev_hold(rt->idev);
1200 				rt->u.dst.obsolete	= 0;
1201 				rt->u.dst.lastuse	= jiffies;
1202 				rt->u.dst.path		= &rt->u.dst;
1203 				rt->u.dst.neighbour	= NULL;
1204 				rt->u.dst.hh		= NULL;
1205 				rt->u.dst.xfrm		= NULL;
1206 				rt->rt_genid		= atomic_read(&rt_genid);
1207 				rt->rt_flags		|= RTCF_REDIRECTED;
1208 
1209 				/* Gateway is different ... */
1210 				rt->rt_gateway		= new_gw;
1211 
1212 				/* Redirect received -> path was valid */
1213 				dst_confirm(&rth->u.dst);
1214 
1215 				if (rt->peer)
1216 					atomic_inc(&rt->peer->refcnt);
1217 
1218 				if (arp_bind_neighbour(&rt->u.dst) ||
1219 				    !(rt->u.dst.neighbour->nud_state &
1220 					    NUD_VALID)) {
1221 					if (rt->u.dst.neighbour)
1222 						neigh_event_send(rt->u.dst.neighbour, NULL);
1223 					ip_rt_put(rth);
1224 					rt_drop(rt);
1225 					goto do_next;
1226 				}
1227 
1228 				netevent.old = &rth->u.dst;
1229 				netevent.new = &rt->u.dst;
1230 				call_netevent_notifiers(NETEVENT_REDIRECT,
1231 							&netevent);
1232 
1233 				rt_del(hash, rth);
1234 				if (!rt_intern_hash(hash, rt, &rt))
1235 					ip_rt_put(rt);
1236 				goto do_next;
1237 			}
1238 			rcu_read_unlock();
1239 		do_next:
1240 			;
1241 		}
1242 	}
1243 	in_dev_put(in_dev);
1244 	return;
1245 
1246 reject_redirect:
1247 #ifdef CONFIG_IP_ROUTE_VERBOSE
1248 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1249 		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1250 			"%u.%u.%u.%u ignored.\n"
1251 			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1252 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1253 		       NIPQUAD(saddr), NIPQUAD(daddr));
1254 #endif
1255 	in_dev_put(in_dev);
1256 }
1257 
1258 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1259 {
1260 	struct rtable *rt = (struct rtable*)dst;
1261 	struct dst_entry *ret = dst;
1262 
1263 	if (rt) {
1264 		if (dst->obsolete) {
1265 			ip_rt_put(rt);
1266 			ret = NULL;
1267 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1268 			   rt->u.dst.expires) {
1269 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1270 						rt->fl.oif);
1271 #if RT_CACHE_DEBUG >= 1
1272 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1273 					  "%u.%u.%u.%u/%02x dropped\n",
1274 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1275 #endif
1276 			rt_del(hash, rt);
1277 			ret = NULL;
1278 		}
1279 	}
1280 	return ret;
1281 }
1282 
1283 /*
1284  * Algorithm:
1285  *	1. The first ip_rt_redirect_number redirects are sent
1286  *	   with exponential backoff, then we stop sending them at all,
1287  *	   assuming that the host ignores our redirects.
1288  *	2. If we did not see packets requiring redirects
1289  *	   during ip_rt_redirect_silence, we assume that the host
1290  *	   forgot the redirected route and we start sending redirects again.
1291  *
1292  * This algorithm is much cheaper and more intelligent than dumb load limiting
1293  * in icmp.c.
1294  *
1295  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1296  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1297  */
1298 
1299 void ip_rt_send_redirect(struct sk_buff *skb)
1300 {
1301 	struct rtable *rt = (struct rtable*)skb->dst;
1302 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1303 
1304 	if (!in_dev)
1305 		return;
1306 
1307 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1308 		goto out;
1309 
1310 	/* No redirected packets during ip_rt_redirect_silence;
1311 	 * reset the algorithm.
1312 	 */
1313 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1314 		rt->u.dst.rate_tokens = 0;
1315 
1316 	/* Too many ignored redirects; do not send anything and
1317 	 * set u.dst.rate_last to the last seen redirected packet.
1318 	 */
1319 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1320 		rt->u.dst.rate_last = jiffies;
1321 		goto out;
1322 	}
1323 
1324 	/* Check for load limit; set rate_last to the latest sent
1325 	 * redirect.
1326 	 */
1327 	if (rt->u.dst.rate_tokens == 0 ||
1328 	    time_after(jiffies,
1329 		       (rt->u.dst.rate_last +
1330 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1331 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1332 		rt->u.dst.rate_last = jiffies;
1333 		++rt->u.dst.rate_tokens;
1334 #ifdef CONFIG_IP_ROUTE_VERBOSE
1335 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1336 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1337 		    net_ratelimit())
1338 			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1339 				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1340 				NIPQUAD(rt->rt_src), rt->rt_iif,
1341 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1342 #endif
1343 	}
1344 out:
1345 	in_dev_put(in_dev);
1346 }
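/*
 * The limiting above is exponential backoff: after the Nth unanswered
 * redirect, the next one is sent no sooner than
 * ip_rt_redirect_load << N jiffies later, and once
 * ip_rt_redirect_number (9 by default) is reached we stay silent until
 * ip_rt_redirect_silence (roughly 20 seconds) of quiet has passed.
 */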
1347 
1348 static int ip_error(struct sk_buff *skb)
1349 {
1350 	struct rtable *rt = (struct rtable*)skb->dst;
1351 	unsigned long now;
1352 	int code;
1353 
1354 	switch (rt->u.dst.error) {
1355 		case EINVAL:
1356 		default:
1357 			goto out;
1358 		case EHOSTUNREACH:
1359 			code = ICMP_HOST_UNREACH;
1360 			break;
1361 		case ENETUNREACH:
1362 			code = ICMP_NET_UNREACH;
1363 			IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1364 			break;
1365 		case EACCES:
1366 			code = ICMP_PKT_FILTERED;
1367 			break;
1368 	}
1369 
1370 	now = jiffies;
1371 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1372 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1373 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1374 	rt->u.dst.rate_last = now;
1375 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1376 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1377 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1378 	}
1379 
1380 out:	kfree_skb(skb);
1381 	return 0;
1382 }
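/*
 * ip_error() implements a classic token bucket: tokens accrue with
 * elapsed jiffies up to ip_rt_error_burst (5 * HZ), and each ICMP
 * destination-unreachable costs ip_rt_error_cost (HZ), i.e. a
 * sustained rate of one error per second with bursts of up to five.
 */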
1383 
1384 /*
1385  *	The last two values are not from the RFC but
1386  *	are needed for AMPRnet AX.25 paths.
1387  */
1388 
1389 static const unsigned short mtu_plateau[] =
1390 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1391 
1392 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1393 {
1394 	int i;
1395 
1396 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1397 		if (old_mtu > mtu_plateau[i])
1398 			return mtu_plateau[i];
1399 	return 68;
1400 }
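/*
 * Example: for a router that reports a zero next-hop MTU on a
 * 1500-byte packet, guess_mtu(1500) walks the plateau table and
 * returns 1492, the largest plateau strictly below 1500.
 */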
1401 
1402 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1403 				 unsigned short new_mtu)
1404 {
1405 	int i;
1406 	unsigned short old_mtu = ntohs(iph->tot_len);
1407 	struct rtable *rth;
1408 	__be32  skeys[2] = { iph->saddr, 0, };
1409 	__be32  daddr = iph->daddr;
1410 	unsigned short est_mtu = 0;
1411 
1412 	if (ipv4_config.no_pmtu_disc)
1413 		return 0;
1414 
1415 	for (i = 0; i < 2; i++) {
1416 		unsigned hash = rt_hash(daddr, skeys[i], 0);
1417 
1418 		rcu_read_lock();
1419 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1420 		     rth = rcu_dereference(rth->u.dst.rt_next)) {
1421 			if (rth->fl.fl4_dst == daddr &&
1422 			    rth->fl.fl4_src == skeys[i] &&
1423 			    rth->rt_dst  == daddr &&
1424 			    rth->rt_src  == iph->saddr &&
1425 			    rth->fl.iif == 0 &&
1426 			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
1427 			    rth->u.dst.dev->nd_net == net &&
1428 			    rth->rt_genid == atomic_read(&rt_genid)) {
1429 				unsigned short mtu = new_mtu;
1430 
1431 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1432 
1433 					/* BSD 4.2 compatibility hack :-( */
1434 					if (mtu == 0 &&
1435 					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1436 					    old_mtu >= 68 + (iph->ihl << 2))
1437 						old_mtu -= iph->ihl << 2;
1438 
1439 					mtu = guess_mtu(old_mtu);
1440 				}
1441 				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1442 					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1443 						dst_confirm(&rth->u.dst);
1444 						if (mtu < ip_rt_min_pmtu) {
1445 							mtu = ip_rt_min_pmtu;
1446 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1447 								(1 << RTAX_MTU);
1448 						}
1449 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1450 						dst_set_expires(&rth->u.dst,
1451 							ip_rt_mtu_expires);
1452 					}
1453 					est_mtu = mtu;
1454 				}
1455 			}
1456 		}
1457 		rcu_read_unlock();
1458 	}
1459 	return est_mtu ? : new_mtu;
1460 }
1461 
1462 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1463 {
1464 	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1465 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1466 		if (mtu < ip_rt_min_pmtu) {
1467 			mtu = ip_rt_min_pmtu;
1468 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1469 		}
1470 		dst->metrics[RTAX_MTU-1] = mtu;
1471 		dst_set_expires(dst, ip_rt_mtu_expires);
1472 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1473 	}
1474 }
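/*
 * Note the clamp above: a learned PMTU below ip_rt_min_pmtu (552 by
 * default) is raised back to the minimum and the MTU metric is locked,
 * which helps blunt forged "fragmentation needed" ICMP messages that
 * advertise absurdly small MTUs.
 */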
1475 
1476 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1477 {
1478 	return NULL;
1479 }
1480 
1481 static void ipv4_dst_destroy(struct dst_entry *dst)
1482 {
1483 	struct rtable *rt = (struct rtable *) dst;
1484 	struct inet_peer *peer = rt->peer;
1485 	struct in_device *idev = rt->idev;
1486 
1487 	if (peer) {
1488 		rt->peer = NULL;
1489 		inet_putpeer(peer);
1490 	}
1491 
1492 	if (idev) {
1493 		rt->idev = NULL;
1494 		in_dev_put(idev);
1495 	}
1496 }
1497 
1498 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1499 			    int how)
1500 {
1501 	struct rtable *rt = (struct rtable *) dst;
1502 	struct in_device *idev = rt->idev;
1503 	if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
1504 		struct in_device *loopback_idev =
1505 			in_dev_get(dev->nd_net->loopback_dev);
1506 		if (loopback_idev) {
1507 			rt->idev = loopback_idev;
1508 			in_dev_put(idev);
1509 		}
1510 	}
1511 }
1512 
1513 static void ipv4_link_failure(struct sk_buff *skb)
1514 {
1515 	struct rtable *rt;
1516 
1517 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1518 
1519 	rt = (struct rtable *) skb->dst;
1520 	if (rt)
1521 		dst_set_expires(&rt->u.dst, 0);
1522 }
1523 
1524 static int ip_rt_bug(struct sk_buff *skb)
1525 {
1526 	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1527 		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1528 		skb->dev ? skb->dev->name : "?");
1529 	kfree_skb(skb);
1530 	return 0;
1531 }
1532 
1533 /*
1534    We do not cache the source address of the outgoing interface,
1535    because it is used only by the IP RR, TS and SRR options,
1536    so it is out of the fast path.
1537 
1538    BTW, remember: "addr" is allowed to be unaligned
1539    in IP options!
1540  */
1541 
1542 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1543 {
1544 	__be32 src;
1545 	struct fib_result res;
1546 
1547 	if (rt->fl.iif == 0)
1548 		src = rt->rt_src;
1549 	else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
1550 		src = FIB_RES_PREFSRC(res);
1551 		fib_res_put(&res);
1552 	} else
1553 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1554 					RT_SCOPE_UNIVERSE);
1555 	memcpy(addr, &src, 4);
1556 }
1557 
1558 #ifdef CONFIG_NET_CLS_ROUTE
1559 static void set_class_tag(struct rtable *rt, u32 tag)
1560 {
1561 	if (!(rt->u.dst.tclassid & 0xFFFF))
1562 		rt->u.dst.tclassid |= tag & 0xFFFF;
1563 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1564 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1565 }
1566 #endif
1567 
1568 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1569 {
1570 	struct fib_info *fi = res->fi;
1571 
1572 	if (fi) {
1573 		if (FIB_RES_GW(*res) &&
1574 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1575 			rt->rt_gateway = FIB_RES_GW(*res);
1576 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1577 		       sizeof(rt->u.dst.metrics));
1578 		if (fi->fib_mtu == 0) {
1579 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1580 			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1581 			    rt->rt_gateway != rt->rt_dst &&
1582 			    rt->u.dst.dev->mtu > 576)
1583 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1584 		}
1585 #ifdef CONFIG_NET_CLS_ROUTE
1586 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1587 #endif
1588 	} else
1589 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1590 
1591 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1592 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1593 	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1594 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1595 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1596 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1597 				       ip_rt_min_advmss);
1598 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1599 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1600 
1601 #ifdef CONFIG_NET_CLS_ROUTE
1602 #ifdef CONFIG_IP_MULTIPLE_TABLES
1603 	set_class_tag(rt, fib_rules_tclass(res));
1604 #endif
1605 	set_class_tag(rt, itag);
1606 #endif
1607 	rt->rt_type = res->type;
1608 }
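/*
 * After rt_set_nexthop() the dst metrics are fully populated: any
 * metric the FIB entry left at zero gets a default derived from the
 * device or sysctls (MTU from the device, hop limit from
 * ip_default_ttl, advertised MSS from the MTU minus 40 bytes of
 * IP + TCP headers).
 */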
1609 
1610 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1611 				u8 tos, struct net_device *dev, int our)
1612 {
1613 	unsigned hash;
1614 	struct rtable *rth;
1615 	__be32 spec_dst;
1616 	struct in_device *in_dev = in_dev_get(dev);
1617 	u32 itag = 0;
1618 
1619 	/* Primary sanity checks. */
1620 
1621 	if (in_dev == NULL)
1622 		return -EINVAL;
1623 
1624 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1625 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1626 		goto e_inval;
1627 
1628 	if (ipv4_is_zeronet(saddr)) {
1629 		if (!ipv4_is_local_multicast(daddr))
1630 			goto e_inval;
1631 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1632 	} else if (fib_validate_source(saddr, 0, tos, 0,
1633 					dev, &spec_dst, &itag) < 0)
1634 		goto e_inval;
1635 
1636 	rth = dst_alloc(&ipv4_dst_ops);
1637 	if (!rth)
1638 		goto e_nobufs;
1639 
1640 	rth->u.dst.output= ip_rt_bug;
1641 
1642 	atomic_set(&rth->u.dst.__refcnt, 1);
1643 	rth->u.dst.flags= DST_HOST;
1644 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1645 		rth->u.dst.flags |= DST_NOPOLICY;
1646 	rth->fl.fl4_dst	= daddr;
1647 	rth->rt_dst	= daddr;
1648 	rth->fl.fl4_tos	= tos;
1649 	rth->fl.mark    = skb->mark;
1650 	rth->fl.fl4_src	= saddr;
1651 	rth->rt_src	= saddr;
1652 #ifdef CONFIG_NET_CLS_ROUTE
1653 	rth->u.dst.tclassid = itag;
1654 #endif
1655 	rth->rt_iif	=
1656 	rth->fl.iif	= dev->ifindex;
1657 	rth->u.dst.dev	= init_net.loopback_dev;
1658 	dev_hold(rth->u.dst.dev);
1659 	rth->idev	= in_dev_get(rth->u.dst.dev);
1660 	rth->fl.oif	= 0;
1661 	rth->rt_gateway	= daddr;
1662 	rth->rt_spec_dst= spec_dst;
1663 	rth->rt_genid	= atomic_read(&rt_genid);
1664 	rth->rt_flags	= RTCF_MULTICAST;
1665 	rth->rt_type	= RTN_MULTICAST;
1666 	if (our) {
1667 		rth->u.dst.input= ip_local_deliver;
1668 		rth->rt_flags |= RTCF_LOCAL;
1669 	}
1670 
1671 #ifdef CONFIG_IP_MROUTE
1672 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1673 		rth->u.dst.input = ip_mr_input;
1674 #endif
1675 	RT_CACHE_STAT_INC(in_slow_mc);
1676 
1677 	in_dev_put(in_dev);
1678 	hash = rt_hash(daddr, saddr, dev->ifindex);
1679 	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1680 
1681 e_nobufs:
1682 	in_dev_put(in_dev);
1683 	return -ENOBUFS;
1684 
1685 e_inval:
1686 	in_dev_put(in_dev);
1687 	return -EINVAL;
1688 }
1689 
1690 
1691 static void ip_handle_martian_source(struct net_device *dev,
1692 				     struct in_device *in_dev,
1693 				     struct sk_buff *skb,
1694 				     __be32 daddr,
1695 				     __be32 saddr)
1696 {
1697 	RT_CACHE_STAT_INC(in_martian_src);
1698 #ifdef CONFIG_IP_ROUTE_VERBOSE
1699 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1700 		/*
1701 		 *	RFC1812 recommendation: if the source is martian,
1702 		 *	the only hint is the MAC header.
1703 		 */
1704 		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1705 			"%u.%u.%u.%u, on dev %s\n",
1706 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1707 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1708 			int i;
1709 			const unsigned char *p = skb_mac_header(skb);
1710 			printk(KERN_WARNING "ll header: ");
1711 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1712 				printk("%02x", *p);
1713 				if (i < (dev->hard_header_len - 1))
1714 					printk(":");
1715 			}
1716 			printk("\n");
1717 		}
1718 	}
1719 #endif
1720 }
1721 
1722 static inline int __mkroute_input(struct sk_buff *skb,
1723 				  struct fib_result* res,
1724 				  struct in_device *in_dev,
1725 				  __be32 daddr, __be32 saddr, u32 tos,
1726 				  struct rtable **result)
1727 {
1728 
1729 	struct rtable *rth;
1730 	int err;
1731 	struct in_device *out_dev;
1732 	unsigned flags = 0;
1733 	__be32 spec_dst;
1734 	u32 itag;
1735 
1736 	/* get a working reference to the output device */
1737 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1738 	if (out_dev == NULL) {
1739 		if (net_ratelimit())
1740 			printk(KERN_CRIT "Bug in ip_route_input" \
1741 			       "_slow(). Please, report\n");
1742 		return -EINVAL;
1743 	}
1744 
1745 
1746 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1747 				  in_dev->dev, &spec_dst, &itag);
1748 	if (err < 0) {
1749 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1750 					 saddr);
1751 
1752 		err = -EINVAL;
1753 		goto cleanup;
1754 	}
1755 
1756 	if (err)
1757 		flags |= RTCF_DIRECTSRC;
1758 
1759 	if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1760 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1761 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1762 		flags |= RTCF_DOREDIRECT;
1763 
1764 	if (skb->protocol != htons(ETH_P_IP)) {
1765 		/* Not IP (i.e. ARP). Do not create a route if it is
1766 		 * invalid for proxy ARP. DNAT routes are always valid.
1767 		 */
1768 		if (out_dev == in_dev) {
1769 			err = -EINVAL;
1770 			goto cleanup;
1771 		}
1772 	}
1773 
1774 
1775 	rth = dst_alloc(&ipv4_dst_ops);
1776 	if (!rth) {
1777 		err = -ENOBUFS;
1778 		goto cleanup;
1779 	}
1780 
1781 	atomic_set(&rth->u.dst.__refcnt, 1);
1782 	rth->u.dst.flags= DST_HOST;
1783 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1784 		rth->u.dst.flags |= DST_NOPOLICY;
1785 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1786 		rth->u.dst.flags |= DST_NOXFRM;
1787 	rth->fl.fl4_dst	= daddr;
1788 	rth->rt_dst	= daddr;
1789 	rth->fl.fl4_tos	= tos;
1790 	rth->fl.mark    = skb->mark;
1791 	rth->fl.fl4_src	= saddr;
1792 	rth->rt_src	= saddr;
1793 	rth->rt_gateway	= daddr;
1794 	rth->rt_iif 	=
1795 		rth->fl.iif	= in_dev->dev->ifindex;
1796 	rth->u.dst.dev	= (out_dev)->dev;
1797 	dev_hold(rth->u.dst.dev);
1798 	rth->idev	= in_dev_get(rth->u.dst.dev);
1799 	rth->fl.oif 	= 0;
1800 	rth->rt_spec_dst= spec_dst;
1801 
1802 	rth->u.dst.input = ip_forward;
1803 	rth->u.dst.output = ip_output;
1804 	rth->rt_genid = atomic_read(&rt_genid);
1805 
1806 	rt_set_nexthop(rth, res, itag);
1807 
1808 	rth->rt_flags = flags;
1809 
1810 	*result = rth;
1811 	err = 0;
1812  cleanup:
1813 	/* release the working reference to the output device */
1814 	in_dev_put(out_dev);
1815 	return err;
1816 }
1817 
1818 static inline int ip_mkroute_input(struct sk_buff *skb,
1819 				   struct fib_result* res,
1820 				   const struct flowi *fl,
1821 				   struct in_device *in_dev,
1822 				   __be32 daddr, __be32 saddr, u32 tos)
1823 {
1824 	struct rtable* rth = NULL;
1825 	int err;
1826 	unsigned hash;
1827 
1828 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1829 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1830 		fib_select_multipath(fl, res);
1831 #endif
1832 
1833 	/* create a routing cache entry */
1834 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1835 	if (err)
1836 		return err;
1837 
1838 	/* put it into the cache */
1839 	hash = rt_hash(daddr, saddr, fl->iif);
1840 	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1841 }
1842 
1843 /*
1844  *	NOTE. We drop all packets that have local source
1845  *	addresses, because every properly looped-back packet
1846  *	must already have the correct destination attached by the output routine.
1847  *
1848  *	This approach solves two big problems:
1849  *	1. Non-simplex devices are handled properly.
1850  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1851  */
1852 
1853 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1854 			       u8 tos, struct net_device *dev)
1855 {
1856 	struct fib_result res;
1857 	struct in_device *in_dev = in_dev_get(dev);
1858 	struct flowi fl = { .nl_u = { .ip4_u =
1859 				      { .daddr = daddr,
1860 					.saddr = saddr,
1861 					.tos = tos,
1862 					.scope = RT_SCOPE_UNIVERSE,
1863 				      } },
1864 			    .mark = skb->mark,
1865 			    .iif = dev->ifindex };
1866 	unsigned	flags = 0;
1867 	u32		itag = 0;
1868 	struct rtable * rth;
1869 	unsigned	hash;
1870 	__be32		spec_dst;
1871 	int		err = -EINVAL;
1872 	int		free_res = 0;
1873 	struct net    * net = dev->nd_net;
1874 
1875 	/* IP on this device is disabled. */
1876 
1877 	if (!in_dev)
1878 		goto out;
1879 
1880 	/* Check for the most weird martians, which cannot be detected
1881 	   by fib_lookup.
1882 	 */
1883 
1884 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1885 	    ipv4_is_loopback(saddr))
1886 		goto martian_source;
1887 
1888 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1889 		goto brd_input;
1890 
1891 	/* Accept zero addresses only for limited broadcast;
1892 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
1893 	 */
1894 	if (ipv4_is_zeronet(saddr))
1895 		goto martian_source;
1896 
1897 	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1898 	    ipv4_is_loopback(daddr))
1899 		goto martian_destination;
1900 
1901 	/*
1902 	 *	Now we are ready to route the packet.
1903 	 */
1904 	if ((err = fib_lookup(net, &fl, &res)) != 0) {
1905 		if (!IN_DEV_FORWARD(in_dev))
1906 			goto e_hostunreach;
1907 		goto no_route;
1908 	}
1909 	free_res = 1;
1910 
1911 	RT_CACHE_STAT_INC(in_slow_tot);
1912 
1913 	if (res.type == RTN_BROADCAST)
1914 		goto brd_input;
1915 
1916 	if (res.type == RTN_LOCAL) {
1917 		int result;
1918 		result = fib_validate_source(saddr, daddr, tos,
1919 					     net->loopback_dev->ifindex,
1920 					     dev, &spec_dst, &itag);
1921 		if (result < 0)
1922 			goto martian_source;
1923 		if (result)
1924 			flags |= RTCF_DIRECTSRC;
1925 		spec_dst = daddr;
1926 		goto local_input;
1927 	}
1928 
1929 	if (!IN_DEV_FORWARD(in_dev))
1930 		goto e_hostunreach;
1931 	if (res.type != RTN_UNICAST)
1932 		goto martian_destination;
1933 
1934 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1935 done:
1936 	in_dev_put(in_dev);
1937 	if (free_res)
1938 		fib_res_put(&res);
1939 out:	return err;
1940 
1941 brd_input:
1942 	if (skb->protocol != htons(ETH_P_IP))
1943 		goto e_inval;
1944 
1945 	if (ipv4_is_zeronet(saddr))
1946 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1947 	else {
1948 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1949 					  &itag);
1950 		if (err < 0)
1951 			goto martian_source;
1952 		if (err)
1953 			flags |= RTCF_DIRECTSRC;
1954 	}
1955 	flags |= RTCF_BROADCAST;
1956 	res.type = RTN_BROADCAST;
1957 	RT_CACHE_STAT_INC(in_brd);
1958 
1959 local_input:
1960 	rth = dst_alloc(&ipv4_dst_ops);
1961 	if (!rth)
1962 		goto e_nobufs;
1963 
1964 	rth->u.dst.output= ip_rt_bug;
1965 	rth->rt_genid = atomic_read(&rt_genid);
1966 
1967 	atomic_set(&rth->u.dst.__refcnt, 1);
1968 	rth->u.dst.flags= DST_HOST;
1969 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1970 		rth->u.dst.flags |= DST_NOPOLICY;
1971 	rth->fl.fl4_dst	= daddr;
1972 	rth->rt_dst	= daddr;
1973 	rth->fl.fl4_tos	= tos;
1974 	rth->fl.mark    = skb->mark;
1975 	rth->fl.fl4_src	= saddr;
1976 	rth->rt_src	= saddr;
1977 #ifdef CONFIG_NET_CLS_ROUTE
1978 	rth->u.dst.tclassid = itag;
1979 #endif
1980 	rth->rt_iif	=
1981 	rth->fl.iif	= dev->ifindex;
1982 	rth->u.dst.dev	= net->loopback_dev;
1983 	dev_hold(rth->u.dst.dev);
1984 	rth->idev	= in_dev_get(rth->u.dst.dev);
1985 	rth->rt_gateway	= daddr;
1986 	rth->rt_spec_dst= spec_dst;
1987 	rth->u.dst.input= ip_local_deliver;
1988 	rth->rt_flags 	= flags|RTCF_LOCAL;
1989 	if (res.type == RTN_UNREACHABLE) {
1990 		rth->u.dst.input= ip_error;
1991 		rth->u.dst.error= -err;
1992 		rth->rt_flags 	&= ~RTCF_LOCAL;
1993 	}
1994 	rth->rt_type	= res.type;
1995 	hash = rt_hash(daddr, saddr, fl.iif);
1996 	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1997 	goto done;
1998 
1999 no_route:
2000 	RT_CACHE_STAT_INC(in_no_route);
2001 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2002 	res.type = RTN_UNREACHABLE;
2003 	if (err == -ESRCH)
2004 		err = -ENETUNREACH;
2005 	goto local_input;
2006 
2007 	/*
2008 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2009 	 */
2010 martian_destination:
2011 	RT_CACHE_STAT_INC(in_martian_dst);
2012 #ifdef CONFIG_IP_ROUTE_VERBOSE
2013 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2014 		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2015 			"%u.%u.%u.%u, dev %s\n",
2016 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2017 #endif
2018 
2019 e_hostunreach:
2020 	err = -EHOSTUNREACH;
2021 	goto done;
2022 
2023 e_inval:
2024 	err = -EINVAL;
2025 	goto done;
2026 
2027 e_nobufs:
2028 	err = -ENOBUFS;
2029 	goto done;
2030 
2031 martian_source:
2032 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2033 	goto e_inval;
2034 }
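/* Verdict summary for the slow path above, in outline: unicast forwards get
 * a cache entry via ip_mkroute_input(); broadcasts and local addresses are
 * interned with input = ip_local_deliver; unroutable destinations fall
 * through to local_input with RTN_UNREACHABLE so that ip_error() answers
 * them; martian sources are logged and dropped with -EINVAL.
 */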
2035 
2036 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2037 		   u8 tos, struct net_device *dev)
2038 {
2039 	struct rtable * rth;
2040 	unsigned	hash;
2041 	int iif = dev->ifindex;
2042 	struct net *net;
2043 
2044 	net = skb->dev->nd_net;
2045 	tos &= IPTOS_RT_MASK;
2046 	hash = rt_hash(daddr, saddr, iif);
2047 
2048 	rcu_read_lock();
2049 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2050 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
2051 		if (rth->fl.fl4_dst == daddr &&
2052 		    rth->fl.fl4_src == saddr &&
2053 		    rth->fl.iif == iif &&
2054 		    rth->fl.oif == 0 &&
2055 		    rth->fl.mark == skb->mark &&
2056 		    rth->fl.fl4_tos == tos &&
2057 		    rth->u.dst.dev->nd_net == net &&
2058 		    rth->rt_genid == atomic_read(&rt_genid)) {
2059 			dst_use(&rth->u.dst, jiffies);
2060 			RT_CACHE_STAT_INC(in_hit);
2061 			rcu_read_unlock();
2062 			skb->dst = (struct dst_entry*)rth;
2063 			return 0;
2064 		}
2065 		RT_CACHE_STAT_INC(in_hlist_search);
2066 	}
2067 	rcu_read_unlock();
2068 
2069 	/* Multicast recognition logic is moved from the route cache to here.
2070 	   The problem was that too many Ethernet cards have broken/missing
2071 	   hardware multicast filters :-( As a result, a host on a multicast
2072 	   network acquires a lot of useless route cache entries, e.g. for
2073 	   SDR messages from all over the world. Now we try to get rid of them.
2074 	   Really, provided the software IP multicast filter is organized
2075 	   reasonably (at least, hashed), it does not result in a slowdown
2076 	   compared with route cache reject entries.
2077 	   Note that multicast routers are not affected, because a
2078 	   route cache entry is created eventually.
2079 	 */
2080 	if (ipv4_is_multicast(daddr)) {
2081 		struct in_device *in_dev;
2082 
2083 		rcu_read_lock();
2084 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2085 			int our = ip_check_mc(in_dev, daddr, saddr,
2086 				ip_hdr(skb)->protocol);
2087 			if (our
2088 #ifdef CONFIG_IP_MROUTE
2089 			    || (!ipv4_is_local_multicast(daddr) &&
2090 				IN_DEV_MFORWARD(in_dev))
2091 #endif
2092 			    ) {
2093 				rcu_read_unlock();
2094 				return ip_route_input_mc(skb, daddr, saddr,
2095 							 tos, dev, our);
2096 			}
2097 		}
2098 		rcu_read_unlock();
2099 		return -EINVAL;
2100 	}
2101 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2102 }
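/* The usual caller is the receive path; a minimal sketch of how
 * ip_rcv_finish() in ip_input.c hands every inbound packet to this
 * function (paraphrased, not a verbatim copy of that file):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *
 *	if (skb->dst == NULL &&
 *	    ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev))
 *		goto drop;
 *
 * On success skb->dst carries the rtable whose ->input method
 * (ip_forward, ip_local_deliver, ...) then dispatches the packet.
 */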
2103 
2104 static inline int __mkroute_output(struct rtable **result,
2105 				   struct fib_result* res,
2106 				   const struct flowi *fl,
2107 				   const struct flowi *oldflp,
2108 				   struct net_device *dev_out,
2109 				   unsigned flags)
2110 {
2111 	struct rtable *rth;
2112 	struct in_device *in_dev;
2113 	u32 tos = RT_FL_TOS(oldflp);
2114 	int err = 0;
2115 
2116 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2117 		return -EINVAL;
2118 
2119 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2120 		res->type = RTN_BROADCAST;
2121 	else if (ipv4_is_multicast(fl->fl4_dst))
2122 		res->type = RTN_MULTICAST;
2123 	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2124 		return -EINVAL;
2125 
2126 	if (dev_out->flags & IFF_LOOPBACK)
2127 		flags |= RTCF_LOCAL;
2128 
2129 	/* get a working reference to the inet device */
2130 	in_dev = in_dev_get(dev_out);
2131 	if (!in_dev)
2132 		return -EINVAL;
2133 
2134 	if (res->type == RTN_BROADCAST) {
2135 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2136 		if (res->fi) {
2137 			fib_info_put(res->fi);
2138 			res->fi = NULL;
2139 		}
2140 	} else if (res->type == RTN_MULTICAST) {
2141 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2142 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2143 				 oldflp->proto))
2144 			flags &= ~RTCF_LOCAL;
2145 		/* If a multicast route does not exist, use
2146 		   the default one, but do not gateway in this case.
2147 		   Yes, it is a hack.
2148 		 */
2149 		if (res->fi && res->prefixlen < 4) {
2150 			fib_info_put(res->fi);
2151 			res->fi = NULL;
2152 		}
2153 	}
2154 
2155 
2156 	rth = dst_alloc(&ipv4_dst_ops);
2157 	if (!rth) {
2158 		err = -ENOBUFS;
2159 		goto cleanup;
2160 	}
2161 
2162 	atomic_set(&rth->u.dst.__refcnt, 1);
2163 	rth->u.dst.flags= DST_HOST;
2164 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2165 		rth->u.dst.flags |= DST_NOXFRM;
2166 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2167 		rth->u.dst.flags |= DST_NOPOLICY;
2168 
2169 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2170 	rth->fl.fl4_tos	= tos;
2171 	rth->fl.fl4_src	= oldflp->fl4_src;
2172 	rth->fl.oif	= oldflp->oif;
2173 	rth->fl.mark    = oldflp->mark;
2174 	rth->rt_dst	= fl->fl4_dst;
2175 	rth->rt_src	= fl->fl4_src;
2176 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2177 	/* get references to the devices that are to be held by the routing
2178 	   cache entry */
2179 	rth->u.dst.dev	= dev_out;
2180 	dev_hold(dev_out);
2181 	rth->idev	= in_dev_get(dev_out);
2182 	rth->rt_gateway = fl->fl4_dst;
2183 	rth->rt_spec_dst= fl->fl4_src;
2184 
2185 	rth->u.dst.output=ip_output;
2186 	rth->rt_genid = atomic_read(&rt_genid);
2187 
2188 	RT_CACHE_STAT_INC(out_slow_tot);
2189 
2190 	if (flags & RTCF_LOCAL) {
2191 		rth->u.dst.input = ip_local_deliver;
2192 		rth->rt_spec_dst = fl->fl4_dst;
2193 	}
2194 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2195 		rth->rt_spec_dst = fl->fl4_src;
2196 		if (flags & RTCF_LOCAL &&
2197 		    !(dev_out->flags & IFF_LOOPBACK)) {
2198 			rth->u.dst.output = ip_mc_output;
2199 			RT_CACHE_STAT_INC(out_slow_mc);
2200 		}
2201 #ifdef CONFIG_IP_MROUTE
2202 		if (res->type == RTN_MULTICAST) {
2203 			if (IN_DEV_MFORWARD(in_dev) &&
2204 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2205 				rth->u.dst.input = ip_mr_input;
2206 				rth->u.dst.output = ip_mc_output;
2207 			}
2208 		}
2209 #endif
2210 	}
2211 
2212 	rt_set_nexthop(rth, res, 0);
2213 
2214 	rth->rt_flags = flags;
2215 
2216 	*result = rth;
2217  cleanup:
2218 	/* release the working reference to the inet device */
2219 	in_dev_put(in_dev);
2220 
2221 	return err;
2222 }
2223 
2224 static inline int ip_mkroute_output(struct rtable **rp,
2225 				    struct fib_result* res,
2226 				    const struct flowi *fl,
2227 				    const struct flowi *oldflp,
2228 				    struct net_device *dev_out,
2229 				    unsigned flags)
2230 {
2231 	struct rtable *rth = NULL;
2232 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2233 	unsigned hash;
2234 	if (err == 0) {
2235 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2236 		err = rt_intern_hash(hash, rth, rp);
2237 	}
2238 
2239 	return err;
2240 }
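/* Note that the output cache key is hashed from oldflp, the caller's
 * original flow key, not from the resolved fl: a later lookup with the
 * same user-supplied (dst, src, oif) must hit this entry even though
 * resolution may have filled in a different source address or device.
 */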
2241 
2242 /*
2243  * Major route resolver routine.
2244  */
2245 
2246 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2247 				const struct flowi *oldflp)
2248 {
2249 	u32 tos	= RT_FL_TOS(oldflp);
2250 	struct flowi fl = { .nl_u = { .ip4_u =
2251 				      { .daddr = oldflp->fl4_dst,
2252 					.saddr = oldflp->fl4_src,
2253 					.tos = tos & IPTOS_RT_MASK,
2254 					.scope = ((tos & RTO_ONLINK) ?
2255 						  RT_SCOPE_LINK :
2256 						  RT_SCOPE_UNIVERSE),
2257 				      } },
2258 			    .mark = oldflp->mark,
2259 			    .iif = net->loopback_dev->ifindex,
2260 			    .oif = oldflp->oif };
2261 	struct fib_result res;
2262 	unsigned flags = 0;
2263 	struct net_device *dev_out = NULL;
2264 	int free_res = 0;
2265 	int err;
2266 
2267 
2268 	res.fi		= NULL;
2269 #ifdef CONFIG_IP_MULTIPLE_TABLES
2270 	res.r		= NULL;
2271 #endif
2272 
2273 	if (oldflp->fl4_src) {
2274 		err = -EINVAL;
2275 		if (ipv4_is_multicast(oldflp->fl4_src) ||
2276 		    ipv4_is_lbcast(oldflp->fl4_src) ||
2277 		    ipv4_is_zeronet(oldflp->fl4_src))
2278 			goto out;
2279 
2280 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2281 		dev_out = ip_dev_find(net, oldflp->fl4_src);
2282 		if (dev_out == NULL)
2283 			goto out;
2284 
2285 		/* I removed the check for oif == dev_out->oif here.
2286 		   It was wrong for two reasons:
2287 		   1. ip_dev_find(net, saddr) can return the wrong iface, if
2288 		      saddr is assigned to multiple interfaces.
2289 		   2. Moreover, we are allowed to send packets with the saddr
2290 		      of another iface. --ANK
2291 		 */
2292 
2293 		if (oldflp->oif == 0
2294 		    && (ipv4_is_multicast(oldflp->fl4_dst) ||
2295 			oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2296 			/* Special hack: the user can direct multicasts
2297 			   and limited broadcast via the necessary interface
2298 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2299 			   This hack is not just for fun, it allows
2300 			   vic, vat and friends to work.
2301 			   They bind the socket to loopback, set the ttl to zero
2302 			   and expect that it will work.
2303 			   From the viewpoint of the routing cache they are broken,
2304 			   because we are not allowed to build a multicast path
2305 			   with a loopback source addr (the routing cache
2306 			   cannot know that the ttl is zero, so that the packet
2307 			   will not leave this host and the route is valid).
2308 			   Luckily, this hack is a good workaround.
2309 			 */
2310 
2311 			fl.oif = dev_out->ifindex;
2312 			goto make_route;
2313 		}
2314 		if (dev_out)
2315 			dev_put(dev_out);
2316 		dev_out = NULL;
2317 	}
2318 
2319 
2320 	if (oldflp->oif) {
2321 		dev_out = dev_get_by_index(net, oldflp->oif);
2322 		err = -ENODEV;
2323 		if (dev_out == NULL)
2324 			goto out;
2325 
2326 		/* RACE: Check return value of inet_select_addr instead. */
2327 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2328 			dev_put(dev_out);
2329 			goto out;	/* Wrong error code */
2330 		}
2331 
2332 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2333 		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2334 			if (!fl.fl4_src)
2335 				fl.fl4_src = inet_select_addr(dev_out, 0,
2336 							      RT_SCOPE_LINK);
2337 			goto make_route;
2338 		}
2339 		if (!fl.fl4_src) {
2340 			if (ipv4_is_multicast(oldflp->fl4_dst))
2341 				fl.fl4_src = inet_select_addr(dev_out, 0,
2342 							      fl.fl4_scope);
2343 			else if (!oldflp->fl4_dst)
2344 				fl.fl4_src = inet_select_addr(dev_out, 0,
2345 							      RT_SCOPE_HOST);
2346 		}
2347 	}
2348 
2349 	if (!fl.fl4_dst) {
2350 		fl.fl4_dst = fl.fl4_src;
2351 		if (!fl.fl4_dst)
2352 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2353 		if (dev_out)
2354 			dev_put(dev_out);
2355 		dev_out = net->loopback_dev;
2356 		dev_hold(dev_out);
2357 		fl.oif = net->loopback_dev->ifindex;
2358 		res.type = RTN_LOCAL;
2359 		flags |= RTCF_LOCAL;
2360 		goto make_route;
2361 	}
2362 
2363 	if (fib_lookup(net, &fl, &res)) {
2364 		res.fi = NULL;
2365 		if (oldflp->oif) {
2366 			/* Apparently, the routing tables are wrong. Assume
2367 			   that the destination is on-link.
2368 
2369 			   WHY? DW.
2370 			   Because we are allowed to send to an iface
2371 			   even if it has NO routes and NO assigned
2372 			   addresses. When oif is specified, the routing
2373 			   tables are looked up with only one purpose:
2374 			   to catch whether the destination is gatewayed
2375 			   rather than direct. Moreover, if MSG_DONTROUTE is
2376 			   set, we send the packet, ignoring both the routing
2377 			   tables and the ifaddr state. --ANK
2378 
2379 
2380 			   We could do this even when oif is unknown,
2381 			   as IPv6 likely does, but we do not.
2382 			 */
2383 
2384 			if (fl.fl4_src == 0)
2385 				fl.fl4_src = inet_select_addr(dev_out, 0,
2386 							      RT_SCOPE_LINK);
2387 			res.type = RTN_UNICAST;
2388 			goto make_route;
2389 		}
2390 		if (dev_out)
2391 			dev_put(dev_out);
2392 		err = -ENETUNREACH;
2393 		goto out;
2394 	}
2395 	free_res = 1;
2396 
2397 	if (res.type == RTN_LOCAL) {
2398 		if (!fl.fl4_src)
2399 			fl.fl4_src = fl.fl4_dst;
2400 		if (dev_out)
2401 			dev_put(dev_out);
2402 		dev_out = net->loopback_dev;
2403 		dev_hold(dev_out);
2404 		fl.oif = dev_out->ifindex;
2405 		if (res.fi)
2406 			fib_info_put(res.fi);
2407 		res.fi = NULL;
2408 		flags |= RTCF_LOCAL;
2409 		goto make_route;
2410 	}
2411 
2412 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2413 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2414 		fib_select_multipath(&fl, &res);
2415 	else
2416 #endif
2417 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2418 		fib_select_default(net, &fl, &res);
2419 
2420 	if (!fl.fl4_src)
2421 		fl.fl4_src = FIB_RES_PREFSRC(res);
2422 
2423 	if (dev_out)
2424 		dev_put(dev_out);
2425 	dev_out = FIB_RES_DEV(res);
2426 	dev_hold(dev_out);
2427 	fl.oif = dev_out->ifindex;
2428 
2429 
2430 make_route:
2431 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2432 
2433 
2434 	if (free_res)
2435 		fib_res_put(&res);
2436 	if (dev_out)
2437 		dev_put(dev_out);
2438 out:	return err;
2439 }
2440 
2441 int __ip_route_output_key(struct net *net, struct rtable **rp,
2442 			  const struct flowi *flp)
2443 {
2444 	unsigned hash;
2445 	struct rtable *rth;
2446 
2447 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2448 
2449 	rcu_read_lock_bh();
2450 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2451 		rth = rcu_dereference(rth->u.dst.rt_next)) {
2452 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2453 		    rth->fl.fl4_src == flp->fl4_src &&
2454 		    rth->fl.iif == 0 &&
2455 		    rth->fl.oif == flp->oif &&
2456 		    rth->fl.mark == flp->mark &&
2457 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2458 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2459 		    rth->u.dst.dev->nd_net == net &&
2460 		    rth->rt_genid == atomic_read(&rt_genid)) {
2461 			dst_use(&rth->u.dst, jiffies);
2462 			RT_CACHE_STAT_INC(out_hit);
2463 			rcu_read_unlock_bh();
2464 			*rp = rth;
2465 			return 0;
2466 		}
2467 		RT_CACHE_STAT_INC(out_hlist_search);
2468 	}
2469 	rcu_read_unlock_bh();
2470 
2471 	return ip_route_output_slow(net, rp, flp);
2472 }
2473 
2474 EXPORT_SYMBOL_GPL(__ip_route_output_key);
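/* The tos test in the lookup above is a masked comparison; since
 *
 *	((a ^ b) & mask) == 0   <=>   (a & mask) == (b & mask)
 *
 * the entry matches iff the cached and requested keys agree on the bits
 * that matter for routing, IPTOS_RT_MASK, plus the RTO_ONLINK pseudo-flag.
 */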
2475 
2476 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2477 {
2478 }
2479 
2480 static struct dst_ops ipv4_dst_blackhole_ops = {
2481 	.family			=	AF_INET,
2482 	.protocol		=	__constant_htons(ETH_P_IP),
2483 	.destroy		=	ipv4_dst_destroy,
2484 	.check			=	ipv4_dst_check,
2485 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2486 	.entry_size		=	sizeof(struct rtable),
2487 	.entries		=	ATOMIC_INIT(0),
2488 };
2489 
2490 
2491 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2492 {
2493 	struct rtable *ort = *rp;
2494 	struct rtable *rt = (struct rtable *)
2495 		dst_alloc(&ipv4_dst_blackhole_ops);
2496 
2497 	if (rt) {
2498 		struct dst_entry *new = &rt->u.dst;
2499 
2500 		atomic_set(&new->__refcnt, 1);
2501 		new->__use = 1;
2502 		new->input = dst_discard;
2503 		new->output = dst_discard;
2504 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2505 
2506 		new->dev = ort->u.dst.dev;
2507 		if (new->dev)
2508 			dev_hold(new->dev);
2509 
2510 		rt->fl = ort->fl;
2511 
2512 		rt->idev = ort->idev;
2513 		if (rt->idev)
2514 			in_dev_hold(rt->idev);
2515 		rt->rt_genid = atomic_read(&rt_genid);
2516 		rt->rt_flags = ort->rt_flags;
2517 		rt->rt_type = ort->rt_type;
2518 		rt->rt_dst = ort->rt_dst;
2519 		rt->rt_src = ort->rt_src;
2520 		rt->rt_iif = ort->rt_iif;
2521 		rt->rt_gateway = ort->rt_gateway;
2522 		rt->rt_spec_dst = ort->rt_spec_dst;
2523 		rt->peer = ort->peer;
2524 		if (rt->peer)
2525 			atomic_inc(&rt->peer->refcnt);
2526 
2527 		dst_free(new);
2528 	}
2529 
2530 	dst_release(&(*rp)->u.dst);
2531 	*rp = rt;
2532 	return (rt ? 0 : -ENOMEM);
2533 }
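/* The blackhole dst keeps the identity (flow key, gateway, metrics) of the
 * route it replaces, but both ->input and ->output are dst_discard and
 * ->update_pmtu is a no-op, so packets queued on it are silently dropped.
 * It is used below when a non-blocking xfrm lookup returns -EREMOTE, i.e.
 * the needed security associations are not resolved yet.
 */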
2534 
2535 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2536 			 struct sock *sk, int flags)
2537 {
2538 	int err;
2539 
2540 	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2541 		return err;
2542 
2543 	if (flp->proto) {
2544 		if (!flp->fl4_src)
2545 			flp->fl4_src = (*rp)->rt_src;
2546 		if (!flp->fl4_dst)
2547 			flp->fl4_dst = (*rp)->rt_dst;
2548 		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2549 				    flags ? XFRM_LOOKUP_WAIT : 0);
2550 		if (err == -EREMOTE)
2551 			err = ipv4_dst_blackhole(rp, flp, sk);
2552 
2553 		return err;
2554 	}
2555 
2556 	return 0;
2557 }
2558 
2559 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2560 
2561 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2562 {
2563 	return ip_route_output_flow(net, rp, flp, NULL, 0);
2564 }
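/* Typical usage from elsewhere in the stack, in outline (daddr and tos
 * stand for whatever the caller wants to reach):
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr,
 *						 .tos = RT_TOS(tos) } } };
 *	struct rtable *rt;
 *	int err = ip_route_output_key(&init_net, &rt, &fl);
 *
 *	if (err)
 *		return err;
 *	... use rt->rt_src, rt->u.dst ...
 *	ip_rt_put(rt);
 */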
2565 
2566 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2567 			int nowait, unsigned int flags)
2568 {
2569 	struct rtable *rt = (struct rtable*)skb->dst;
2570 	struct rtmsg *r;
2571 	struct nlmsghdr *nlh;
2572 	long expires;
2573 	u32 id = 0, ts = 0, tsage = 0, error;
2574 
2575 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2576 	if (nlh == NULL)
2577 		return -EMSGSIZE;
2578 
2579 	r = nlmsg_data(nlh);
2580 	r->rtm_family	 = AF_INET;
2581 	r->rtm_dst_len	= 32;
2582 	r->rtm_src_len	= 0;
2583 	r->rtm_tos	= rt->fl.fl4_tos;
2584 	r->rtm_table	= RT_TABLE_MAIN;
2585 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2586 	r->rtm_type	= rt->rt_type;
2587 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2588 	r->rtm_protocol = RTPROT_UNSPEC;
2589 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2590 	if (rt->rt_flags & RTCF_NOTIFY)
2591 		r->rtm_flags |= RTM_F_NOTIFY;
2592 
2593 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2594 
2595 	if (rt->fl.fl4_src) {
2596 		r->rtm_src_len = 32;
2597 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2598 	}
2599 	if (rt->u.dst.dev)
2600 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2601 #ifdef CONFIG_NET_CLS_ROUTE
2602 	if (rt->u.dst.tclassid)
2603 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2604 #endif
2605 	if (rt->fl.iif)
2606 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2607 	else if (rt->rt_src != rt->fl.fl4_src)
2608 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2609 
2610 	if (rt->rt_dst != rt->rt_gateway)
2611 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2612 
2613 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2614 		goto nla_put_failure;
2615 
2616 	error = rt->u.dst.error;
2617 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2618 	if (rt->peer) {
2619 		id = rt->peer->ip_id_count;
2620 		if (rt->peer->tcp_ts_stamp) {
2621 			ts = rt->peer->tcp_ts;
2622 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2623 		}
2624 	}
2625 
2626 	if (rt->fl.iif) {
2627 #ifdef CONFIG_IP_MROUTE
2628 		__be32 dst = rt->rt_dst;
2629 
2630 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2631 		    IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2632 			int err = ipmr_get_route(skb, r, nowait);
2633 			if (err <= 0) {
2634 				if (!nowait) {
2635 					if (err == 0)
2636 						return 0;
2637 					goto nla_put_failure;
2638 				} else {
2639 					if (err == -EMSGSIZE)
2640 						goto nla_put_failure;
2641 					error = err;
2642 				}
2643 			}
2644 		} else
2645 #endif
2646 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2647 	}
2648 
2649 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2650 			       expires, error) < 0)
2651 		goto nla_put_failure;
2652 
2653 	return nlmsg_end(skb, nlh);
2654 
2655 nla_put_failure:
2656 	nlmsg_cancel(skb, nlh);
2657 	return -EMSGSIZE;
2658 }
2659 
2660 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2661 {
2662 	struct net *net = in_skb->sk->sk_net;
2663 	struct rtmsg *rtm;
2664 	struct nlattr *tb[RTA_MAX+1];
2665 	struct rtable *rt = NULL;
2666 	__be32 dst = 0;
2667 	__be32 src = 0;
2668 	u32 iif;
2669 	int err;
2670 	struct sk_buff *skb;
2671 
2672 	if (net != &init_net)
2673 		return -EINVAL;
2674 
2675 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2676 	if (err < 0)
2677 		goto errout;
2678 
2679 	rtm = nlmsg_data(nlh);
2680 
2681 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2682 	if (skb == NULL) {
2683 		err = -ENOBUFS;
2684 		goto errout;
2685 	}
2686 
2687 	/* Reserve room for dummy headers; this skb can pass
2688 	   through a good chunk of the routing engine.
2689 	 */
2690 	skb_reset_mac_header(skb);
2691 	skb_reset_network_header(skb);
2692 
2693 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2694 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2695 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2696 
2697 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2698 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2699 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2700 
2701 	if (iif) {
2702 		struct net_device *dev;
2703 
2704 		dev = __dev_get_by_index(&init_net, iif);
2705 		if (dev == NULL) {
2706 			err = -ENODEV;
2707 			goto errout_free;
2708 		}
2709 
2710 		skb->protocol	= htons(ETH_P_IP);
2711 		skb->dev	= dev;
2712 		local_bh_disable();
2713 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2714 		local_bh_enable();
2715 
2716 		rt = (struct rtable*) skb->dst;
2717 		if (err == 0 && rt->u.dst.error)
2718 			err = -rt->u.dst.error;
2719 	} else {
2720 		struct flowi fl = {
2721 			.nl_u = {
2722 				.ip4_u = {
2723 					.daddr = dst,
2724 					.saddr = src,
2725 					.tos = rtm->rtm_tos,
2726 				},
2727 			},
2728 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2729 		};
2730 		err = ip_route_output_key(&init_net, &rt, &fl);
2731 	}
2732 
2733 	if (err)
2734 		goto errout_free;
2735 
2736 	skb->dst = &rt->u.dst;
2737 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2738 		rt->rt_flags |= RTCF_NOTIFY;
2739 
2740 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2741 				RTM_NEWROUTE, 0, 0);
2742 	if (err <= 0)
2743 		goto errout_free;
2744 
2745 	err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2746 errout:
2747 	return err;
2748 
2749 errout_free:
2750 	kfree_skb(skb);
2751 	goto errout;
2752 }
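/* This is the handler behind RTM_GETROUTE requests, i.e. what
 * "ip route get <addr>" sends: the route is resolved either through the
 * real input path (when an iif is given) or through the output path, and
 * the answer is a single RTM_NEWROUTE message built by rt_fill_info().
 */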
2753 
2754 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2755 {
2756 	struct rtable *rt;
2757 	int h, s_h;
2758 	int idx, s_idx;
2759 
2760 	s_h = cb->args[0];
2761 	if (s_h < 0)
2762 		s_h = 0;
2763 	s_idx = idx = cb->args[1];
2764 	for (h = s_h; h <= rt_hash_mask; h++) {
2765 		rcu_read_lock_bh();
2766 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2767 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2768 			if (idx < s_idx)
2769 				continue;
2770 			if (rt->rt_genid != atomic_read(&rt_genid))
2771 				continue;
2772 			skb->dst = dst_clone(&rt->u.dst);
2773 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2774 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2775 					 1, NLM_F_MULTI) <= 0) {
2776 				dst_release(xchg(&skb->dst, NULL));
2777 				rcu_read_unlock_bh();
2778 				goto done;
2779 			}
2780 			dst_release(xchg(&skb->dst, NULL));
2781 		}
2782 		rcu_read_unlock_bh();
2783 		s_idx = 0;
2784 	}
2785 
2786 done:
2787 	cb->args[0] = h;
2788 	cb->args[1] = idx;
2789 	return skb->len;
2790 }
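/* A standard netlink dump callback: cb->args[0] and cb->args[1] record the
 * hash bucket and chain position, so a dump that fills its skb can resume
 * from the same spot on the next invocation.
 */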
2791 
2792 void ip_rt_multicast_event(struct in_device *in_dev)
2793 {
2794 	rt_cache_flush(0);
2795 }
2796 
2797 #ifdef CONFIG_SYSCTL
2798 static int flush_delay;
2799 
2800 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2801 					struct file *filp, void __user *buffer,
2802 					size_t *lenp, loff_t *ppos)
2803 {
2804 	if (write) {
2805 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2806 		rt_cache_flush(flush_delay);
2807 		return 0;
2808 	}
2809 
2810 	return -EINVAL;
2811 }
2812 
2813 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2814 						int __user *name,
2815 						int nlen,
2816 						void __user *oldval,
2817 						size_t __user *oldlenp,
2818 						void __user *newval,
2819 						size_t newlen)
2820 {
2821 	int delay;
2822 	if (newlen != sizeof(int))
2823 		return -EINVAL;
2824 	if (get_user(delay, (int __user *)newval))
2825 		return -EFAULT;
2826 	rt_cache_flush(delay);
2827 	return 0;
2828 }
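/* Both handlers funnel into rt_cache_flush(), so the cache can be flushed
 * from userspace with, for example:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * where the written integer is the flush delay (0 meaning "now").
 */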
2829 
2830 ctl_table ipv4_route_table[] = {
2831 	{
2832 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2833 		.procname	= "flush",
2834 		.data		= &flush_delay,
2835 		.maxlen		= sizeof(int),
2836 		.mode		= 0200,
2837 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2838 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2839 	},
2840 	{
2841 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2842 		.procname	= "gc_thresh",
2843 		.data		= &ipv4_dst_ops.gc_thresh,
2844 		.maxlen		= sizeof(int),
2845 		.mode		= 0644,
2846 		.proc_handler	= &proc_dointvec,
2847 	},
2848 	{
2849 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2850 		.procname	= "max_size",
2851 		.data		= &ip_rt_max_size,
2852 		.maxlen		= sizeof(int),
2853 		.mode		= 0644,
2854 		.proc_handler	= &proc_dointvec,
2855 	},
2856 	{
2857 		/*  Deprecated. Use gc_min_interval_ms */
2858 
2859 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2860 		.procname	= "gc_min_interval",
2861 		.data		= &ip_rt_gc_min_interval,
2862 		.maxlen		= sizeof(int),
2863 		.mode		= 0644,
2864 		.proc_handler	= &proc_dointvec_jiffies,
2865 		.strategy	= &sysctl_jiffies,
2866 	},
2867 	{
2868 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2869 		.procname	= "gc_min_interval_ms",
2870 		.data		= &ip_rt_gc_min_interval,
2871 		.maxlen		= sizeof(int),
2872 		.mode		= 0644,
2873 		.proc_handler	= &proc_dointvec_ms_jiffies,
2874 		.strategy	= &sysctl_ms_jiffies,
2875 	},
2876 	{
2877 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2878 		.procname	= "gc_timeout",
2879 		.data		= &ip_rt_gc_timeout,
2880 		.maxlen		= sizeof(int),
2881 		.mode		= 0644,
2882 		.proc_handler	= &proc_dointvec_jiffies,
2883 		.strategy	= &sysctl_jiffies,
2884 	},
2885 	{
2886 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2887 		.procname	= "gc_interval",
2888 		.data		= &ip_rt_gc_interval,
2889 		.maxlen		= sizeof(int),
2890 		.mode		= 0644,
2891 		.proc_handler	= &proc_dointvec_jiffies,
2892 		.strategy	= &sysctl_jiffies,
2893 	},
2894 	{
2895 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2896 		.procname	= "redirect_load",
2897 		.data		= &ip_rt_redirect_load,
2898 		.maxlen		= sizeof(int),
2899 		.mode		= 0644,
2900 		.proc_handler	= &proc_dointvec,
2901 	},
2902 	{
2903 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2904 		.procname	= "redirect_number",
2905 		.data		= &ip_rt_redirect_number,
2906 		.maxlen		= sizeof(int),
2907 		.mode		= 0644,
2908 		.proc_handler	= &proc_dointvec,
2909 	},
2910 	{
2911 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2912 		.procname	= "redirect_silence",
2913 		.data		= &ip_rt_redirect_silence,
2914 		.maxlen		= sizeof(int),
2915 		.mode		= 0644,
2916 		.proc_handler	= &proc_dointvec,
2917 	},
2918 	{
2919 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
2920 		.procname	= "error_cost",
2921 		.data		= &ip_rt_error_cost,
2922 		.maxlen		= sizeof(int),
2923 		.mode		= 0644,
2924 		.proc_handler	= &proc_dointvec,
2925 	},
2926 	{
2927 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
2928 		.procname	= "error_burst",
2929 		.data		= &ip_rt_error_burst,
2930 		.maxlen		= sizeof(int),
2931 		.mode		= 0644,
2932 		.proc_handler	= &proc_dointvec,
2933 	},
2934 	{
2935 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
2936 		.procname	= "gc_elasticity",
2937 		.data		= &ip_rt_gc_elasticity,
2938 		.maxlen		= sizeof(int),
2939 		.mode		= 0644,
2940 		.proc_handler	= &proc_dointvec,
2941 	},
2942 	{
2943 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
2944 		.procname	= "mtu_expires",
2945 		.data		= &ip_rt_mtu_expires,
2946 		.maxlen		= sizeof(int),
2947 		.mode		= 0644,
2948 		.proc_handler	= &proc_dointvec_jiffies,
2949 		.strategy	= &sysctl_jiffies,
2950 	},
2951 	{
2952 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
2953 		.procname	= "min_pmtu",
2954 		.data		= &ip_rt_min_pmtu,
2955 		.maxlen		= sizeof(int),
2956 		.mode		= 0644,
2957 		.proc_handler	= &proc_dointvec,
2958 	},
2959 	{
2960 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
2961 		.procname	= "min_adv_mss",
2962 		.data		= &ip_rt_min_advmss,
2963 		.maxlen		= sizeof(int),
2964 		.mode		= 0644,
2965 		.proc_handler	= &proc_dointvec,
2966 	},
2967 	{
2968 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
2969 		.procname	= "secret_interval",
2970 		.data		= &ip_rt_secret_interval,
2971 		.maxlen		= sizeof(int),
2972 		.mode		= 0644,
2973 		.proc_handler	= &proc_dointvec_jiffies,
2974 		.strategy	= &sysctl_jiffies,
2975 	},
2976 	{ .ctl_name = 0 }
2977 };
2978 #endif
2979 
2980 #ifdef CONFIG_NET_CLS_ROUTE
2981 struct ip_rt_acct *ip_rt_acct __read_mostly;
2982 #endif /* CONFIG_NET_CLS_ROUTE */
2983 
2984 static __initdata unsigned long rhash_entries;
2985 static int __init set_rhash_entries(char *str)
2986 {
2987 	if (!str)
2988 		return 0;
2989 	rhash_entries = simple_strtoul(str, &str, 0);
2990 	return 1;
2991 }
2992 __setup("rhash_entries=", set_rhash_entries);
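/* The hash table size can thus be forced on the kernel command line, e.g.
 * booting with "rhash_entries=65536" (an illustrative value), instead of
 * letting alloc_large_system_hash() below size it from available memory.
 */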
2993 
2994 int __init ip_rt_init(void)
2995 {
2996 	int rc = 0;
2997 
2998 	atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
2999 			     (jiffies ^ (jiffies >> 7))));
3000 
3001 #ifdef CONFIG_NET_CLS_ROUTE
3002 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3003 	if (!ip_rt_acct)
3004 		panic("IP: failed to allocate ip_rt_acct\n");
3005 #endif
3006 
3007 	ipv4_dst_ops.kmem_cachep =
3008 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3009 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3010 
3011 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3012 
3013 	rt_hash_table = (struct rt_hash_bucket *)
3014 		alloc_large_system_hash("IP route cache",
3015 					sizeof(struct rt_hash_bucket),
3016 					rhash_entries,
3017 					(num_physpages >= 128 * 1024) ?
3018 					15 : 17,
3019 					0,
3020 					&rt_hash_log,
3021 					&rt_hash_mask,
3022 					0);
3023 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3024 	rt_hash_lock_init();
3025 
3026 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3027 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
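	/* In outline: garbage collection starts at an average of one entry
	 * per hash bucket (gc_thresh = number of buckets), and the hard cap
	 * allows an average chain length of 16 entries per bucket.
	 */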
3028 
3029 	devinet_init();
3030 	ip_fib_init();
3031 
3032 	setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3033 
3034 	/* All the timers started at system startup tend
3035 	   to synchronize. Perturb them a bit.
3036 	 */
3037 	schedule_delayed_work(&expires_work,
3038 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3039 
3040 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3041 		ip_rt_secret_interval;
3042 	add_timer(&rt_secret_timer);
3043 
3044 	if (ip_rt_proc_init(&init_net))
3045 		printk(KERN_ERR "Unable to create route proc files\n");
3046 #ifdef CONFIG_XFRM
3047 	xfrm_init();
3048 	xfrm4_init();
3049 #endif
3050 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3051 
3052 	return rc;
3053 }
3054 
3055 EXPORT_SYMBOL(__ip_select_ident);
3056 EXPORT_SYMBOL(ip_route_input);
3057 EXPORT_SYMBOL(ip_route_output_key);
3058