xref: /linux/net/ipv4/route.c (revision 606d099cdd1080bbb50ea50dc52d98252f8f10a1)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *		Alan Cox	:	Verify area fixes.
18  *		Alan Cox	:	cli() protects routing changes
19  *		Rui Oliveira	:	ICMP routing table updates
20  *		(rco@di.uminho.pt)	Routing table insertion and update
21  *		Linus Torvalds	:	Rewrote bits to be sensible
22  *		Alan Cox	:	Added BSD route gw semantics
23  *		Alan Cox	:	Super /proc >4K
24  *		Alan Cox	:	MTU in route table
25  *		Alan Cox	: 	MSS actually. Also added the window
26  *					clamper.
27  *		Sam Lantinga	:	Fixed route matching in rt_del()
28  *		Alan Cox	:	Routing cache support.
29  *		Alan Cox	:	Removed compatibility cruft.
30  *		Alan Cox	:	RTF_REJECT support.
31  *		Alan Cox	:	TCP irtt support.
32  *		Jonathan Naylor	:	Added Metric support.
33  *	Miquel van Smoorenburg	:	BSD API fixes.
34  *	Miquel van Smoorenburg	:	Metrics.
35  *		Alan Cox	:	Use __u32 properly
36  *		Alan Cox	:	Aligned routing errors more closely with BSD
37  *					our system is still very different.
38  *		Alan Cox	:	Faster /proc handling
39  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40  *					routing caches and better behaviour.
41  *
42  *		Olaf Erb	:	irtt wasn't being copied right.
43  *		Bjorn Ekwall	:	Kerneld route support.
44  *		Alan Cox	:	Multicast fixed (I hope)
45  * 		Pavel Krauz	:	Limited broadcast fixed
46  *		Mike McLagan	:	Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
48  *					route.c and rewritten from scratch.
49  *		Andi Kleen	:	Load-limit warning messages.
50  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54  *		Marc Boucher	:	routing by fwmark
55  *	Robert Olsson		:	Added rt_cache statistics
56  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
57  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
58  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
59  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
60  *
61  *		This program is free software; you can redistribute it and/or
62  *		modify it under the terms of the GNU General Public License
63  *		as published by the Free Software Foundation; either version
64  *		2 of the License, or (at your option) any later version.
65  */
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/mm.h>
75 #include <linux/bootmem.h>
76 #include <linux/string.h>
77 #include <linux/socket.h>
78 #include <linux/sockios.h>
79 #include <linux/errno.h>
80 #include <linux/in.h>
81 #include <linux/inet.h>
82 #include <linux/netdevice.h>
83 #include <linux/proc_fs.h>
84 #include <linux/init.h>
85 #include <linux/skbuff.h>
86 #include <linux/rtnetlink.h>
87 #include <linux/inetdevice.h>
88 #include <linux/igmp.h>
89 #include <linux/pkt_sched.h>
90 #include <linux/mroute.h>
91 #include <linux/netfilter_ipv4.h>
92 #include <linux/random.h>
93 #include <linux/jhash.h>
94 #include <linux/rcupdate.h>
95 #include <linux/times.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/ip_mp_alg.h>
107 #include <net/netevent.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 
112 #define RT_FL_TOS(oldflp) \
113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 
115 #define IP_MAX_MTU	0xFFF0
116 
117 #define RT_GC_TIMEOUT (300*HZ)
118 
119 static int ip_rt_min_delay		= 2 * HZ;
120 static int ip_rt_max_delay		= 10 * HZ;
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval		= 60 * HZ;
124 static int ip_rt_gc_min_interval	= HZ / 2;
125 static int ip_rt_redirect_number	= 9;
126 static int ip_rt_redirect_load		= HZ / 50;
127 static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost		= HZ;
129 static int ip_rt_error_burst		= 5 * HZ;
130 static int ip_rt_gc_elasticity		= 8;
131 static int ip_rt_mtu_expires		= 10 * 60 * HZ;
132 static int ip_rt_min_pmtu		= 512 + 20 + 20;
133 static int ip_rt_min_advmss		= 256;
134 static int ip_rt_secret_interval	= 10 * 60 * HZ;
135 static unsigned long rt_deadline;
136 
137 #define RTprint(a...)	printk(KERN_DEBUG a)
138 
139 static struct timer_list rt_flush_timer;
140 static struct timer_list rt_periodic_timer;
141 static struct timer_list rt_secret_timer;
142 
143 /*
144  *	Interface to generic destination cache.
145  */
146 
147 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
148 static void		 ipv4_dst_destroy(struct dst_entry *dst);
149 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
150 					 struct net_device *dev, int how);
151 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
152 static void		 ipv4_link_failure(struct sk_buff *skb);
153 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
154 static int rt_garbage_collect(void);
155 
156 
157 static struct dst_ops ipv4_dst_ops = {
158 	.family =		AF_INET,
159 	.protocol =		__constant_htons(ETH_P_IP),
160 	.gc =			rt_garbage_collect,
161 	.check =		ipv4_dst_check,
162 	.destroy =		ipv4_dst_destroy,
163 	.ifdown =		ipv4_dst_ifdown,
164 	.negative_advice =	ipv4_negative_advice,
165 	.link_failure =		ipv4_link_failure,
166 	.update_pmtu =		ip_rt_update_pmtu,
167 	.entry_size =		sizeof(struct rtable),
168 };
169 
170 #define ECN_OR_COST(class)	TC_PRIO_##class
171 
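/*
 * ip_tos2prio maps the IPv4 TOS field to a packet scheduler band (TC_PRIO_*).
 * The table is laid out for an index of (tos & IPTOS_TOS_MASK) >> 1: even
 * slots are the four RFC 1349 TOS classes, odd slots cover the old
 * "minimize monetary cost" / ECN bit and, via ECN_OR_COST(), currently just
 * alias the same TC_PRIO_* class.
 */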
172 __u8 ip_tos2prio[16] = {
173 	TC_PRIO_BESTEFFORT,
174 	ECN_OR_COST(FILLER),
175 	TC_PRIO_BESTEFFORT,
176 	ECN_OR_COST(BESTEFFORT),
177 	TC_PRIO_BULK,
178 	ECN_OR_COST(BULK),
179 	TC_PRIO_BULK,
180 	ECN_OR_COST(BULK),
181 	TC_PRIO_INTERACTIVE,
182 	ECN_OR_COST(INTERACTIVE),
183 	TC_PRIO_INTERACTIVE,
184 	ECN_OR_COST(INTERACTIVE),
185 	TC_PRIO_INTERACTIVE_BULK,
186 	ECN_OR_COST(INTERACTIVE_BULK),
187 	TC_PRIO_INTERACTIVE_BULK,
188 	ECN_OR_COST(INTERACTIVE_BULK)
189 };
190 
191 
192 /*
193  * Route cache.
194  */
195 
196 /* The locking scheme is rather straightforward:
197  *
198  * 1) Read-Copy Update protects the buckets of the central route hash.
199  * 2) Only writers remove entries, and they hold the lock
200  *    as they look at rtable reference counts.
201  * 3) Only readers acquire references to rtable entries,
202  *    they do so with atomic increments and with the
203  *    lock held.
204  */
205 
206 struct rt_hash_bucket {
207 	struct rtable	*chain;
208 };
209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
210 	defined(CONFIG_PROVE_LOCKING)
211 /*
212  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
213  * The size of this table is a power of two and depends on the number of CPUs.
214  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
215  */
216 #ifdef CONFIG_LOCKDEP
217 # define RT_HASH_LOCK_SZ	256
218 #else
219 # if NR_CPUS >= 32
220 #  define RT_HASH_LOCK_SZ	4096
221 # elif NR_CPUS >= 16
222 #  define RT_HASH_LOCK_SZ	2048
223 # elif NR_CPUS >= 8
224 #  define RT_HASH_LOCK_SZ	1024
225 # elif NR_CPUS >= 4
226 #  define RT_HASH_LOCK_SZ	512
227 # else
228 #  define RT_HASH_LOCK_SZ	256
229 # endif
230 #endif
231 
232 static spinlock_t	*rt_hash_locks;
233 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
234 # define rt_hash_lock_init()	{ \
235 		int i; \
236 		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
237 		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
238 		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
239 			spin_lock_init(&rt_hash_locks[i]); \
240 		}
241 #else
242 # define rt_hash_lock_addr(slot) NULL
243 # define rt_hash_lock_init()
244 #endif
245 
246 static struct rt_hash_bucket 	*rt_hash_table;
247 static unsigned			rt_hash_mask;
248 static int			rt_hash_log;
249 static unsigned int		rt_hash_rnd;
250 
251 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
252 #define RT_CACHE_STAT_INC(field) \
253 	(__raw_get_cpu_var(rt_cache_stat).field++)
254 
255 static int rt_intern_hash(unsigned hash, struct rtable *rth,
256 				struct rtable **res);
257 
258 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
259 {
260 	return (jhash_2words(daddr, saddr, rt_hash_rnd)
261 		& rt_hash_mask);
262 }
263 
264 #define rt_hash(daddr, saddr, idx) \
265 	rt_hash_code((__force u32)(__be32)(daddr),\
266 		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
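/*
 * rt_hash() mixes the destination address, the source address xored with
 * (ifindex << 5) and the secret rt_hash_rnd (regenerated on every cache
 * flush by rt_run_flush()) through jhash_2words(), masked to the table
 * size.  A cache lookup is then, roughly (compare ip_rt_frag_needed()
 * below):
 *
 *	hash = rt_hash(daddr, saddr, ifindex);
 *	rcu_read_lock();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.rt_next))
 *		if (... flow keys match ...)
 *			break;
 *	rcu_read_unlock();
 */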
267 
268 #ifdef CONFIG_PROC_FS
269 struct rt_cache_iter_state {
270 	int bucket;
271 };
272 
273 static struct rtable *rt_cache_get_first(struct seq_file *seq)
274 {
275 	struct rtable *r = NULL;
276 	struct rt_cache_iter_state *st = seq->private;
277 
278 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
279 		rcu_read_lock_bh();
280 		r = rt_hash_table[st->bucket].chain;
281 		if (r)
282 			break;
283 		rcu_read_unlock_bh();
284 	}
285 	return r;
286 }
287 
288 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
289 {
290 	struct rt_cache_iter_state *st = rcu_dereference(seq->private);
291 
292 	r = r->u.rt_next;
293 	while (!r) {
294 		rcu_read_unlock_bh();
295 		if (--st->bucket < 0)
296 			break;
297 		rcu_read_lock_bh();
298 		r = rt_hash_table[st->bucket].chain;
299 	}
300 	return r;
301 }
302 
303 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
304 {
305 	struct rtable *r = rt_cache_get_first(seq);
306 
307 	if (r)
308 		while (pos && (r = rt_cache_get_next(seq, r)))
309 			--pos;
310 	return pos ? NULL : r;
311 }
312 
313 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
314 {
315 	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
316 }
317 
318 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
319 {
320 	struct rtable *r = NULL;
321 
322 	if (v == SEQ_START_TOKEN)
323 		r = rt_cache_get_first(seq);
324 	else
325 		r = rt_cache_get_next(seq, v);
326 	++*pos;
327 	return r;
328 }
329 
330 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
331 {
332 	if (v && v != SEQ_START_TOKEN)
333 		rcu_read_unlock_bh();
334 }
335 
336 static int rt_cache_seq_show(struct seq_file *seq, void *v)
337 {
338 	if (v == SEQ_START_TOKEN)
339 		seq_printf(seq, "%-127s\n",
340 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
341 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
342 			   "HHUptod\tSpecDst");
343 	else {
344 		struct rtable *r = v;
345 		char temp[256];
346 
347 		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
348 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
349 			r->u.dst.dev ? r->u.dst.dev->name : "*",
350 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
351 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
352 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
353 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
354 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
355 			dst_metric(&r->u.dst, RTAX_WINDOW),
356 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
357 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
358 			r->fl.fl4_tos,
359 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
360 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
361 				       dev_queue_xmit) : 0,
362 			r->rt_spec_dst);
363 		seq_printf(seq, "%-127s\n", temp);
364         }
365   	return 0;
366 }
367 
368 static struct seq_operations rt_cache_seq_ops = {
369 	.start  = rt_cache_seq_start,
370 	.next   = rt_cache_seq_next,
371 	.stop   = rt_cache_seq_stop,
372 	.show   = rt_cache_seq_show,
373 };
374 
375 static int rt_cache_seq_open(struct inode *inode, struct file *file)
376 {
377 	struct seq_file *seq;
378 	int rc = -ENOMEM;
379 	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
380 
381 	if (!s)
382 		goto out;
383 	rc = seq_open(file, &rt_cache_seq_ops);
384 	if (rc)
385 		goto out_kfree;
386 	seq          = file->private_data;
387 	seq->private = s;
388 	memset(s, 0, sizeof(*s));
389 out:
390 	return rc;
391 out_kfree:
392 	kfree(s);
393 	goto out;
394 }
395 
396 static struct file_operations rt_cache_seq_fops = {
397 	.owner	 = THIS_MODULE,
398 	.open	 = rt_cache_seq_open,
399 	.read	 = seq_read,
400 	.llseek	 = seq_lseek,
401 	.release = seq_release_private,
402 };
403 
404 
405 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
406 {
407 	int cpu;
408 
409 	if (*pos == 0)
410 		return SEQ_START_TOKEN;
411 
412 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
413 		if (!cpu_possible(cpu))
414 			continue;
415 		*pos = cpu+1;
416 		return &per_cpu(rt_cache_stat, cpu);
417 	}
418 	return NULL;
419 }
420 
421 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
422 {
423 	int cpu;
424 
425 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
426 		if (!cpu_possible(cpu))
427 			continue;
428 		*pos = cpu+1;
429 		return &per_cpu(rt_cache_stat, cpu);
430 	}
431 	return NULL;
432 
433 }
434 
435 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
436 {
437 
438 }
439 
440 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
441 {
442 	struct rt_cache_stat *st = v;
443 
444 	if (v == SEQ_START_TOKEN) {
445 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
446 		return 0;
447 	}
448 
449 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
450 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
451 		   atomic_read(&ipv4_dst_ops.entries),
452 		   st->in_hit,
453 		   st->in_slow_tot,
454 		   st->in_slow_mc,
455 		   st->in_no_route,
456 		   st->in_brd,
457 		   st->in_martian_dst,
458 		   st->in_martian_src,
459 
460 		   st->out_hit,
461 		   st->out_slow_tot,
462 		   st->out_slow_mc,
463 
464 		   st->gc_total,
465 		   st->gc_ignored,
466 		   st->gc_goal_miss,
467 		   st->gc_dst_overflow,
468 		   st->in_hlist_search,
469 		   st->out_hlist_search
470 		);
471 	return 0;
472 }
473 
474 static struct seq_operations rt_cpu_seq_ops = {
475 	.start  = rt_cpu_seq_start,
476 	.next   = rt_cpu_seq_next,
477 	.stop   = rt_cpu_seq_stop,
478 	.show   = rt_cpu_seq_show,
479 };
480 
481 
482 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
483 {
484 	return seq_open(file, &rt_cpu_seq_ops);
485 }
486 
487 static struct file_operations rt_cpu_seq_fops = {
488 	.owner	 = THIS_MODULE,
489 	.open	 = rt_cpu_seq_open,
490 	.read	 = seq_read,
491 	.llseek	 = seq_lseek,
492 	.release = seq_release,
493 };
494 
495 #endif /* CONFIG_PROC_FS */
496 
497 static __inline__ void rt_free(struct rtable *rt)
498 {
499 	multipath_remove(rt);
500 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
501 }
502 
503 static __inline__ void rt_drop(struct rtable *rt)
504 {
505 	multipath_remove(rt);
506 	ip_rt_put(rt);
507 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
508 }
509 
510 static __inline__ int rt_fast_clean(struct rtable *rth)
511 {
512 	/* Kill broadcast/multicast entries very aggressively if they
513 	   collide in the hash table with more useful entries */
514 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
515 		rth->fl.iif && rth->u.rt_next;
516 }
517 
518 static __inline__ int rt_valuable(struct rtable *rth)
519 {
520 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
521 		rth->u.dst.expires;
522 }
523 
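/*
 * May an unreferenced cache entry be reclaimed?  Entries that are still
 * held are never reclaimed, and entries whose hard expiry has passed always
 * may be.  Otherwise an entry survives while it is younger than tmo1
 * (unless it is a broadcast/multicast entry colliding in its chain, see
 * rt_fast_clean()), or younger than tmo2 if it is "valuable"
 * (RTCF_REDIRECTED, RTCF_NOTIFY, or with an expiry set, see rt_valuable()).
 */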
524 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
525 {
526 	unsigned long age;
527 	int ret = 0;
528 
529 	if (atomic_read(&rth->u.dst.__refcnt))
530 		goto out;
531 
532 	ret = 1;
533 	if (rth->u.dst.expires &&
534 	    time_after_eq(jiffies, rth->u.dst.expires))
535 		goto out;
536 
537 	age = jiffies - rth->u.dst.lastuse;
538 	ret = 0;
539 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
540 	    (age <= tmo2 && rt_valuable(rth)))
541 		goto out;
542 	ret = 1;
543 out:	return ret;
544 }
545 
546 /* Bits of score are:
547  * 31: very valuable
548  * 30: not quite useless
549  * 29..0: usage counter
550  */
551 static inline u32 rt_score(struct rtable *rt)
552 {
553 	u32 score = jiffies - rt->u.dst.lastuse;
554 
555 	score = ~score & ~(3<<30);
556 
557 	if (rt_valuable(rt))
558 		score |= (1<<31);
559 
560 	if (!rt->fl.iif ||
561 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
562 		score |= (1<<30);
563 
564 	return score;
565 }
566 
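/*
 * Exact-match test for cache lookups: OR together the XOR of every field of
 * the two flow keys (daddr, saddr, mark, the tos/scope pair read as a u16,
 * oif, iif) and report a match only when the result is zero, i.e. when the
 * keys are identical.  This avoids a chain of conditional branches.
 */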
567 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
568 {
569 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
570 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
571 		(fl1->mark ^ fl2->mark) |
572 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
573 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
574 		(fl1->oif ^ fl2->oif) |
575 		(fl1->iif ^ fl2->iif)) == 0;
576 }
577 
578 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
579 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
580 						struct rtable *expentry,
581 						int *removed_count)
582 {
583 	int passedexpired = 0;
584 	struct rtable **nextstep = NULL;
585 	struct rtable **rthp = chain_head;
586 	struct rtable *rth;
587 
588 	if (removed_count)
589 		*removed_count = 0;
590 
591 	while ((rth = *rthp) != NULL) {
592 		if (rth == expentry)
593 			passedexpired = 1;
594 
595 		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
596 		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
597 			if (*rthp == expentry) {
598 				*rthp = rth->u.rt_next;
599 				continue;
600 			} else {
601 				*rthp = rth->u.rt_next;
602 				rt_free(rth);
603 				if (removed_count)
604 					++(*removed_count);
605 			}
606 		} else {
607 			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
608 			    passedexpired && !nextstep)
609 				nextstep = &rth->u.rt_next;
610 
611 			rthp = &rth->u.rt_next;
612 		}
613 	}
614 
615 	rt_free(expentry);
616 	if (removed_count)
617 		++(*removed_count);
618 
619 	return nextstep;
620 }
621 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622 
623 
624 /* This runs via a timer and thus is always in BH context. */
625 static void rt_check_expire(unsigned long dummy)
626 {
627 	static unsigned int rover;
628 	unsigned int i = rover, goal;
629 	struct rtable *rth, **rthp;
630 	unsigned long now = jiffies;
631 	u64 mult;
632 
633 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
634 	if (ip_rt_gc_timeout > 1)
635 		do_div(mult, ip_rt_gc_timeout);
636 	goal = (unsigned int)mult;
637 	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
638 	for (; goal > 0; goal--) {
639 		unsigned long tmo = ip_rt_gc_timeout;
640 
641 		i = (i + 1) & rt_hash_mask;
642 		rthp = &rt_hash_table[i].chain;
643 
644 		if (*rthp == 0)
645 			continue;
646 		spin_lock(rt_hash_lock_addr(i));
647 		while ((rth = *rthp) != NULL) {
648 			if (rth->u.dst.expires) {
649 				/* Entry is expired even if it is in use */
650 				if (time_before_eq(now, rth->u.dst.expires)) {
651 					tmo >>= 1;
652 					rthp = &rth->u.rt_next;
653 					continue;
654 				}
655 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
656 				tmo >>= 1;
657 				rthp = &rth->u.rt_next;
658 				continue;
659 			}
660 
661 			/* Clean up aged-off entries. */
662 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
663 			/* remove all related balanced entries if necessary */
664 			if (rth->u.dst.flags & DST_BALANCED) {
665 				rthp = rt_remove_balanced_route(
666 					&rt_hash_table[i].chain,
667 					rth, NULL);
668 				if (!rthp)
669 					break;
670 			} else {
671 				*rthp = rth->u.rt_next;
672 				rt_free(rth);
673 			}
674 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
675  			*rthp = rth->u.rt_next;
676  			rt_free(rth);
677 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
678 		}
679 		spin_unlock(rt_hash_lock_addr(i));
680 
681 		/* Fallback loop breaker. */
682 		if (time_after(jiffies, now))
683 			break;
684 	}
685 	rover = i;
686 	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
687 }
688 
689 /* This can run from both BH and non-BH contexts, the latter
690  * in the case of a forced flush event.
691  */
692 static void rt_run_flush(unsigned long dummy)
693 {
694 	int i;
695 	struct rtable *rth, *next;
696 
697 	rt_deadline = 0;
698 
699 	get_random_bytes(&rt_hash_rnd, 4);
700 
701 	for (i = rt_hash_mask; i >= 0; i--) {
702 		spin_lock_bh(rt_hash_lock_addr(i));
703 		rth = rt_hash_table[i].chain;
704 		if (rth)
705 			rt_hash_table[i].chain = NULL;
706 		spin_unlock_bh(rt_hash_lock_addr(i));
707 
708 		for (; rth; rth = next) {
709 			next = rth->u.rt_next;
710 			rt_free(rth);
711 		}
712 	}
713 }
714 
715 static DEFINE_SPINLOCK(rt_flush_lock);
716 
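/*
 * Flush the whole cache.  delay == 0 flushes synchronously via
 * rt_run_flush(), delay > 0 arms rt_flush_timer, and a negative delay means
 * "use ip_rt_min_delay".  A pending delayed flush is never pushed back
 * beyond ip_rt_max_delay from the first request (tracked in rt_deadline).
 */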
717 void rt_cache_flush(int delay)
718 {
719 	unsigned long now = jiffies;
720 	int user_mode = !in_softirq();
721 
722 	if (delay < 0)
723 		delay = ip_rt_min_delay;
724 
725 	/* flush existing multipath state */
726 	multipath_flush();
727 
728 	spin_lock_bh(&rt_flush_lock);
729 
730 	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
731 		long tmo = (long)(rt_deadline - now);
732 
733 		/* If flush timer is already running
734 		   and flush request is not immediate (delay > 0):
735 
736 		   if the deadline has not been reached, prolong the timer to "delay",
737 		   otherwise fire it at the deadline.
738 		 */
739 
740 		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
741 			tmo = 0;
742 
743 		if (delay > tmo)
744 			delay = tmo;
745 	}
746 
747 	if (delay <= 0) {
748 		spin_unlock_bh(&rt_flush_lock);
749 		rt_run_flush(0);
750 		return;
751 	}
752 
753 	if (rt_deadline == 0)
754 		rt_deadline = now + ip_rt_max_delay;
755 
756 	mod_timer(&rt_flush_timer, now+delay);
757 	spin_unlock_bh(&rt_flush_lock);
758 }
759 
760 static void rt_secret_rebuild(unsigned long dummy)
761 {
762 	unsigned long now = jiffies;
763 
764 	rt_cache_flush(0);
765 	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
766 }
767 
768 /*
769    Short description of GC goals.
770 
771    We want to build an algorithm which keeps the routing cache
772    at some equilibrium point, where the number of aged-off entries
773    is kept approximately equal to the number of newly generated ones.
774 
775    The current expiration strength is the variable "expire".
776    We try to adjust it dynamically, so that when the network
777    is idle "expire" is large enough to keep enough warm entries,
778    and when load increases it shrinks to limit the cache size.
779  */
780 
781 static int rt_garbage_collect(void)
782 {
783 	static unsigned long expire = RT_GC_TIMEOUT;
784 	static unsigned long last_gc;
785 	static int rover;
786 	static int equilibrium;
787 	struct rtable *rth, **rthp;
788 	unsigned long now = jiffies;
789 	int goal;
790 
791 	/*
792 	 * Garbage collection is pretty expensive,
793 	 * do not make it too frequently.
794 	 */
795 
796 	RT_CACHE_STAT_INC(gc_total);
797 
798 	if (now - last_gc < ip_rt_gc_min_interval &&
799 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
800 		RT_CACHE_STAT_INC(gc_ignored);
801 		goto out;
802 	}
803 
804 	/* Calculate the number of entries we want to expire now. */
805 	goal = atomic_read(&ipv4_dst_ops.entries) -
806 		(ip_rt_gc_elasticity << rt_hash_log);
807 	if (goal <= 0) {
808 		if (equilibrium < ipv4_dst_ops.gc_thresh)
809 			equilibrium = ipv4_dst_ops.gc_thresh;
810 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
811 		if (goal > 0) {
812 			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
813 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
814 		}
815 	} else {
816 		/* We are in a dangerous area. Try to reduce the cache really
817 		 * aggressively.
818 		 */
819 		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
820 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
821 	}
822 
823 	if (now - last_gc >= ip_rt_gc_min_interval)
824 		last_gc = now;
825 
826 	if (goal <= 0) {
827 		equilibrium += goal;
828 		goto work_done;
829 	}
830 
831 	do {
832 		int i, k;
833 
834 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
835 			unsigned long tmo = expire;
836 
837 			k = (k + 1) & rt_hash_mask;
838 			rthp = &rt_hash_table[k].chain;
839 			spin_lock_bh(rt_hash_lock_addr(k));
840 			while ((rth = *rthp) != NULL) {
841 				if (!rt_may_expire(rth, tmo, expire)) {
842 					tmo >>= 1;
843 					rthp = &rth->u.rt_next;
844 					continue;
845 				}
846 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
847 				/* remove all related balanced entries
848 				 * if necessary
849 				 */
850 				if (rth->u.dst.flags & DST_BALANCED) {
851 					int r;
852 
853 					rthp = rt_remove_balanced_route(
854 						&rt_hash_table[k].chain,
855 						rth,
856 						&r);
857 					goal -= r;
858 					if (!rthp)
859 						break;
860 				} else {
861 					*rthp = rth->u.rt_next;
862 					rt_free(rth);
863 					goal--;
864 				}
865 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
866 				*rthp = rth->u.rt_next;
867 				rt_free(rth);
868 				goal--;
869 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
870 			}
871 			spin_unlock_bh(rt_hash_lock_addr(k));
872 			if (goal <= 0)
873 				break;
874 		}
875 		rover = k;
876 
877 		if (goal <= 0)
878 			goto work_done;
879 
880 		/* Goal is not achieved. We stop the process if:
881 
882 		   - expire is reduced to zero; otherwise expire is halved.
883 		   - the table is not full.
884 		   - we are called from softirq context.
885 		   - the jiffies check is just a fallback/debug loop breaker;
886 		     we will not spin here for a long time in any case.
887 		 */
888 
889 		RT_CACHE_STAT_INC(gc_goal_miss);
890 
891 		if (expire == 0)
892 			break;
893 
894 		expire >>= 1;
895 #if RT_CACHE_DEBUG >= 2
896 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
897 				atomic_read(&ipv4_dst_ops.entries), goal, i);
898 #endif
899 
900 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
901 			goto out;
902 	} while (!in_softirq() && time_before_eq(jiffies, now));
903 
904 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
905 		goto out;
906 	if (net_ratelimit())
907 		printk(KERN_WARNING "dst cache overflow\n");
908 	RT_CACHE_STAT_INC(gc_dst_overflow);
909 	return 1;
910 
911 work_done:
912 	expire += ip_rt_gc_min_interval;
913 	if (expire > ip_rt_gc_timeout ||
914 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
915 		expire = ip_rt_gc_timeout;
916 #if RT_CACHE_DEBUG >= 2
917 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
918 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
919 #endif
920 out:	return 0;
921 }
922 
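/*
 * Insert a freshly built route into bucket "hash".  If an entry with
 * identical keys is already cached it is moved to the front of the chain
 * and reused (the new route is dropped); otherwise the lowest-scored
 * unreferenced entry is evicted once the chain exceeds ip_rt_gc_elasticity,
 * the route is bound to an ARP neighbour where needed (with one emergency
 * garbage-collection retry if the neighbour tables are full and we are not
 * in softirq context), and the new entry is chained in at the head.  On
 * success *rp points at the entry actually left in the cache.
 */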
923 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
924 {
925 	struct rtable	*rth, **rthp;
926 	unsigned long	now;
927 	struct rtable *cand, **candp;
928 	u32 		min_score;
929 	int		chain_length;
930 	int attempts = !in_softirq();
931 
932 restart:
933 	chain_length = 0;
934 	min_score = ~(u32)0;
935 	cand = NULL;
936 	candp = NULL;
937 	now = jiffies;
938 
939 	rthp = &rt_hash_table[hash].chain;
940 
941 	spin_lock_bh(rt_hash_lock_addr(hash));
942 	while ((rth = *rthp) != NULL) {
943 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
944 		if (!(rth->u.dst.flags & DST_BALANCED) &&
945 		    compare_keys(&rth->fl, &rt->fl)) {
946 #else
947 		if (compare_keys(&rth->fl, &rt->fl)) {
948 #endif
949 			/* Put it first */
950 			*rthp = rth->u.rt_next;
951 			/*
952 			 * Since lookup is lockfree, the deletion
953 			 * must be visible to another weakly ordered CPU before
954 			 * the insertion at the start of the hash chain.
955 			 */
956 			rcu_assign_pointer(rth->u.rt_next,
957 					   rt_hash_table[hash].chain);
958 			/*
959 			 * Since lookup is lockfree, the update writes
960 			 * must be ordered for consistency on SMP.
961 			 */
962 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
963 
964 			rth->u.dst.__use++;
965 			dst_hold(&rth->u.dst);
966 			rth->u.dst.lastuse = now;
967 			spin_unlock_bh(rt_hash_lock_addr(hash));
968 
969 			rt_drop(rt);
970 			*rp = rth;
971 			return 0;
972 		}
973 
974 		if (!atomic_read(&rth->u.dst.__refcnt)) {
975 			u32 score = rt_score(rth);
976 
977 			if (score <= min_score) {
978 				cand = rth;
979 				candp = rthp;
980 				min_score = score;
981 			}
982 		}
983 
984 		chain_length++;
985 
986 		rthp = &rth->u.rt_next;
987 	}
988 
989 	if (cand) {
990 		/* ip_rt_gc_elasticity used to be the average chain length;
991 		 * when it is exceeded, gc becomes really aggressive.
992 		 *
993 		 * The second limit is less certain. At the moment it allows
994 		 * only 2 entries per bucket. We will see.
995 		 */
996 		if (chain_length > ip_rt_gc_elasticity) {
997 			*candp = cand->u.rt_next;
998 			rt_free(cand);
999 		}
1000 	}
1001 
1002 	/* Try to bind the route to an ARP neighbour only if it is an output
1003 	   route or a unicast forwarding path.
1004 	 */
1005 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1006 		int err = arp_bind_neighbour(&rt->u.dst);
1007 		if (err) {
1008 			spin_unlock_bh(rt_hash_lock_addr(hash));
1009 
1010 			if (err != -ENOBUFS) {
1011 				rt_drop(rt);
1012 				return err;
1013 			}
1014 
1015 			/* Neighbour tables are full and nothing
1016 			   can be released. Try to shrink the route cache;
1017 			   most likely it holds some neighbour records.
1018 			 */
1019 			if (attempts-- > 0) {
1020 				int saved_elasticity = ip_rt_gc_elasticity;
1021 				int saved_int = ip_rt_gc_min_interval;
1022 				ip_rt_gc_elasticity	= 1;
1023 				ip_rt_gc_min_interval	= 0;
1024 				rt_garbage_collect();
1025 				ip_rt_gc_min_interval	= saved_int;
1026 				ip_rt_gc_elasticity	= saved_elasticity;
1027 				goto restart;
1028 			}
1029 
1030 			if (net_ratelimit())
1031 				printk(KERN_WARNING "Neighbour table overflow.\n");
1032 			rt_drop(rt);
1033 			return -ENOBUFS;
1034 		}
1035 	}
1036 
1037 	rt->u.rt_next = rt_hash_table[hash].chain;
1038 #if RT_CACHE_DEBUG >= 2
1039 	if (rt->u.rt_next) {
1040 		struct rtable *trt;
1041 		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1042 		       NIPQUAD(rt->rt_dst));
1043 		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1044 			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1045 		printk("\n");
1046 	}
1047 #endif
1048 	rt_hash_table[hash].chain = rt;
1049 	spin_unlock_bh(rt_hash_lock_addr(hash));
1050 	*rp = rt;
1051 	return 0;
1052 }
1053 
1054 void rt_bind_peer(struct rtable *rt, int create)
1055 {
1056 	static DEFINE_SPINLOCK(rt_peer_lock);
1057 	struct inet_peer *peer;
1058 
1059 	peer = inet_getpeer(rt->rt_dst, create);
1060 
1061 	spin_lock_bh(&rt_peer_lock);
1062 	if (rt->peer == NULL) {
1063 		rt->peer = peer;
1064 		peer = NULL;
1065 	}
1066 	spin_unlock_bh(&rt_peer_lock);
1067 	if (peer)
1068 		inet_putpeer(peer);
1069 }
1070 
1071 /*
1072  * Peer allocation may fail only in serious out-of-memory conditions.  However
1073  * we can still generate some output.
1074  * Random ID selection looks a bit dangerous because we have no chance of
1075  * selecting an ID that is unique within a reasonable period of time.
1076  * But a broken packet identifier may be better than no packet at all.
1077  */
1078 static void ip_select_fb_ident(struct iphdr *iph)
1079 {
1080 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1081 	static u32 ip_fallback_id;
1082 	u32 salt;
1083 
1084 	spin_lock_bh(&ip_fb_id_lock);
1085 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1086 	iph->id = htons(salt & 0xFFFF);
1087 	ip_fallback_id = salt;
1088 	spin_unlock_bh(&ip_fb_id_lock);
1089 }
1090 
1091 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1092 {
1093 	struct rtable *rt = (struct rtable *) dst;
1094 
1095 	if (rt) {
1096 		if (rt->peer == NULL)
1097 			rt_bind_peer(rt, 1);
1098 
1099 		/* If a peer is attached to the destination, it is never detached,
1100 		   so we do not need to grab a lock to dereference it.
1101 		 */
1102 		if (rt->peer) {
1103 			iph->id = htons(inet_getid(rt->peer, more));
1104 			return;
1105 		}
1106 	} else
1107 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1108 		       __builtin_return_address(0));
1109 
1110 	ip_select_fb_ident(iph);
1111 }
1112 
1113 static void rt_del(unsigned hash, struct rtable *rt)
1114 {
1115 	struct rtable **rthp;
1116 
1117 	spin_lock_bh(rt_hash_lock_addr(hash));
1118 	ip_rt_put(rt);
1119 	for (rthp = &rt_hash_table[hash].chain; *rthp;
1120 	     rthp = &(*rthp)->u.rt_next)
1121 		if (*rthp == rt) {
1122 			*rthp = rt->u.rt_next;
1123 			rt_free(rt);
1124 			break;
1125 		}
1126 	spin_unlock_bh(rt_hash_lock_addr(hash));
1127 }
1128 
1129 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1130 		    __be32 saddr, struct net_device *dev)
1131 {
1132 	int i, k;
1133 	struct in_device *in_dev = in_dev_get(dev);
1134 	struct rtable *rth, **rthp;
1135 	__be32  skeys[2] = { saddr, 0 };
1136 	int  ikeys[2] = { dev->ifindex, 0 };
1137 	struct netevent_redirect netevent;
1138 
1139 	if (!in_dev)
1140 		return;
1141 
1142 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1143 	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1144 		goto reject_redirect;
1145 
1146 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1147 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1148 			goto reject_redirect;
1149 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1150 			goto reject_redirect;
1151 	} else {
1152 		if (inet_addr_type(new_gw) != RTN_UNICAST)
1153 			goto reject_redirect;
1154 	}
1155 
1156 	for (i = 0; i < 2; i++) {
1157 		for (k = 0; k < 2; k++) {
1158 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1159 
1160 			rthp=&rt_hash_table[hash].chain;
1161 
1162 			rcu_read_lock();
1163 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1164 				struct rtable *rt;
1165 
1166 				if (rth->fl.fl4_dst != daddr ||
1167 				    rth->fl.fl4_src != skeys[i] ||
1168 				    rth->fl.oif != ikeys[k] ||
1169 				    rth->fl.iif != 0) {
1170 					rthp = &rth->u.rt_next;
1171 					continue;
1172 				}
1173 
1174 				if (rth->rt_dst != daddr ||
1175 				    rth->rt_src != saddr ||
1176 				    rth->u.dst.error ||
1177 				    rth->rt_gateway != old_gw ||
1178 				    rth->u.dst.dev != dev)
1179 					break;
1180 
1181 				dst_hold(&rth->u.dst);
1182 				rcu_read_unlock();
1183 
1184 				rt = dst_alloc(&ipv4_dst_ops);
1185 				if (rt == NULL) {
1186 					ip_rt_put(rth);
1187 					in_dev_put(in_dev);
1188 					return;
1189 				}
1190 
1191 				/* Copy all the information. */
1192 				*rt = *rth;
1193  				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1194 				rt->u.dst.__use		= 1;
1195 				atomic_set(&rt->u.dst.__refcnt, 1);
1196 				rt->u.dst.child		= NULL;
1197 				if (rt->u.dst.dev)
1198 					dev_hold(rt->u.dst.dev);
1199 				if (rt->idev)
1200 					in_dev_hold(rt->idev);
1201 				rt->u.dst.obsolete	= 0;
1202 				rt->u.dst.lastuse	= jiffies;
1203 				rt->u.dst.path		= &rt->u.dst;
1204 				rt->u.dst.neighbour	= NULL;
1205 				rt->u.dst.hh		= NULL;
1206 				rt->u.dst.xfrm		= NULL;
1207 
1208 				rt->rt_flags		|= RTCF_REDIRECTED;
1209 
1210 				/* Gateway is different ... */
1211 				rt->rt_gateway		= new_gw;
1212 
1213 				/* Redirect received -> path was valid */
1214 				dst_confirm(&rth->u.dst);
1215 
1216 				if (rt->peer)
1217 					atomic_inc(&rt->peer->refcnt);
1218 
1219 				if (arp_bind_neighbour(&rt->u.dst) ||
1220 				    !(rt->u.dst.neighbour->nud_state &
1221 					    NUD_VALID)) {
1222 					if (rt->u.dst.neighbour)
1223 						neigh_event_send(rt->u.dst.neighbour, NULL);
1224 					ip_rt_put(rth);
1225 					rt_drop(rt);
1226 					goto do_next;
1227 				}
1228 
1229 				netevent.old = &rth->u.dst;
1230 				netevent.new = &rt->u.dst;
1231 				call_netevent_notifiers(NETEVENT_REDIRECT,
1232 						        &netevent);
1233 
1234 				rt_del(hash, rth);
1235 				if (!rt_intern_hash(hash, rt, &rt))
1236 					ip_rt_put(rt);
1237 				goto do_next;
1238 			}
1239 			rcu_read_unlock();
1240 		do_next:
1241 			;
1242 		}
1243 	}
1244 	in_dev_put(in_dev);
1245 	return;
1246 
1247 reject_redirect:
1248 #ifdef CONFIG_IP_ROUTE_VERBOSE
1249 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1250 		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1251 			"%u.%u.%u.%u ignored.\n"
1252 			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1253 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1254 		       NIPQUAD(saddr), NIPQUAD(daddr));
1255 #endif
1256 	in_dev_put(in_dev);
1257 }
1258 
1259 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1260 {
1261 	struct rtable *rt = (struct rtable*)dst;
1262 	struct dst_entry *ret = dst;
1263 
1264 	if (rt) {
1265 		if (dst->obsolete) {
1266 			ip_rt_put(rt);
1267 			ret = NULL;
1268 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1269 			   rt->u.dst.expires) {
1270 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1271 						rt->fl.oif);
1272 #if RT_CACHE_DEBUG >= 1
1273 			printk(KERN_DEBUG "ip_rt_advice: redirect to "
1274 					  "%u.%u.%u.%u/%02x dropped\n",
1275 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1276 #endif
1277 			rt_del(hash, rt);
1278 			ret = NULL;
1279 		}
1280 	}
1281 	return ret;
1282 }
1283 
1284 /*
1285  * Algorithm:
1286  *	1. The first ip_rt_redirect_number redirects are sent
1287  *	   with exponential backoff, then we stop sending them at all,
1288  *	   assuming that the host ignores our redirects.
1289  *	2. If we did not see packets requiring redirects
1290  *	   during ip_rt_redirect_silence, we assume that the host
1291  *	   has forgotten the redirected route and we start sending redirects again.
1292  *
1293  * This algorithm is much cheaper and more intelligent than dumb load limiting
1294  * in icmp.c.
1295  *
1296  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1297  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1298  */
1299 
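/*
 * With the default sysctl values above (ip_rt_redirect_load = HZ/50,
 * ip_rt_redirect_number = 9, ip_rt_redirect_silence = (HZ/50) << 10) this
 * works out to gaps of roughly 20 ms, 40 ms, 80 ms, ... up to about 5 s
 * between successive redirects, after which we stay silent until some
 * 20 s pass without traffic that would need redirecting.
 */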
1300 void ip_rt_send_redirect(struct sk_buff *skb)
1301 {
1302 	struct rtable *rt = (struct rtable*)skb->dst;
1303 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1304 
1305 	if (!in_dev)
1306 		return;
1307 
1308 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1309 		goto out;
1310 
1311 	/* No redirected packets during ip_rt_redirect_silence;
1312 	 * reset the algorithm.
1313 	 */
1314 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1315 		rt->u.dst.rate_tokens = 0;
1316 
1317 	/* Too many ignored redirects; do not send anything.
1318 	 * Set u.dst.rate_last to the last seen redirected packet.
1319 	 */
1320 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1321 		rt->u.dst.rate_last = jiffies;
1322 		goto out;
1323 	}
1324 
1325 	/* Check for load limit; set rate_last to the latest sent
1326 	 * redirect.
1327 	 */
1328 	if (time_after(jiffies,
1329 		       (rt->u.dst.rate_last +
1330 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1331 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1332 		rt->u.dst.rate_last = jiffies;
1333 		++rt->u.dst.rate_tokens;
1334 #ifdef CONFIG_IP_ROUTE_VERBOSE
1335 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1336 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1337 		    net_ratelimit())
1338 			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1339 				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1340 				NIPQUAD(rt->rt_src), rt->rt_iif,
1341 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1342 #endif
1343 	}
1344 out:
1345         in_dev_put(in_dev);
1346 }
1347 
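/*
 * Turn the error code stored in the dst into an ICMP destination
 * unreachable message, rate limited by a small token bucket: rate_tokens
 * accumulates elapsed jiffies (capped at ip_rt_error_burst, 5*HZ by
 * default) and every ICMP sent costs ip_rt_error_cost (HZ), i.e. bursts of
 * about five errors and roughly one per second sustained.
 */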
1348 static int ip_error(struct sk_buff *skb)
1349 {
1350 	struct rtable *rt = (struct rtable*)skb->dst;
1351 	unsigned long now;
1352 	int code;
1353 
1354 	switch (rt->u.dst.error) {
1355 		case EINVAL:
1356 		default:
1357 			goto out;
1358 		case EHOSTUNREACH:
1359 			code = ICMP_HOST_UNREACH;
1360 			break;
1361 		case ENETUNREACH:
1362 			code = ICMP_NET_UNREACH;
1363 			break;
1364 		case EACCES:
1365 			code = ICMP_PKT_FILTERED;
1366 			break;
1367 	}
1368 
1369 	now = jiffies;
1370 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1371 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1372 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1373 	rt->u.dst.rate_last = now;
1374 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1375 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1376 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1377 	}
1378 
1379 out:	kfree_skb(skb);
1380 	return 0;
1381 }
1382 
1383 /*
1384  *	The last two values are not from the RFC but
1385  *	are needed for AMPRnet AX.25 paths.
1386  */
1387 
1388 static const unsigned short mtu_plateau[] =
1389 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1390 
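/*
 * RFC 1191 plateau search: return the largest plateau strictly below the
 * previous MTU, falling back to 68, the IPv4 minimum.
 */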
1391 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1392 {
1393 	int i;
1394 
1395 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1396 		if (old_mtu > mtu_plateau[i])
1397 			return mtu_plateau[i];
1398 	return 68;
1399 }
1400 
1401 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1402 {
1403 	int i;
1404 	unsigned short old_mtu = ntohs(iph->tot_len);
1405 	struct rtable *rth;
1406 	__be32  skeys[2] = { iph->saddr, 0, };
1407 	__be32  daddr = iph->daddr;
1408 	unsigned short est_mtu = 0;
1409 
1410 	if (ipv4_config.no_pmtu_disc)
1411 		return 0;
1412 
1413 	for (i = 0; i < 2; i++) {
1414 		unsigned hash = rt_hash(daddr, skeys[i], 0);
1415 
1416 		rcu_read_lock();
1417 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1418 		     rth = rcu_dereference(rth->u.rt_next)) {
1419 			if (rth->fl.fl4_dst == daddr &&
1420 			    rth->fl.fl4_src == skeys[i] &&
1421 			    rth->rt_dst  == daddr &&
1422 			    rth->rt_src  == iph->saddr &&
1423 			    rth->fl.iif == 0 &&
1424 			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1425 				unsigned short mtu = new_mtu;
1426 
1427 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1428 
1429 					/* BSD 4.2 compatibility hack :-( */
1430 					if (mtu == 0 &&
1431 					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1432 					    old_mtu >= 68 + (iph->ihl << 2))
1433 						old_mtu -= iph->ihl << 2;
1434 
1435 					mtu = guess_mtu(old_mtu);
1436 				}
1437 				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1438 					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1439 						dst_confirm(&rth->u.dst);
1440 						if (mtu < ip_rt_min_pmtu) {
1441 							mtu = ip_rt_min_pmtu;
1442 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1443 								(1 << RTAX_MTU);
1444 						}
1445 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1446 						dst_set_expires(&rth->u.dst,
1447 							ip_rt_mtu_expires);
1448 					}
1449 					est_mtu = mtu;
1450 				}
1451 			}
1452 		}
1453 		rcu_read_unlock();
1454 	}
1455 	return est_mtu ? : new_mtu;
1456 }
1457 
1458 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1459 {
1460 	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1461 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1462 		if (mtu < ip_rt_min_pmtu) {
1463 			mtu = ip_rt_min_pmtu;
1464 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1465 		}
1466 		dst->metrics[RTAX_MTU-1] = mtu;
1467 		dst_set_expires(dst, ip_rt_mtu_expires);
1468 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1469 	}
1470 }
1471 
1472 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1473 {
1474 	return NULL;
1475 }
1476 
1477 static void ipv4_dst_destroy(struct dst_entry *dst)
1478 {
1479 	struct rtable *rt = (struct rtable *) dst;
1480 	struct inet_peer *peer = rt->peer;
1481 	struct in_device *idev = rt->idev;
1482 
1483 	if (peer) {
1484 		rt->peer = NULL;
1485 		inet_putpeer(peer);
1486 	}
1487 
1488 	if (idev) {
1489 		rt->idev = NULL;
1490 		in_dev_put(idev);
1491 	}
1492 }
1493 
1494 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1495 			    int how)
1496 {
1497 	struct rtable *rt = (struct rtable *) dst;
1498 	struct in_device *idev = rt->idev;
1499 	if (dev != &loopback_dev && idev && idev->dev == dev) {
1500 		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1501 		if (loopback_idev) {
1502 			rt->idev = loopback_idev;
1503 			in_dev_put(idev);
1504 		}
1505 	}
1506 }
1507 
1508 static void ipv4_link_failure(struct sk_buff *skb)
1509 {
1510 	struct rtable *rt;
1511 
1512 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1513 
1514 	rt = (struct rtable *) skb->dst;
1515 	if (rt)
1516 		dst_set_expires(&rt->u.dst, 0);
1517 }
1518 
1519 static int ip_rt_bug(struct sk_buff *skb)
1520 {
1521 	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1522 		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1523 		skb->dev ? skb->dev->name : "?");
1524 	kfree_skb(skb);
1525 	return 0;
1526 }
1527 
1528 /*
1529    We do not cache the source address of the outgoing interface,
1530    because it is used only by the IP RR, TS and SRR options,
1531    so it is out of the fast path.
1532 
1533    BTW remember: "addr" is allowed to be unaligned
1534    in IP options!
1535  */
1536 
1537 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1538 {
1539 	__be32 src;
1540 	struct fib_result res;
1541 
1542 	if (rt->fl.iif == 0)
1543 		src = rt->rt_src;
1544 	else if (fib_lookup(&rt->fl, &res) == 0) {
1545 		src = FIB_RES_PREFSRC(res);
1546 		fib_res_put(&res);
1547 	} else
1548 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1549 					RT_SCOPE_UNIVERSE);
1550 	memcpy(addr, &src, 4);
1551 }
1552 
1553 #ifdef CONFIG_NET_CLS_ROUTE
1554 static void set_class_tag(struct rtable *rt, u32 tag)
1555 {
1556 	if (!(rt->u.dst.tclassid & 0xFFFF))
1557 		rt->u.dst.tclassid |= tag & 0xFFFF;
1558 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1559 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1560 }
1561 #endif
1562 
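/*
 * Copy next-hop data from the FIB result into the cached route: the
 * gateway for link-scope next hops, the FIB metrics (with a 576 byte MTU
 * fallback for locked-MTU gatewayed paths when the FIB supplies no MTU and
 * the device MTU is larger), and defaults/clamps for hop limit, MTU
 * (IP_MAX_MTU) and advertised MSS.
 */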
1563 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1564 {
1565 	struct fib_info *fi = res->fi;
1566 
1567 	if (fi) {
1568 		if (FIB_RES_GW(*res) &&
1569 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1570 			rt->rt_gateway = FIB_RES_GW(*res);
1571 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1572 		       sizeof(rt->u.dst.metrics));
1573 		if (fi->fib_mtu == 0) {
1574 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1575 			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1576 			    rt->rt_gateway != rt->rt_dst &&
1577 			    rt->u.dst.dev->mtu > 576)
1578 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1579 		}
1580 #ifdef CONFIG_NET_CLS_ROUTE
1581 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1582 #endif
1583 	} else
1584 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1585 
1586 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1587 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1588 	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1589 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1590 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1591 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1592 				       ip_rt_min_advmss);
1593 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1594 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1595 
1596 #ifdef CONFIG_NET_CLS_ROUTE
1597 #ifdef CONFIG_IP_MULTIPLE_TABLES
1598 	set_class_tag(rt, fib_rules_tclass(res));
1599 #endif
1600 	set_class_tag(rt, itag);
1601 #endif
1602         rt->rt_type = res->type;
1603 }
1604 
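/*
 * Build and intern a cache entry for a received multicast packet.  The
 * source address is validated first; dst.output is ip_rt_bug because these
 * entries are input-only, and dst.input is ip_local_deliver when we are a
 * member of the group ("our") or ip_mr_input when multicast forwarding is
 * configured for a non link-local group.
 */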
1605 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1606 				u8 tos, struct net_device *dev, int our)
1607 {
1608 	unsigned hash;
1609 	struct rtable *rth;
1610 	__be32 spec_dst;
1611 	struct in_device *in_dev = in_dev_get(dev);
1612 	u32 itag = 0;
1613 
1614 	/* Primary sanity checks. */
1615 
1616 	if (in_dev == NULL)
1617 		return -EINVAL;
1618 
1619 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1620 	    skb->protocol != htons(ETH_P_IP))
1621 		goto e_inval;
1622 
1623 	if (ZERONET(saddr)) {
1624 		if (!LOCAL_MCAST(daddr))
1625 			goto e_inval;
1626 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1627 	} else if (fib_validate_source(saddr, 0, tos, 0,
1628 					dev, &spec_dst, &itag) < 0)
1629 		goto e_inval;
1630 
1631 	rth = dst_alloc(&ipv4_dst_ops);
1632 	if (!rth)
1633 		goto e_nobufs;
1634 
1635 	rth->u.dst.output= ip_rt_bug;
1636 
1637 	atomic_set(&rth->u.dst.__refcnt, 1);
1638 	rth->u.dst.flags= DST_HOST;
1639 	if (in_dev->cnf.no_policy)
1640 		rth->u.dst.flags |= DST_NOPOLICY;
1641 	rth->fl.fl4_dst	= daddr;
1642 	rth->rt_dst	= daddr;
1643 	rth->fl.fl4_tos	= tos;
1644 	rth->fl.mark    = skb->mark;
1645 	rth->fl.fl4_src	= saddr;
1646 	rth->rt_src	= saddr;
1647 #ifdef CONFIG_NET_CLS_ROUTE
1648 	rth->u.dst.tclassid = itag;
1649 #endif
1650 	rth->rt_iif	=
1651 	rth->fl.iif	= dev->ifindex;
1652 	rth->u.dst.dev	= &loopback_dev;
1653 	dev_hold(rth->u.dst.dev);
1654 	rth->idev	= in_dev_get(rth->u.dst.dev);
1655 	rth->fl.oif	= 0;
1656 	rth->rt_gateway	= daddr;
1657 	rth->rt_spec_dst= spec_dst;
1658 	rth->rt_type	= RTN_MULTICAST;
1659 	rth->rt_flags	= RTCF_MULTICAST;
1660 	if (our) {
1661 		rth->u.dst.input= ip_local_deliver;
1662 		rth->rt_flags |= RTCF_LOCAL;
1663 	}
1664 
1665 #ifdef CONFIG_IP_MROUTE
1666 	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1667 		rth->u.dst.input = ip_mr_input;
1668 #endif
1669 	RT_CACHE_STAT_INC(in_slow_mc);
1670 
1671 	in_dev_put(in_dev);
1672 	hash = rt_hash(daddr, saddr, dev->ifindex);
1673 	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1674 
1675 e_nobufs:
1676 	in_dev_put(in_dev);
1677 	return -ENOBUFS;
1678 
1679 e_inval:
1680 	in_dev_put(in_dev);
1681 	return -EINVAL;
1682 }
1683 
1684 
1685 static void ip_handle_martian_source(struct net_device *dev,
1686 				     struct in_device *in_dev,
1687 				     struct sk_buff *skb,
1688 				     __be32 daddr,
1689 				     __be32 saddr)
1690 {
1691 	RT_CACHE_STAT_INC(in_martian_src);
1692 #ifdef CONFIG_IP_ROUTE_VERBOSE
1693 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1694 		/*
1695 		 *	RFC1812 recommendation: if the source is martian,
1696 		 *	the only hint is the MAC header.
1697 		 */
1698 		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1699 			"%u.%u.%u.%u, on dev %s\n",
1700 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1701 		if (dev->hard_header_len && skb->mac.raw) {
1702 			int i;
1703 			unsigned char *p = skb->mac.raw;
1704 			printk(KERN_WARNING "ll header: ");
1705 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1706 				printk("%02x", *p);
1707 				if (i < (dev->hard_header_len - 1))
1708 					printk(":");
1709 			}
1710 			printk("\n");
1711 		}
1712 	}
1713 #endif
1714 }
1715 
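/*
 * Create the cache entry for a forwarded packet: validate the source
 * against the FIB (setting RTCF_DIRECTSRC and possibly RTCF_DOREDIRECT),
 * refuse non-IP (proxy ARP) traffic that would hairpin out the ingress
 * device, then allocate the dst with input = ip_forward and
 * output = ip_output and let rt_set_nexthop() fill in gateway and metrics.
 */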
1716 static inline int __mkroute_input(struct sk_buff *skb,
1717 				  struct fib_result* res,
1718 				  struct in_device *in_dev,
1719 				  __be32 daddr, __be32 saddr, u32 tos,
1720 				  struct rtable **result)
1721 {
1722 
1723 	struct rtable *rth;
1724 	int err;
1725 	struct in_device *out_dev;
1726 	unsigned flags = 0;
1727 	__be32 spec_dst;
1728 	u32 itag;
1729 
1730 	/* get a working reference to the output device */
1731 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1732 	if (out_dev == NULL) {
1733 		if (net_ratelimit())
1734 			printk(KERN_CRIT "Bug in ip_route_input" \
1735 			       "_slow(). Please, report\n");
1736 		return -EINVAL;
1737 	}
1738 
1739 
1740 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1741 				  in_dev->dev, &spec_dst, &itag);
1742 	if (err < 0) {
1743 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1744 					 saddr);
1745 
1746 		err = -EINVAL;
1747 		goto cleanup;
1748 	}
1749 
1750 	if (err)
1751 		flags |= RTCF_DIRECTSRC;
1752 
1753 	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1754 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1755 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1756 		flags |= RTCF_DOREDIRECT;
1757 
1758 	if (skb->protocol != htons(ETH_P_IP)) {
1759 		/* Not IP (i.e. ARP). Do not create a route if it is
1760 		 * invalid for proxy ARP. DNAT routes are always valid.
1761 		 */
1762 		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1763 			err = -EINVAL;
1764 			goto cleanup;
1765 		}
1766 	}
1767 
1768 
1769 	rth = dst_alloc(&ipv4_dst_ops);
1770 	if (!rth) {
1771 		err = -ENOBUFS;
1772 		goto cleanup;
1773 	}
1774 
1775 	atomic_set(&rth->u.dst.__refcnt, 1);
1776 	rth->u.dst.flags= DST_HOST;
1777 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1778 	if (res->fi->fib_nhs > 1)
1779 		rth->u.dst.flags |= DST_BALANCED;
1780 #endif
1781 	if (in_dev->cnf.no_policy)
1782 		rth->u.dst.flags |= DST_NOPOLICY;
1783 	if (out_dev->cnf.no_xfrm)
1784 		rth->u.dst.flags |= DST_NOXFRM;
1785 	rth->fl.fl4_dst	= daddr;
1786 	rth->rt_dst	= daddr;
1787 	rth->fl.fl4_tos	= tos;
1788 	rth->fl.mark    = skb->mark;
1789 	rth->fl.fl4_src	= saddr;
1790 	rth->rt_src	= saddr;
1791 	rth->rt_gateway	= daddr;
1792 	rth->rt_iif 	=
1793 		rth->fl.iif	= in_dev->dev->ifindex;
1794 	rth->u.dst.dev	= (out_dev)->dev;
1795 	dev_hold(rth->u.dst.dev);
1796 	rth->idev	= in_dev_get(rth->u.dst.dev);
1797 	rth->fl.oif 	= 0;
1798 	rth->rt_spec_dst= spec_dst;
1799 
1800 	rth->u.dst.input = ip_forward;
1801 	rth->u.dst.output = ip_output;
1802 
1803 	rt_set_nexthop(rth, res, itag);
1804 
1805 	rth->rt_flags = flags;
1806 
1807 	*result = rth;
1808 	err = 0;
1809  cleanup:
1810 	/* release the working reference to the output device */
1811 	in_dev_put(out_dev);
1812 	return err;
1813 }
1814 
1815 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1816 				       struct fib_result* res,
1817 				       const struct flowi *fl,
1818 				       struct in_device *in_dev,
1819 				       __be32 daddr, __be32 saddr, u32 tos)
1820 {
1821 	struct rtable* rth = NULL;
1822 	int err;
1823 	unsigned hash;
1824 
1825 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1826 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1827 		fib_select_multipath(fl, res);
1828 #endif
1829 
1830 	/* create a routing cache entry */
1831 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1832 	if (err)
1833 		return err;
1834 
1835 	/* put it into the cache */
1836 	hash = rt_hash(daddr, saddr, fl->iif);
1837 	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1838 }
1839 
1840 static inline int ip_mkroute_input(struct sk_buff *skb,
1841 				   struct fib_result* res,
1842 				   const struct flowi *fl,
1843 				   struct in_device *in_dev,
1844 				   __be32 daddr, __be32 saddr, u32 tos)
1845 {
1846 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1847 	struct rtable* rth = NULL, *rtres;
1848 	unsigned char hop, hopcount;
1849 	int err = -EINVAL;
1850 	unsigned int hash;
1851 
1852 	if (res->fi)
1853 		hopcount = res->fi->fib_nhs;
1854 	else
1855 		hopcount = 1;
1856 
1857 	/* distinguish between multipath and singlepath */
1858 	if (hopcount < 2)
1859 		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1860 					    saddr, tos);
1861 
1862 	/* add all alternatives to the routing cache */
1863 	for (hop = 0; hop < hopcount; hop++) {
1864 		res->nh_sel = hop;
1865 
1866 		/* put reference to previous result */
1867 		if (hop)
1868 			ip_rt_put(rtres);
1869 
1870 		/* create a routing cache entry */
1871 		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1872 				      &rth);
1873 		if (err)
1874 			return err;
1875 
1876 		/* put it into the cache */
1877 		hash = rt_hash(daddr, saddr, fl->iif);
1878 		err = rt_intern_hash(hash, rth, &rtres);
1879 		if (err)
1880 			return err;
1881 
1882 		/* forward hop information to multipath impl. */
1883 		multipath_set_nhinfo(rth,
1884 				     FIB_RES_NETWORK(*res),
1885 				     FIB_RES_NETMASK(*res),
1886 				     res->prefixlen,
1887 				     &FIB_RES_NH(*res));
1888 	}
1889 	skb->dst = &rtres->u.dst;
1890 	return err;
1891 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1892 	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1893 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1894 }
1895 
1896 
1897 /*
1898  *	NOTE. We drop all packets that have a local source
1899  *	address, because every properly looped-back packet
1900  *	must have the correct destination already attached by the output routine.
1901  *
1902  *	Such an approach solves two big problems:
1903  *	1. Non-simplex devices are handled properly.
1904  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1905  */
1906 
1907 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1908 			       u8 tos, struct net_device *dev)
1909 {
1910 	struct fib_result res;
1911 	struct in_device *in_dev = in_dev_get(dev);
1912 	struct flowi fl = { .nl_u = { .ip4_u =
1913 				      { .daddr = daddr,
1914 					.saddr = saddr,
1915 					.tos = tos,
1916 					.scope = RT_SCOPE_UNIVERSE,
1917 				      } },
1918 			    .mark = skb->mark,
1919 			    .iif = dev->ifindex };
1920 	unsigned	flags = 0;
1921 	u32		itag = 0;
1922 	struct rtable * rth;
1923 	unsigned	hash;
1924 	__be32		spec_dst;
1925 	int		err = -EINVAL;
1926 	int		free_res = 0;
1927 
1928 	/* IP on this device is disabled. */
1929 
1930 	if (!in_dev)
1931 		goto out;
1932 
1933 	/* Check for the most obvious martian addresses, which may go
1934 	   undetected by fib_lookup.
1935 	 */
1936 
1937 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1938 		goto martian_source;
1939 
1940 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1941 		goto brd_input;
1942 
1943 	/* Accept a zero source address only for limited broadcast;
1944 	 * I am not even sure whether this needs fixing. Waiting for complaints :-)
1945 	 */
1946 	if (ZERONET(saddr))
1947 		goto martian_source;
1948 
1949 	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1950 		goto martian_destination;
1951 
1952 	/*
1953 	 *	Now we are ready to route the packet.
1954 	 */
1955 	if ((err = fib_lookup(&fl, &res)) != 0) {
1956 		if (!IN_DEV_FORWARD(in_dev))
1957 			goto e_hostunreach;
1958 		goto no_route;
1959 	}
1960 	free_res = 1;
1961 
1962 	RT_CACHE_STAT_INC(in_slow_tot);
1963 
1964 	if (res.type == RTN_BROADCAST)
1965 		goto brd_input;
1966 
1967 	if (res.type == RTN_LOCAL) {
1968 		int result;
1969 		result = fib_validate_source(saddr, daddr, tos,
1970 					     loopback_dev.ifindex,
1971 					     dev, &spec_dst, &itag);
1972 		if (result < 0)
1973 			goto martian_source;
1974 		if (result)
1975 			flags |= RTCF_DIRECTSRC;
1976 		spec_dst = daddr;
1977 		goto local_input;
1978 	}
1979 
1980 	if (!IN_DEV_FORWARD(in_dev))
1981 		goto e_hostunreach;
1982 	if (res.type != RTN_UNICAST)
1983 		goto martian_destination;
1984 
1985 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1986 	if (err == -ENOBUFS)
1987 		goto e_nobufs;
1988 	if (err == -EINVAL)
1989 		goto e_inval;
1990 
1991 done:
1992 	in_dev_put(in_dev);
1993 	if (free_res)
1994 		fib_res_put(&res);
1995 out:	return err;
1996 
1997 brd_input:
1998 	if (skb->protocol != htons(ETH_P_IP))
1999 		goto e_inval;
2000 
2001 	if (ZERONET(saddr))
2002 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2003 	else {
2004 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2005 					  &itag);
2006 		if (err < 0)
2007 			goto martian_source;
2008 		if (err)
2009 			flags |= RTCF_DIRECTSRC;
2010 	}
2011 	flags |= RTCF_BROADCAST;
2012 	res.type = RTN_BROADCAST;
2013 	RT_CACHE_STAT_INC(in_brd);
2014 
2015 local_input:
2016 	rth = dst_alloc(&ipv4_dst_ops);
2017 	if (!rth)
2018 		goto e_nobufs;
2019 
2020 	rth->u.dst.output= ip_rt_bug;
2021 
2022 	atomic_set(&rth->u.dst.__refcnt, 1);
2023 	rth->u.dst.flags= DST_HOST;
2024 	if (in_dev->cnf.no_policy)
2025 		rth->u.dst.flags |= DST_NOPOLICY;
2026 	rth->fl.fl4_dst	= daddr;
2027 	rth->rt_dst	= daddr;
2028 	rth->fl.fl4_tos	= tos;
2029 	rth->fl.mark    = skb->mark;
2030 	rth->fl.fl4_src	= saddr;
2031 	rth->rt_src	= saddr;
2032 #ifdef CONFIG_NET_CLS_ROUTE
2033 	rth->u.dst.tclassid = itag;
2034 #endif
2035 	rth->rt_iif	=
2036 	rth->fl.iif	= dev->ifindex;
2037 	rth->u.dst.dev	= &loopback_dev;
2038 	dev_hold(rth->u.dst.dev);
2039 	rth->idev	= in_dev_get(rth->u.dst.dev);
2040 	rth->rt_gateway	= daddr;
2041 	rth->rt_spec_dst= spec_dst;
2042 	rth->u.dst.input= ip_local_deliver;
2043 	rth->rt_flags 	= flags|RTCF_LOCAL;
2044 	if (res.type == RTN_UNREACHABLE) {
2045 		rth->u.dst.input= ip_error;
2046 		rth->u.dst.error= -err;
2047 		rth->rt_flags 	&= ~RTCF_LOCAL;
2048 	}
2049 	rth->rt_type	= res.type;
2050 	hash = rt_hash(daddr, saddr, fl.iif);
2051 	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2052 	goto done;
2053 
2054 no_route:
2055 	RT_CACHE_STAT_INC(in_no_route);
2056 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2057 	res.type = RTN_UNREACHABLE;
2058 	goto local_input;
2059 
2060 	/*
2061 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2062 	 */
2063 martian_destination:
2064 	RT_CACHE_STAT_INC(in_martian_dst);
2065 #ifdef CONFIG_IP_ROUTE_VERBOSE
2066 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2067 		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2068 			"%u.%u.%u.%u, dev %s\n",
2069 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2070 #endif
2071 
2072 e_hostunreach:
2073 	err = -EHOSTUNREACH;
2074 	goto done;
2075 
2076 e_inval:
2077 	err = -EINVAL;
2078 	goto done;
2079 
2080 e_nobufs:
2081 	err = -ENOBUFS;
2082 	goto done;
2083 
2084 martian_source:
2085 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2086 	goto e_inval;
2087 }
2088 
2089 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2090 		   u8 tos, struct net_device *dev)
2091 {
2092 	struct rtable * rth;
2093 	unsigned	hash;
2094 	int iif = dev->ifindex;
2095 
2096 	tos &= IPTOS_RT_MASK;
2097 	hash = rt_hash(daddr, saddr, iif);
2098 
2099 	rcu_read_lock();
2100 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2101 	     rth = rcu_dereference(rth->u.rt_next)) {
2102 		if (rth->fl.fl4_dst == daddr &&
2103 		    rth->fl.fl4_src == saddr &&
2104 		    rth->fl.iif == iif &&
2105 		    rth->fl.oif == 0 &&
2106 		    rth->fl.mark == skb->mark &&
2107 		    rth->fl.fl4_tos == tos) {
2108 			rth->u.dst.lastuse = jiffies;
2109 			dst_hold(&rth->u.dst);
2110 			rth->u.dst.__use++;
2111 			RT_CACHE_STAT_INC(in_hit);
2112 			rcu_read_unlock();
2113 			skb->dst = (struct dst_entry*)rth;
2114 			return 0;
2115 		}
2116 		RT_CACHE_STAT_INC(in_hlist_search);
2117 	}
2118 	rcu_read_unlock();
2119 
2120 	/* Multicast recognition logic has moved from the route cache to here.
2121 	   The problem was that too many Ethernet cards have broken/missing
2122 	   hardware multicast filters :-( As a result, a host on a multicast
2123 	   network acquires a lot of useless route cache entries, e.g. from
2124 	   SDR announcements from all over the world. Now we try to get rid of them.
2125 	   Provided the software IP multicast filter is organized
2126 	   reasonably (at least hashed), this does not cause a slowdown
2127 	   compared with route cache reject entries.
2128 	   Note that multicast routers are not affected, because a
2129 	   route cache entry is created for them eventually.
2130 	 */
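	/* Example (assuming the host has not joined the group and is not a
	   multicast router): a SAP session announcement to 224.2.127.254
	   is rejected with -EINVAL here instead of planting a useless
	   reject entry in the route cache. */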
2131 	if (MULTICAST(daddr)) {
2132 		struct in_device *in_dev;
2133 
2134 		rcu_read_lock();
2135 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2136 			int our = ip_check_mc(in_dev, daddr, saddr,
2137 				skb->nh.iph->protocol);
2138 			if (our
2139 #ifdef CONFIG_IP_MROUTE
2140 			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2141 #endif
2142 			    ) {
2143 				rcu_read_unlock();
2144 				return ip_route_input_mc(skb, daddr, saddr,
2145 							 tos, dev, our);
2146 			}
2147 		}
2148 		rcu_read_unlock();
2149 		return -EINVAL;
2150 	}
2151 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2152 }
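
/* Minimal caller sketch (illustrative only; it mirrors how the IP receive
 * path, e.g. ip_rcv_finish, typically uses ip_route_input; error handling
 * is abbreviated):
 *
 *	struct iphdr *iph = skb->nh.iph;
 *
 *	if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev))
 *		goto drop;		(no route: free the skb)
 *	return dst_input(skb);		(continue via skb->dst->input)
 */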
2153 
2154 static inline int __mkroute_output(struct rtable **result,
2155 				   struct fib_result* res,
2156 				   const struct flowi *fl,
2157 				   const struct flowi *oldflp,
2158 				   struct net_device *dev_out,
2159 				   unsigned flags)
2160 {
2161 	struct rtable *rth;
2162 	struct in_device *in_dev;
2163 	u32 tos = RT_FL_TOS(oldflp);
2164 	int err = 0;
2165 
2166 	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2167 		return -EINVAL;
2168 
2169 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2170 		res->type = RTN_BROADCAST;
2171 	else if (MULTICAST(fl->fl4_dst))
2172 		res->type = RTN_MULTICAST;
2173 	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2174 		return -EINVAL;
2175 
2176 	if (dev_out->flags & IFF_LOOPBACK)
2177 		flags |= RTCF_LOCAL;
2178 
2179 	/* get a working reference to the inet device */
2180 	in_dev = in_dev_get(dev_out);
2181 	if (!in_dev)
2182 		return -EINVAL;
2183 
2184 	if (res->type == RTN_BROADCAST) {
2185 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2186 		if (res->fi) {
2187 			fib_info_put(res->fi);
2188 			res->fi = NULL;
2189 		}
2190 	} else if (res->type == RTN_MULTICAST) {
2191 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2192 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2193 				 oldflp->proto))
2194 			flags &= ~RTCF_LOCAL;
2195 		/* If no multicast route exists, use the
2196 		   default one, but do not go via a gateway in that case.
2197 		   Yes, it is a hack.
2198 		 */
2199 		if (res->fi && res->prefixlen < 4) {
2200 			fib_info_put(res->fi);
2201 			res->fi = NULL;
2202 		}
2203 	}
2204 
2205 
2206 	rth = dst_alloc(&ipv4_dst_ops);
2207 	if (!rth) {
2208 		err = -ENOBUFS;
2209 		goto cleanup;
2210 	}
2211 
2212 	atomic_set(&rth->u.dst.__refcnt, 1);
2213 	rth->u.dst.flags= DST_HOST;
2214 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2215 	if (res->fi) {
2216 		rth->rt_multipath_alg = res->fi->fib_mp_alg;
2217 		if (res->fi->fib_nhs > 1)
2218 			rth->u.dst.flags |= DST_BALANCED;
2219 	}
2220 #endif
2221 	if (in_dev->cnf.no_xfrm)
2222 		rth->u.dst.flags |= DST_NOXFRM;
2223 	if (in_dev->cnf.no_policy)
2224 		rth->u.dst.flags |= DST_NOPOLICY;
2225 
2226 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2227 	rth->fl.fl4_tos	= tos;
2228 	rth->fl.fl4_src	= oldflp->fl4_src;
2229 	rth->fl.oif	= oldflp->oif;
2230 	rth->fl.mark    = oldflp->mark;
2231 	rth->rt_dst	= fl->fl4_dst;
2232 	rth->rt_src	= fl->fl4_src;
2233 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2234 	/* get references to the devices that are to be held by the routing
2235 	   cache entry */
2236 	rth->u.dst.dev	= dev_out;
2237 	dev_hold(dev_out);
2238 	rth->idev	= in_dev_get(dev_out);
2239 	rth->rt_gateway = fl->fl4_dst;
2240 	rth->rt_spec_dst= fl->fl4_src;
2241 
2242 	rth->u.dst.output=ip_output;
2243 
2244 	RT_CACHE_STAT_INC(out_slow_tot);
2245 
2246 	if (flags & RTCF_LOCAL) {
2247 		rth->u.dst.input = ip_local_deliver;
2248 		rth->rt_spec_dst = fl->fl4_dst;
2249 	}
2250 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2251 		rth->rt_spec_dst = fl->fl4_src;
2252 		if (flags & RTCF_LOCAL &&
2253 		    !(dev_out->flags & IFF_LOOPBACK)) {
2254 			rth->u.dst.output = ip_mc_output;
2255 			RT_CACHE_STAT_INC(out_slow_mc);
2256 		}
2257 #ifdef CONFIG_IP_MROUTE
2258 		if (res->type == RTN_MULTICAST) {
2259 			if (IN_DEV_MFORWARD(in_dev) &&
2260 			    !LOCAL_MCAST(oldflp->fl4_dst)) {
2261 				rth->u.dst.input = ip_mr_input;
2262 				rth->u.dst.output = ip_mc_output;
2263 			}
2264 		}
2265 #endif
2266 	}
2267 
2268 	rt_set_nexthop(rth, res, 0);
2269 
2270 	rth->rt_flags = flags;
2271 
2272 	*result = rth;
2273  cleanup:
2274 	/* release the working reference to the inet device */
2275 	in_dev_put(in_dev);
2276 
2277 	return err;
2278 }
2279 
2280 static inline int ip_mkroute_output_def(struct rtable **rp,
2281 					struct fib_result* res,
2282 					const struct flowi *fl,
2283 					const struct flowi *oldflp,
2284 					struct net_device *dev_out,
2285 					unsigned flags)
2286 {
2287 	struct rtable *rth = NULL;
2288 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2289 	unsigned hash;
2290 	if (err == 0) {
2291 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2292 		err = rt_intern_hash(hash, rth, rp);
2293 	}
2294 
2295 	return err;
2296 }
2297 
2298 static inline int ip_mkroute_output(struct rtable** rp,
2299 				    struct fib_result* res,
2300 				    const struct flowi *fl,
2301 				    const struct flowi *oldflp,
2302 				    struct net_device *dev_out,
2303 				    unsigned flags)
2304 {
2305 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2306 	unsigned char hop;
2307 	unsigned hash;
2308 	int err = -EINVAL;
2309 	struct rtable *rth = NULL;
2310 
2311 	if (res->fi && res->fi->fib_nhs > 1) {
2312 		unsigned char hopcount = res->fi->fib_nhs;
2313 
2314 		for (hop = 0; hop < hopcount; hop++) {
2315 			struct net_device *dev2nexthop;
2316 
2317 			res->nh_sel = hop;
2318 
2319 			/* hold a work reference to the output device */
2320 			dev2nexthop = FIB_RES_DEV(*res);
2321 			dev_hold(dev2nexthop);
2322 
2323 			/* put reference to previous result */
2324 			if (hop)
2325 				ip_rt_put(*rp);
2326 
2327 			err = __mkroute_output(&rth, res, fl, oldflp,
2328 					       dev2nexthop, flags);
2329 
2330 			if (err != 0)
2331 				goto cleanup;
2332 
2333 			hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
2334 					oldflp->oif);
2335 			err = rt_intern_hash(hash, rth, rp);
2336 
2337 			/* forward hop information to multipath impl. */
2338 			multipath_set_nhinfo(rth,
2339 					     FIB_RES_NETWORK(*res),
2340 					     FIB_RES_NETMASK(*res),
2341 					     res->prefixlen,
2342 					     &FIB_RES_NH(*res));
2343 		cleanup:
2344 			/* release work reference to output device */
2345 			dev_put(dev2nexthop);
2346 
2347 			if (err != 0)
2348 				return err;
2349 		}
2350 		return err;
2351 	} else {
2352 		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2353 					     flags);
2354 	}
2355 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2356 	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2357 #endif
2358 }
2359 
2360 /*
2361  * Major route resolver routine.
2362  */
2363 
2364 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2365 {
2366 	u32 tos	= RT_FL_TOS(oldflp);
2367 	struct flowi fl = { .nl_u = { .ip4_u =
2368 				      { .daddr = oldflp->fl4_dst,
2369 					.saddr = oldflp->fl4_src,
2370 					.tos = tos & IPTOS_RT_MASK,
2371 					.scope = ((tos & RTO_ONLINK) ?
2372 						  RT_SCOPE_LINK :
2373 						  RT_SCOPE_UNIVERSE),
2374 				      } },
2375 			    .mark = oldflp->mark,
2376 			    .iif = loopback_dev.ifindex,
2377 			    .oif = oldflp->oif };
2378 	struct fib_result res;
2379 	unsigned flags = 0;
2380 	struct net_device *dev_out = NULL;
2381 	int free_res = 0;
2382 	int err;
2383 
2384 
2385 	res.fi		= NULL;
2386 #ifdef CONFIG_IP_MULTIPLE_TABLES
2387 	res.r		= NULL;
2388 #endif
2389 
2390 	if (oldflp->fl4_src) {
2391 		err = -EINVAL;
2392 		if (MULTICAST(oldflp->fl4_src) ||
2393 		    BADCLASS(oldflp->fl4_src) ||
2394 		    ZERONET(oldflp->fl4_src))
2395 			goto out;
2396 
2397 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2398 		dev_out = ip_dev_find(oldflp->fl4_src);
2399 		if (dev_out == NULL)
2400 			goto out;
2401 
2402 		/* I removed the check for oif == dev_out->oif here.
2403 		   It was wrong for two reasons:
2404 		   1. ip_dev_find(saddr) can return the wrong iface if saddr is
2405 		      assigned to multiple interfaces.
2406 		   2. Moreover, we are allowed to send packets with a saddr
2407 		      belonging to another iface. --ANK
2408 		 */
2409 
2410 		if (oldflp->oif == 0
2411 		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2412 			/* Special hack: the user can direct multicasts
2413 			   and limited broadcast via the desired interface
2414 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2415 			   This hack is not just for fun; it allows
2416 			   vic, vat and friends to work.
2417 			   They bind the socket to loopback, set ttl to zero
2418 			   and expect that it will work.
2419 			   From the viewpoint of the routing cache they are broken,
2420 			   because we are not allowed to build a multicast path
2421 			   with a loopback source addr (the routing cache
2422 			   cannot know that ttl is zero, so that the packet
2423 			   will never leave this host and the route is valid).
2424 			   Luckily, this hack is a good workaround.
2425 			 */
2426 
2427 			fl.oif = dev_out->ifindex;
2428 			goto make_route;
2429 		}
2430 		if (dev_out)
2431 			dev_put(dev_out);
2432 		dev_out = NULL;
2433 	}
2434 
2435 
2436 	if (oldflp->oif) {
2437 		dev_out = dev_get_by_index(oldflp->oif);
2438 		err = -ENODEV;
2439 		if (dev_out == NULL)
2440 			goto out;
2441 
2442 		/* RACE: Check return value of inet_select_addr instead. */
2443 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2444 			dev_put(dev_out);
2445 			goto out;	/* Wrong error code */
2446 		}
2447 
2448 		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2449 			if (!fl.fl4_src)
2450 				fl.fl4_src = inet_select_addr(dev_out, 0,
2451 							      RT_SCOPE_LINK);
2452 			goto make_route;
2453 		}
2454 		if (!fl.fl4_src) {
2455 			if (MULTICAST(oldflp->fl4_dst))
2456 				fl.fl4_src = inet_select_addr(dev_out, 0,
2457 							      fl.fl4_scope);
2458 			else if (!oldflp->fl4_dst)
2459 				fl.fl4_src = inet_select_addr(dev_out, 0,
2460 							      RT_SCOPE_HOST);
2461 		}
2462 	}
2463 
2464 	if (!fl.fl4_dst) {
2465 		fl.fl4_dst = fl.fl4_src;
2466 		if (!fl.fl4_dst)
2467 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2468 		if (dev_out)
2469 			dev_put(dev_out);
2470 		dev_out = &loopback_dev;
2471 		dev_hold(dev_out);
2472 		fl.oif = loopback_dev.ifindex;
2473 		res.type = RTN_LOCAL;
2474 		flags |= RTCF_LOCAL;
2475 		goto make_route;
2476 	}
2477 
2478 	if (fib_lookup(&fl, &res)) {
2479 		res.fi = NULL;
2480 		if (oldflp->oif) {
2481 			/* Apparently, the routing tables are wrong. Assume
2482 			   that the destination is on-link.
2483 
2484 			   WHY? DW.
2485 			   Because we are allowed to send to an iface
2486 			   even if it has NO routes and NO assigned
2487 			   addresses. When oif is specified, the routing
2488 			   tables are looked up with only one purpose:
2489 			   to catch whether the destination is gatewayed rather than
2490 			   direct. Moreover, if MSG_DONTROUTE is set,
2491 			   we send the packet, ignoring both routing tables
2492 			   and ifaddr state. --ANK
2493 
2494 
2495 			   We could do this even when oif is unknown,
2496 			   as IPv6 likely does, but we do not.
2497 			 */
2498 
2499 			if (fl.fl4_src == 0)
2500 				fl.fl4_src = inet_select_addr(dev_out, 0,
2501 							      RT_SCOPE_LINK);
2502 			res.type = RTN_UNICAST;
2503 			goto make_route;
2504 		}
2505 		if (dev_out)
2506 			dev_put(dev_out);
2507 		err = -ENETUNREACH;
2508 		goto out;
2509 	}
2510 	free_res = 1;
2511 
2512 	if (res.type == RTN_LOCAL) {
2513 		if (!fl.fl4_src)
2514 			fl.fl4_src = fl.fl4_dst;
2515 		if (dev_out)
2516 			dev_put(dev_out);
2517 		dev_out = &loopback_dev;
2518 		dev_hold(dev_out);
2519 		fl.oif = dev_out->ifindex;
2520 		if (res.fi)
2521 			fib_info_put(res.fi);
2522 		res.fi = NULL;
2523 		flags |= RTCF_LOCAL;
2524 		goto make_route;
2525 	}
2526 
2527 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2528 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2529 		fib_select_multipath(&fl, &res);
2530 	else
2531 #endif
2532 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2533 		fib_select_default(&fl, &res);
2534 
2535 	if (!fl.fl4_src)
2536 		fl.fl4_src = FIB_RES_PREFSRC(res);
2537 
2538 	if (dev_out)
2539 		dev_put(dev_out);
2540 	dev_out = FIB_RES_DEV(res);
2541 	dev_hold(dev_out);
2542 	fl.oif = dev_out->ifindex;
2543 
2544 
2545 make_route:
2546 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2547 
2548 
2549 	if (free_res)
2550 		fib_res_put(&res);
2551 	if (dev_out)
2552 		dev_put(dev_out);
2553 out:	return err;
2554 }
2555 
2556 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2557 {
2558 	unsigned hash;
2559 	struct rtable *rth;
2560 
2561 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2562 
2563 	rcu_read_lock_bh();
2564 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2565 		rth = rcu_dereference(rth->u.rt_next)) {
2566 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2567 		    rth->fl.fl4_src == flp->fl4_src &&
2568 		    rth->fl.iif == 0 &&
2569 		    rth->fl.oif == flp->oif &&
2570 		    rth->fl.mark == flp->mark &&
2571 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2572 			    (IPTOS_RT_MASK | RTO_ONLINK))) {
2573 
2574 			/* check for multipath routes and choose one if
2575 			 * necessary
2576 			 */
2577 			if (multipath_select_route(flp, rth, rp)) {
2578 				dst_hold(&(*rp)->u.dst);
2579 				RT_CACHE_STAT_INC(out_hit);
2580 				rcu_read_unlock_bh();
2581 				return 0;
2582 			}
2583 
2584 			rth->u.dst.lastuse = jiffies;
2585 			dst_hold(&rth->u.dst);
2586 			rth->u.dst.__use++;
2587 			RT_CACHE_STAT_INC(out_hit);
2588 			rcu_read_unlock_bh();
2589 			*rp = rth;
2590 			return 0;
2591 		}
2592 		RT_CACHE_STAT_INC(out_hlist_search);
2593 	}
2594 	rcu_read_unlock_bh();
2595 
2596 	return ip_route_output_slow(rp, flp);
2597 }
2598 
2599 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2600 
2601 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2602 {
2603 	int err;
2604 
2605 	if ((err = __ip_route_output_key(rp, flp)) != 0)
2606 		return err;
2607 
2608 	if (flp->proto) {
2609 		if (!flp->fl4_src)
2610 			flp->fl4_src = (*rp)->rt_src;
2611 		if (!flp->fl4_dst)
2612 			flp->fl4_dst = (*rp)->rt_dst;
2613 		return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2614 	}
2615 
2616 	return 0;
2617 }
2618 
2619 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2620 
2621 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2622 {
2623 	return ip_route_output_flow(rp, flp, NULL, 0);
2624 }
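
/* Minimal caller sketch (illustrative only; parallels how callers such as
 * icmp_send perform an output lookup; dst_ip and tos are placeholders and
 * error handling is abbreviated):
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst_ip,
 *						 .tos = RT_TOS(tos) } } };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(&rt, &fl))
 *		return;			(no route to dst_ip)
 *	(... transmit using rt->u.dst ...)
 *	ip_rt_put(rt);			(drop the cache reference)
 */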
2625 
2626 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2627 			int nowait, unsigned int flags)
2628 {
2629 	struct rtable *rt = (struct rtable*)skb->dst;
2630 	struct rtmsg *r;
2631 	struct nlmsghdr *nlh;
2632 	long expires;
2633 	u32 id = 0, ts = 0, tsage = 0, error;
2634 
2635 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2636 	if (nlh == NULL)
2637 		return -ENOBUFS;
2638 
2639 	r = nlmsg_data(nlh);
2640 	r->rtm_family	 = AF_INET;
2641 	r->rtm_dst_len	= 32;
2642 	r->rtm_src_len	= 0;
2643 	r->rtm_tos	= rt->fl.fl4_tos;
2644 	r->rtm_table	= RT_TABLE_MAIN;
2645 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2646 	r->rtm_type	= rt->rt_type;
2647 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2648 	r->rtm_protocol = RTPROT_UNSPEC;
2649 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2650 	if (rt->rt_flags & RTCF_NOTIFY)
2651 		r->rtm_flags |= RTM_F_NOTIFY;
2652 
2653 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2654 
2655 	if (rt->fl.fl4_src) {
2656 		r->rtm_src_len = 32;
2657 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2658 	}
2659 	if (rt->u.dst.dev)
2660 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2661 #ifdef CONFIG_NET_CLS_ROUTE
2662 	if (rt->u.dst.tclassid)
2663 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2664 #endif
2665 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2666 	if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2667 		NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
2668 #endif
2669 	if (rt->fl.iif)
2670 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2671 	else if (rt->rt_src != rt->fl.fl4_src)
2672 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2673 
2674 	if (rt->rt_dst != rt->rt_gateway)
2675 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2676 
2677 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2678 		goto nla_put_failure;
2679 
2680 	error = rt->u.dst.error;
2681 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2682 	if (rt->peer) {
2683 		id = rt->peer->ip_id_count;
2684 		if (rt->peer->tcp_ts_stamp) {
2685 			ts = rt->peer->tcp_ts;
2686 			tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2687 		}
2688 	}
2689 
2690 	if (rt->fl.iif) {
2691 #ifdef CONFIG_IP_MROUTE
2692 		__be32 dst = rt->rt_dst;
2693 
2694 		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2695 		    ipv4_devconf.mc_forwarding) {
2696 			int err = ipmr_get_route(skb, r, nowait);
2697 			if (err <= 0) {
2698 				if (!nowait) {
2699 					if (err == 0)
2700 						return 0;
2701 					goto nla_put_failure;
2702 				} else {
2703 					if (err == -EMSGSIZE)
2704 						goto nla_put_failure;
2705 					error = err;
2706 				}
2707 			}
2708 		} else
2709 #endif
2710 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2711 	}
2712 
2713 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2714 			       expires, error) < 0)
2715 		goto nla_put_failure;
2716 
2717 	return nlmsg_end(skb, nlh);
2718 
2719 nla_put_failure:
2720 	return nlmsg_cancel(skb, nlh);
2721 }
2722 
2723 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2724 {
2725 	struct rtmsg *rtm;
2726 	struct nlattr *tb[RTA_MAX+1];
2727 	struct rtable *rt = NULL;
2728 	__be32 dst = 0;
2729 	__be32 src = 0;
2730 	u32 iif;
2731 	int err;
2732 	struct sk_buff *skb;
2733 
2734 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2735 	if (err < 0)
2736 		goto errout;
2737 
2738 	rtm = nlmsg_data(nlh);
2739 
2740 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2741 	if (skb == NULL) {
2742 		err = -ENOBUFS;
2743 		goto errout;
2744 	}
2745 
2746 	/* Reserve room for dummy headers; this skb can pass
2747 	   through a good chunk of the routing engine.
2748 	 */
2749 	skb->mac.raw = skb->nh.raw = skb->data;
2750 
2751 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2752 	skb->nh.iph->protocol = IPPROTO_ICMP;
2753 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2754 
2755 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2756 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2757 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2758 
2759 	if (iif) {
2760 		struct net_device *dev;
2761 
2762 		dev = __dev_get_by_index(iif);
2763 		if (dev == NULL) {
2764 			err = -ENODEV;
2765 			goto errout_free;
2766 		}
2767 
2768 		skb->protocol	= htons(ETH_P_IP);
2769 		skb->dev	= dev;
2770 		local_bh_disable();
2771 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2772 		local_bh_enable();
2773 
2774 		rt = (struct rtable*) skb->dst;
2775 		if (err == 0 && rt->u.dst.error)
2776 			err = -rt->u.dst.error;
2777 	} else {
2778 		struct flowi fl = {
2779 			.nl_u = {
2780 				.ip4_u = {
2781 					.daddr = dst,
2782 					.saddr = src,
2783 					.tos = rtm->rtm_tos,
2784 				},
2785 			},
2786 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2787 		};
2788 		err = ip_route_output_key(&rt, &fl);
2789 	}
2790 
2791 	if (err)
2792 		goto errout_free;
2793 
2794 	skb->dst = &rt->u.dst;
2795 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2796 		rt->rt_flags |= RTCF_NOTIFY;
2797 
2798 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2799 				RTM_NEWROUTE, 0, 0);
2800 	if (err <= 0)
2801 		goto errout_free;
2802 
2803 	err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2804 errout:
2805 	return err;
2806 
2807 errout_free:
2808 	kfree_skb(skb);
2809 	goto errout;
2810 }
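
/* For reference: this RTM_GETROUTE handler is what userspace route queries
 * such as "ip route get <addr>" (iproute2) end up invoking; the reply is a
 * single RTM_NEWROUTE message built by rt_fill_info() above.
 */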
2811 
2812 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2813 {
2814 	struct rtable *rt;
2815 	int h, s_h;
2816 	int idx, s_idx;
2817 
2818 	s_h = cb->args[0];
2819 	s_idx = idx = cb->args[1];
2820 	for (h = 0; h <= rt_hash_mask; h++) {
2821 		if (h < s_h) continue;
2822 		if (h > s_h)
2823 			s_idx = 0;
2824 		rcu_read_lock_bh();
2825 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2826 		     rt = rcu_dereference(rt->u.rt_next), idx++) {
2827 			if (idx < s_idx)
2828 				continue;
2829 			skb->dst = dst_clone(&rt->u.dst);
2830 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2831 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2832 					 1, NLM_F_MULTI) <= 0) {
2833 				dst_release(xchg(&skb->dst, NULL));
2834 				rcu_read_unlock_bh();
2835 				goto done;
2836 			}
2837 			dst_release(xchg(&skb->dst, NULL));
2838 		}
2839 		rcu_read_unlock_bh();
2840 	}
2841 
2842 done:
2843 	cb->args[0] = h;
2844 	cb->args[1] = idx;
2845 	return skb->len;
2846 }
2847 
2848 void ip_rt_multicast_event(struct in_device *in_dev)
2849 {
2850 	rt_cache_flush(0);
2851 }
2852 
2853 #ifdef CONFIG_SYSCTL
2854 static int flush_delay;
2855 
2856 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2857 					struct file *filp, void __user *buffer,
2858 					size_t *lenp, loff_t *ppos)
2859 {
2860 	if (write) {
2861 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2862 		rt_cache_flush(flush_delay);
2863 		return 0;
2864 	}
2865 
2866 	return -EINVAL;
2867 }
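
/* Illustrative usage from userspace (assuming the table below is registered
 * under the usual /proc/sys/net/ipv4/route/ directory): writing an integer
 * delay to the "flush" entry invokes rt_cache_flush(); zero requests an
 * immediate flush, e.g.:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 */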
2868 
2869 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2870 						int __user *name,
2871 						int nlen,
2872 						void __user *oldval,
2873 						size_t __user *oldlenp,
2874 						void __user *newval,
2875 						size_t newlen,
2876 						void **context)
2877 {
2878 	int delay;
2879 	if (newlen != sizeof(int))
2880 		return -EINVAL;
2881 	if (get_user(delay, (int __user *)newval))
2882 		return -EFAULT;
2883 	rt_cache_flush(delay);
2884 	return 0;
2885 }
2886 
2887 ctl_table ipv4_route_table[] = {
2888         {
2889 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2890 		.procname	= "flush",
2891 		.data		= &flush_delay,
2892 		.maxlen		= sizeof(int),
2893 		.mode		= 0200,
2894 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2895 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2896 	},
2897 	{
2898 		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
2899 		.procname	= "min_delay",
2900 		.data		= &ip_rt_min_delay,
2901 		.maxlen		= sizeof(int),
2902 		.mode		= 0644,
2903 		.proc_handler	= &proc_dointvec_jiffies,
2904 		.strategy	= &sysctl_jiffies,
2905 	},
2906 	{
2907 		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
2908 		.procname	= "max_delay",
2909 		.data		= &ip_rt_max_delay,
2910 		.maxlen		= sizeof(int),
2911 		.mode		= 0644,
2912 		.proc_handler	= &proc_dointvec_jiffies,
2913 		.strategy	= &sysctl_jiffies,
2914 	},
2915 	{
2916 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2917 		.procname	= "gc_thresh",
2918 		.data		= &ipv4_dst_ops.gc_thresh,
2919 		.maxlen		= sizeof(int),
2920 		.mode		= 0644,
2921 		.proc_handler	= &proc_dointvec,
2922 	},
2923 	{
2924 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2925 		.procname	= "max_size",
2926 		.data		= &ip_rt_max_size,
2927 		.maxlen		= sizeof(int),
2928 		.mode		= 0644,
2929 		.proc_handler	= &proc_dointvec,
2930 	},
2931 	{
2932 		/*  Deprecated. Use gc_min_interval_ms */
2933 
2934 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2935 		.procname	= "gc_min_interval",
2936 		.data		= &ip_rt_gc_min_interval,
2937 		.maxlen		= sizeof(int),
2938 		.mode		= 0644,
2939 		.proc_handler	= &proc_dointvec_jiffies,
2940 		.strategy	= &sysctl_jiffies,
2941 	},
2942 	{
2943 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2944 		.procname	= "gc_min_interval_ms",
2945 		.data		= &ip_rt_gc_min_interval,
2946 		.maxlen		= sizeof(int),
2947 		.mode		= 0644,
2948 		.proc_handler	= &proc_dointvec_ms_jiffies,
2949 		.strategy	= &sysctl_ms_jiffies,
2950 	},
2951 	{
2952 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2953 		.procname	= "gc_timeout",
2954 		.data		= &ip_rt_gc_timeout,
2955 		.maxlen		= sizeof(int),
2956 		.mode		= 0644,
2957 		.proc_handler	= &proc_dointvec_jiffies,
2958 		.strategy	= &sysctl_jiffies,
2959 	},
2960 	{
2961 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2962 		.procname	= "gc_interval",
2963 		.data		= &ip_rt_gc_interval,
2964 		.maxlen		= sizeof(int),
2965 		.mode		= 0644,
2966 		.proc_handler	= &proc_dointvec_jiffies,
2967 		.strategy	= &sysctl_jiffies,
2968 	},
2969 	{
2970 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2971 		.procname	= "redirect_load",
2972 		.data		= &ip_rt_redirect_load,
2973 		.maxlen		= sizeof(int),
2974 		.mode		= 0644,
2975 		.proc_handler	= &proc_dointvec,
2976 	},
2977 	{
2978 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2979 		.procname	= "redirect_number",
2980 		.data		= &ip_rt_redirect_number,
2981 		.maxlen		= sizeof(int),
2982 		.mode		= 0644,
2983 		.proc_handler	= &proc_dointvec,
2984 	},
2985 	{
2986 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2987 		.procname	= "redirect_silence",
2988 		.data		= &ip_rt_redirect_silence,
2989 		.maxlen		= sizeof(int),
2990 		.mode		= 0644,
2991 		.proc_handler	= &proc_dointvec,
2992 	},
2993 	{
2994 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
2995 		.procname	= "error_cost",
2996 		.data		= &ip_rt_error_cost,
2997 		.maxlen		= sizeof(int),
2998 		.mode		= 0644,
2999 		.proc_handler	= &proc_dointvec,
3000 	},
3001 	{
3002 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
3003 		.procname	= "error_burst",
3004 		.data		= &ip_rt_error_burst,
3005 		.maxlen		= sizeof(int),
3006 		.mode		= 0644,
3007 		.proc_handler	= &proc_dointvec,
3008 	},
3009 	{
3010 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
3011 		.procname	= "gc_elasticity",
3012 		.data		= &ip_rt_gc_elasticity,
3013 		.maxlen		= sizeof(int),
3014 		.mode		= 0644,
3015 		.proc_handler	= &proc_dointvec,
3016 	},
3017 	{
3018 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
3019 		.procname	= "mtu_expires",
3020 		.data		= &ip_rt_mtu_expires,
3021 		.maxlen		= sizeof(int),
3022 		.mode		= 0644,
3023 		.proc_handler	= &proc_dointvec_jiffies,
3024 		.strategy	= &sysctl_jiffies,
3025 	},
3026 	{
3027 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
3028 		.procname	= "min_pmtu",
3029 		.data		= &ip_rt_min_pmtu,
3030 		.maxlen		= sizeof(int),
3031 		.mode		= 0644,
3032 		.proc_handler	= &proc_dointvec,
3033 	},
3034 	{
3035 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
3036 		.procname	= "min_adv_mss",
3037 		.data		= &ip_rt_min_advmss,
3038 		.maxlen		= sizeof(int),
3039 		.mode		= 0644,
3040 		.proc_handler	= &proc_dointvec,
3041 	},
3042 	{
3043 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3044 		.procname	= "secret_interval",
3045 		.data		= &ip_rt_secret_interval,
3046 		.maxlen		= sizeof(int),
3047 		.mode		= 0644,
3048 		.proc_handler	= &proc_dointvec_jiffies,
3049 		.strategy	= &sysctl_jiffies,
3050 	},
3051 	{ .ctl_name = 0 }
3052 };
3053 #endif
3054 
3055 #ifdef CONFIG_NET_CLS_ROUTE
3056 struct ip_rt_acct *ip_rt_acct;
3057 
3058 /* This code sucks.  But you should have seen it before! --RR */
3059 
3060 /* IP route accounting ptr for this logical cpu number. */
3061 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3062 
3063 #ifdef CONFIG_PROC_FS
3064 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3065 			   int length, int *eof, void *data)
3066 {
3067 	unsigned int i;
3068 
3069 	if ((offset & 3) || (length & 3))
3070 		return -EIO;
3071 
3072 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
3073 		*eof = 1;
3074 		return 0;
3075 	}
3076 
3077 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3078 		length = sizeof(struct ip_rt_acct) * 256 - offset;
3079 		*eof = 1;
3080 	}
3081 
3082 	offset /= sizeof(u32);
3083 
3084 	if (length > 0) {
3085 		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3086 		u32 *dst = (u32 *) buffer;
3087 
3088 		/* Copy first cpu. */
3089 		*start = buffer;
3090 		memcpy(dst, src, length);
3091 
3092 		/* Add the other cpus in, one int at a time */
3093 		for_each_possible_cpu(i) {
3094 			unsigned int j;
3095 
3096 			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3097 
3098 			for (j = 0; j < length/4; j++)
3099 				dst[j] += src[j];
3100 		}
3101 	}
3102 	return length;
3103 }
3104 #endif /* CONFIG_PROC_FS */
3105 #endif /* CONFIG_NET_CLS_ROUTE */
3106 
3107 static __initdata unsigned long rhash_entries;
3108 static int __init set_rhash_entries(char *str)
3109 {
3110 	if (!str)
3111 		return 0;
3112 	rhash_entries = simple_strtoul(str, &str, 0);
3113 	return 1;
3114 }
3115 __setup("rhash_entries=", set_rhash_entries);
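
/* Example of boot-time tuning (assuming the usual kernel command line):
 * passing "rhash_entries=65536" requests a route cache hash table of that
 * many buckets; when the parameter is absent, alloc_large_system_hash()
 * below sizes the table from available memory instead.
 */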
3116 
3117 int __init ip_rt_init(void)
3118 {
3119 	int rc = 0;
3120 
3121 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3122 			     (jiffies ^ (jiffies >> 7)));
3123 
3124 #ifdef CONFIG_NET_CLS_ROUTE
3125 	{
3126 	int order;
3127 	for (order = 0;
3128 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3129 		/* NOTHING */;
3130 	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3131 	if (!ip_rt_acct)
3132 		panic("IP: failed to allocate ip_rt_acct\n");
3133 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
3134 	}
3135 #endif
3136 
3137 	ipv4_dst_ops.kmem_cachep =
3138 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3139 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
3140 
3141 	rt_hash_table = (struct rt_hash_bucket *)
3142 		alloc_large_system_hash("IP route cache",
3143 					sizeof(struct rt_hash_bucket),
3144 					rhash_entries,
3145 					(num_physpages >= 128 * 1024) ?
3146 					15 : 17,
3147 					0,
3148 					&rt_hash_log,
3149 					&rt_hash_mask,
3150 					0);
3151 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3152 	rt_hash_lock_init();
3153 
3154 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3155 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3156 
3157 	devinet_init();
3158 	ip_fib_init();
3159 
3160 	init_timer(&rt_flush_timer);
3161 	rt_flush_timer.function = rt_run_flush;
3162 	init_timer(&rt_periodic_timer);
3163 	rt_periodic_timer.function = rt_check_expire;
3164 	init_timer(&rt_secret_timer);
3165 	rt_secret_timer.function = rt_secret_rebuild;
3166 
3167 	/* All the timers started at system startup tend
3168 	   to synchronize. Perturb them a bit.
3169 	 */
3170 	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3171 					ip_rt_gc_interval;
3172 	add_timer(&rt_periodic_timer);
3173 
3174 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3175 		ip_rt_secret_interval;
3176 	add_timer(&rt_secret_timer);
3177 
3178 #ifdef CONFIG_PROC_FS
3179 	{
3180 	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3181 	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3182 	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3183 			    		     proc_net_stat))) {
3184 		return -ENOMEM;
3185 	}
3186 	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3187 	}
3188 #ifdef CONFIG_NET_CLS_ROUTE
3189 	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3190 #endif
3191 #endif
3192 #ifdef CONFIG_XFRM
3193 	xfrm_init();
3194 	xfrm4_init();
3195 #endif
3196 	return rc;
3197 }
3198 
3199 EXPORT_SYMBOL(__ip_select_ident);
3200 EXPORT_SYMBOL(ip_route_input);
3201 EXPORT_SYMBOL(ip_route_output_key);
3202