xref: /linux/net/ipv4/route.c (revision b454cc6636d254fbf6049b73e9560aee76fb04a3)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *		Alan Cox	:	Verify area fixes.
18  *		Alan Cox	:	cli() protects routing changes
19  *		Rui Oliveira	:	ICMP routing table updates
20  *		(rco@di.uminho.pt)	Routing table insertion and update
21  *		Linus Torvalds	:	Rewrote bits to be sensible
22  *		Alan Cox	:	Added BSD route gw semantics
23  *		Alan Cox	:	Super /proc >4K
24  *		Alan Cox	:	MTU in route table
25  *		Alan Cox	: 	MSS actually. Also added the window
26  *					clamper.
27  *		Sam Lantinga	:	Fixed route matching in rt_del()
28  *		Alan Cox	:	Routing cache support.
29  *		Alan Cox	:	Removed compatibility cruft.
30  *		Alan Cox	:	RTF_REJECT support.
31  *		Alan Cox	:	TCP irtt support.
32  *		Jonathan Naylor	:	Added Metric support.
33  *	Miquel van Smoorenburg	:	BSD API fixes.
34  *	Miquel van Smoorenburg	:	Metrics.
35  *		Alan Cox	:	Use __u32 properly
36  *		Alan Cox	:	Aligned routing errors more closely with BSD
37  *					our system is still very different.
38  *		Alan Cox	:	Faster /proc handling
39  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40  *					routing caches and better behaviour.
41  *
42  *		Olaf Erb	:	irtt wasn't being copied right.
43  *		Bjorn Ekwall	:	Kerneld route support.
44  *		Alan Cox	:	Multicast fixed (I hope)
45  * 		Pavel Krauz	:	Limited broadcast fixed
46  *		Mike McLagan	:	Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
48  *					route.c and rewritten from scratch.
49  *		Andi Kleen	:	Load-limit warning messages.
50  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54  *		Marc Boucher	:	routing by fwmark
55  *	Robert Olsson		:	Added rt_cache statistics
56  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
57  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
58  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
59  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
60  *
61  *		This program is free software; you can redistribute it and/or
62  *		modify it under the terms of the GNU General Public License
63  *		as published by the Free Software Foundation; either version
64  *		2 of the License, or (at your option) any later version.
65  */
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/mm.h>
75 #include <linux/bootmem.h>
76 #include <linux/string.h>
77 #include <linux/socket.h>
78 #include <linux/sockios.h>
79 #include <linux/errno.h>
80 #include <linux/in.h>
81 #include <linux/inet.h>
82 #include <linux/netdevice.h>
83 #include <linux/proc_fs.h>
84 #include <linux/init.h>
85 #include <linux/skbuff.h>
86 #include <linux/rtnetlink.h>
87 #include <linux/inetdevice.h>
88 #include <linux/igmp.h>
89 #include <linux/pkt_sched.h>
90 #include <linux/mroute.h>
91 #include <linux/netfilter_ipv4.h>
92 #include <linux/random.h>
93 #include <linux/jhash.h>
94 #include <linux/rcupdate.h>
95 #include <linux/times.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/ip_mp_alg.h>
107 #include <net/netevent.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 
112 #define RT_FL_TOS(oldflp) \
113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 
115 #define IP_MAX_MTU	0xFFF0
116 
117 #define RT_GC_TIMEOUT (300*HZ)
118 
119 static int ip_rt_min_delay		= 2 * HZ;
120 static int ip_rt_max_delay		= 10 * HZ;
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval		= 60 * HZ;
124 static int ip_rt_gc_min_interval	= HZ / 2;
125 static int ip_rt_redirect_number	= 9;
126 static int ip_rt_redirect_load		= HZ / 50;
127 static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost		= HZ;
129 static int ip_rt_error_burst		= 5 * HZ;
130 static int ip_rt_gc_elasticity		= 8;
131 static int ip_rt_mtu_expires		= 10 * 60 * HZ;
132 static int ip_rt_min_pmtu		= 512 + 20 + 20;
133 static int ip_rt_min_advmss		= 256;
134 static int ip_rt_secret_interval	= 10 * 60 * HZ;
135 static unsigned long rt_deadline;
136 
137 #define RTprint(a...)	printk(KERN_DEBUG a)
138 
139 static struct timer_list rt_flush_timer;
140 static struct timer_list rt_periodic_timer;
141 static struct timer_list rt_secret_timer;
142 
143 /*
144  *	Interface to generic destination cache.
145  */
146 
147 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
148 static void		 ipv4_dst_destroy(struct dst_entry *dst);
149 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
150 					 struct net_device *dev, int how);
151 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
152 static void		 ipv4_link_failure(struct sk_buff *skb);
153 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
154 static int rt_garbage_collect(void);
155 
156 
157 static struct dst_ops ipv4_dst_ops = {
158 	.family =		AF_INET,
159 	.protocol =		__constant_htons(ETH_P_IP),
160 	.gc =			rt_garbage_collect,
161 	.check =		ipv4_dst_check,
162 	.destroy =		ipv4_dst_destroy,
163 	.ifdown =		ipv4_dst_ifdown,
164 	.negative_advice =	ipv4_negative_advice,
165 	.link_failure =		ipv4_link_failure,
166 	.update_pmtu =		ip_rt_update_pmtu,
167 	.entry_size =		sizeof(struct rtable),
168 };
169 
170 #define ECN_OR_COST(class)	TC_PRIO_##class
171 
172 __u8 ip_tos2prio[16] = {
173 	TC_PRIO_BESTEFFORT,
174 	ECN_OR_COST(FILLER),
175 	TC_PRIO_BESTEFFORT,
176 	ECN_OR_COST(BESTEFFORT),
177 	TC_PRIO_BULK,
178 	ECN_OR_COST(BULK),
179 	TC_PRIO_BULK,
180 	ECN_OR_COST(BULK),
181 	TC_PRIO_INTERACTIVE,
182 	ECN_OR_COST(INTERACTIVE),
183 	TC_PRIO_INTERACTIVE,
184 	ECN_OR_COST(INTERACTIVE),
185 	TC_PRIO_INTERACTIVE_BULK,
186 	ECN_OR_COST(INTERACTIVE_BULK),
187 	TC_PRIO_INTERACTIVE_BULK,
188 	ECN_OR_COST(INTERACTIVE_BULK)
189 };
190 
191 
192 /*
193  * Route cache.
194  */
195 
196 /* The locking scheme is rather straightforward:
197  *
198  * 1) Read-Copy Update protects the buckets of the central route hash.
199  * 2) Only writers remove entries, and they hold the lock
200  *    as they look at rtable reference counts.
201  * 3) Only readers acquire references to rtable entries,
202  *    they do so with atomic increments and with the
203  *    lock held.
204  */
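/*
 * Illustrative sketch (not part of the original code): under this scheme a
 * lock-free reader walks a chain roughly as
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.rt_next))
 *		if (compare_keys(&rth->fl, &fl)) {
 *			dst_hold(&rth->u.dst);
 *			break;
 *		}
 *	rcu_read_unlock_bh();
 *
 * while writers unlink entries only with spin_lock_bh(rt_hash_lock_addr(hash))
 * held, as rt_del() and rt_intern_hash() below do.
 */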
205 
206 struct rt_hash_bucket {
207 	struct rtable	*chain;
208 };
209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
210 	defined(CONFIG_PROVE_LOCKING)
211 /*
212  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
213  * The size of this table is a power of two and depends on the number of CPUs.
214  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
215  */
216 #ifdef CONFIG_LOCKDEP
217 # define RT_HASH_LOCK_SZ	256
218 #else
219 # if NR_CPUS >= 32
220 #  define RT_HASH_LOCK_SZ	4096
221 # elif NR_CPUS >= 16
222 #  define RT_HASH_LOCK_SZ	2048
223 # elif NR_CPUS >= 8
224 #  define RT_HASH_LOCK_SZ	1024
225 # elif NR_CPUS >= 4
226 #  define RT_HASH_LOCK_SZ	512
227 # else
228 #  define RT_HASH_LOCK_SZ	256
229 # endif
230 #endif
231 
232 static spinlock_t	*rt_hash_locks;
233 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
234 # define rt_hash_lock_init()	{ \
235 		int i; \
236 		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
237 		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
238 		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
239 			spin_lock_init(&rt_hash_locks[i]); \
240 		}
241 #else
242 # define rt_hash_lock_addr(slot) NULL
243 # define rt_hash_lock_init()
244 #endif
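/*
 * Illustrative note (not from the original source): rt_hash_lock_addr() folds
 * the bucket index into the much smaller lock table, so with
 * RT_HASH_LOCK_SZ == 256 buckets 5, 261 and 517 all map to rt_hash_locks[5];
 * one lock therefore serializes writers for many hash chains at once.
 */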
245 
246 static struct rt_hash_bucket 	*rt_hash_table;
247 static unsigned			rt_hash_mask;
248 static int			rt_hash_log;
249 static unsigned int		rt_hash_rnd;
250 
251 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
252 #define RT_CACHE_STAT_INC(field) \
253 	(__raw_get_cpu_var(rt_cache_stat).field++)
254 
255 static int rt_intern_hash(unsigned hash, struct rtable *rth,
256 				struct rtable **res);
257 
258 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
259 {
260 	return (jhash_2words(daddr, saddr, rt_hash_rnd)
261 		& rt_hash_mask);
262 }
263 
264 #define rt_hash(daddr, saddr, idx) \
265 	rt_hash_code((__force u32)(__be32)(daddr),\
266 		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
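/*
 * Illustrative note (not from the original source): the interface index is
 * folded into the source-address word before hashing, so for example
 * rt_hash(daddr, saddr, 0) and rt_hash(daddr, saddr, 3) will normally select
 * different buckets even for identical address pairs.
 */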
267 
268 #ifdef CONFIG_PROC_FS
269 struct rt_cache_iter_state {
270 	int bucket;
271 };
272 
273 static struct rtable *rt_cache_get_first(struct seq_file *seq)
274 {
275 	struct rtable *r = NULL;
276 	struct rt_cache_iter_state *st = seq->private;
277 
278 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
279 		rcu_read_lock_bh();
280 		r = rt_hash_table[st->bucket].chain;
281 		if (r)
282 			break;
283 		rcu_read_unlock_bh();
284 	}
285 	return r;
286 }
287 
288 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
289 {
290 	struct rt_cache_iter_state *st = rcu_dereference(seq->private);
291 
292 	r = r->u.rt_next;
293 	while (!r) {
294 		rcu_read_unlock_bh();
295 		if (--st->bucket < 0)
296 			break;
297 		rcu_read_lock_bh();
298 		r = rt_hash_table[st->bucket].chain;
299 	}
300 	return r;
301 }
302 
303 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
304 {
305 	struct rtable *r = rt_cache_get_first(seq);
306 
307 	if (r)
308 		while (pos && (r = rt_cache_get_next(seq, r)))
309 			--pos;
310 	return pos ? NULL : r;
311 }
312 
313 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
314 {
315 	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
316 }
317 
318 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
319 {
320 	struct rtable *r = NULL;
321 
322 	if (v == SEQ_START_TOKEN)
323 		r = rt_cache_get_first(seq);
324 	else
325 		r = rt_cache_get_next(seq, v);
326 	++*pos;
327 	return r;
328 }
329 
330 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
331 {
332 	if (v && v != SEQ_START_TOKEN)
333 		rcu_read_unlock_bh();
334 }
335 
336 static int rt_cache_seq_show(struct seq_file *seq, void *v)
337 {
338 	if (v == SEQ_START_TOKEN)
339 		seq_printf(seq, "%-127s\n",
340 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
341 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
342 			   "HHUptod\tSpecDst");
343 	else {
344 		struct rtable *r = v;
345 		char temp[256];
346 
347 		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
348 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
349 			r->u.dst.dev ? r->u.dst.dev->name : "*",
350 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
351 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
352 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
353 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
354 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
355 			dst_metric(&r->u.dst, RTAX_WINDOW),
356 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
357 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
358 			r->fl.fl4_tos,
359 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
360 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
361 				       dev_queue_xmit) : 0,
362 			r->rt_spec_dst);
363 		seq_printf(seq, "%-127s\n", temp);
364         }
365   	return 0;
366 }
367 
368 static struct seq_operations rt_cache_seq_ops = {
369 	.start  = rt_cache_seq_start,
370 	.next   = rt_cache_seq_next,
371 	.stop   = rt_cache_seq_stop,
372 	.show   = rt_cache_seq_show,
373 };
374 
375 static int rt_cache_seq_open(struct inode *inode, struct file *file)
376 {
377 	struct seq_file *seq;
378 	int rc = -ENOMEM;
379 	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
380 
381 	if (!s)
382 		goto out;
383 	rc = seq_open(file, &rt_cache_seq_ops);
384 	if (rc)
385 		goto out_kfree;
386 	seq          = file->private_data;
387 	seq->private = s;
388 	memset(s, 0, sizeof(*s));
389 out:
390 	return rc;
391 out_kfree:
392 	kfree(s);
393 	goto out;
394 }
395 
396 static struct file_operations rt_cache_seq_fops = {
397 	.owner	 = THIS_MODULE,
398 	.open	 = rt_cache_seq_open,
399 	.read	 = seq_read,
400 	.llseek	 = seq_lseek,
401 	.release = seq_release_private,
402 };
403 
404 
405 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
406 {
407 	int cpu;
408 
409 	if (*pos == 0)
410 		return SEQ_START_TOKEN;
411 
412 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
413 		if (!cpu_possible(cpu))
414 			continue;
415 		*pos = cpu+1;
416 		return &per_cpu(rt_cache_stat, cpu);
417 	}
418 	return NULL;
419 }
420 
421 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
422 {
423 	int cpu;
424 
425 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
426 		if (!cpu_possible(cpu))
427 			continue;
428 		*pos = cpu+1;
429 		return &per_cpu(rt_cache_stat, cpu);
430 	}
431 	return NULL;
432 
433 }
434 
435 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
436 {
437 
438 }
439 
440 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
441 {
442 	struct rt_cache_stat *st = v;
443 
444 	if (v == SEQ_START_TOKEN) {
445 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
446 		return 0;
447 	}
448 
449 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
450 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
451 		   atomic_read(&ipv4_dst_ops.entries),
452 		   st->in_hit,
453 		   st->in_slow_tot,
454 		   st->in_slow_mc,
455 		   st->in_no_route,
456 		   st->in_brd,
457 		   st->in_martian_dst,
458 		   st->in_martian_src,
459 
460 		   st->out_hit,
461 		   st->out_slow_tot,
462 		   st->out_slow_mc,
463 
464 		   st->gc_total,
465 		   st->gc_ignored,
466 		   st->gc_goal_miss,
467 		   st->gc_dst_overflow,
468 		   st->in_hlist_search,
469 		   st->out_hlist_search
470 		);
471 	return 0;
472 }
473 
474 static struct seq_operations rt_cpu_seq_ops = {
475 	.start  = rt_cpu_seq_start,
476 	.next   = rt_cpu_seq_next,
477 	.stop   = rt_cpu_seq_stop,
478 	.show   = rt_cpu_seq_show,
479 };
480 
481 
482 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
483 {
484 	return seq_open(file, &rt_cpu_seq_ops);
485 }
486 
487 static struct file_operations rt_cpu_seq_fops = {
488 	.owner	 = THIS_MODULE,
489 	.open	 = rt_cpu_seq_open,
490 	.read	 = seq_read,
491 	.llseek	 = seq_lseek,
492 	.release = seq_release,
493 };
494 
495 #endif /* CONFIG_PROC_FS */
496 
497 static __inline__ void rt_free(struct rtable *rt)
498 {
499 	multipath_remove(rt);
500 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
501 }
502 
503 static __inline__ void rt_drop(struct rtable *rt)
504 {
505 	multipath_remove(rt);
506 	ip_rt_put(rt);
507 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
508 }
509 
510 static __inline__ int rt_fast_clean(struct rtable *rth)
511 {
512 	/* Kill broadcast/multicast entries very aggressively if they
513 	   collide in the hash table with more useful entries */
514 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
515 		rth->fl.iif && rth->u.rt_next;
516 }
517 
518 static __inline__ int rt_valuable(struct rtable *rth)
519 {
520 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
521 		rth->u.dst.expires;
522 }
523 
524 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
525 {
526 	unsigned long age;
527 	int ret = 0;
528 
529 	if (atomic_read(&rth->u.dst.__refcnt))
530 		goto out;
531 
532 	ret = 1;
533 	if (rth->u.dst.expires &&
534 	    time_after_eq(jiffies, rth->u.dst.expires))
535 		goto out;
536 
537 	age = jiffies - rth->u.dst.lastuse;
538 	ret = 0;
539 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
540 	    (age <= tmo2 && rt_valuable(rth)))
541 		goto out;
542 	ret = 1;
543 out:	return ret;
544 }
545 
546 /* Bits of score are:
547  * 31: very valuable
548  * 30: not quite useless
549  * 29..0: usage counter
550  */
551 static inline u32 rt_score(struct rtable *rt)
552 {
553 	u32 score = jiffies - rt->u.dst.lastuse;
554 
555 	score = ~score & ~(3<<30);
556 
557 	if (rt_valuable(rt))
558 		score |= (1<<31);
559 
560 	if (!rt->fl.iif ||
561 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
562 		score |= (1<<30);
563 
564 	return score;
565 }
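/*
 * Worked example (illustrative, not original text): an unreferenced output
 * route (fl.iif == 0) last used 10 jiffies ago, neither redirected nor
 * expiring, scores (~10U & ~(3U << 30)) | (1U << 30): bit 31 stays clear,
 * bit 30 is set, and the low bits shrink as the entry ages, so
 * rt_intern_hash() below evicts the lowest-scoring (oldest, least valuable)
 * unreferenced entry first when a chain grows too long.
 */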
566 
567 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
568 {
569 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
570 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
571 		(fl1->mark ^ fl2->mark) |
572 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
573 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
574 		(fl1->oif ^ fl2->oif) |
575 		(fl1->iif ^ fl2->iif)) == 0;
576 }
577 
578 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
579 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
580 						struct rtable *expentry,
581 						int *removed_count)
582 {
583 	int passedexpired = 0;
584 	struct rtable **nextstep = NULL;
585 	struct rtable **rthp = chain_head;
586 	struct rtable *rth;
587 
588 	if (removed_count)
589 		*removed_count = 0;
590 
591 	while ((rth = *rthp) != NULL) {
592 		if (rth == expentry)
593 			passedexpired = 1;
594 
595 		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
596 		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
597 			if (*rthp == expentry) {
598 				*rthp = rth->u.rt_next;
599 				continue;
600 			} else {
601 				*rthp = rth->u.rt_next;
602 				rt_free(rth);
603 				if (removed_count)
604 					++(*removed_count);
605 			}
606 		} else {
607 			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
608 			    passedexpired && !nextstep)
609 				nextstep = &rth->u.rt_next;
610 
611 			rthp = &rth->u.rt_next;
612 		}
613 	}
614 
615 	rt_free(expentry);
616 	if (removed_count)
617 		++(*removed_count);
618 
619 	return nextstep;
620 }
621 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622 
623 
624 /* This runs via a timer and thus is always in BH context. */
625 static void rt_check_expire(unsigned long dummy)
626 {
627 	static unsigned int rover;
628 	unsigned int i = rover, goal;
629 	struct rtable *rth, **rthp;
630 	unsigned long now = jiffies;
631 	u64 mult;
632 
633 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
634 	if (ip_rt_gc_timeout > 1)
635 		do_div(mult, ip_rt_gc_timeout);
636 	goal = (unsigned int)mult;
637 	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
638 	for (; goal > 0; goal--) {
639 		unsigned long tmo = ip_rt_gc_timeout;
640 
641 		i = (i + 1) & rt_hash_mask;
642 		rthp = &rt_hash_table[i].chain;
643 
644 		if (*rthp == NULL)
645 			continue;
646 		spin_lock(rt_hash_lock_addr(i));
647 		while ((rth = *rthp) != NULL) {
648 			if (rth->u.dst.expires) {
649 				/* Entry is expired even if it is in use */
650 				if (time_before_eq(now, rth->u.dst.expires)) {
651 					tmo >>= 1;
652 					rthp = &rth->u.rt_next;
653 					continue;
654 				}
655 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
656 				tmo >>= 1;
657 				rthp = &rth->u.rt_next;
658 				continue;
659 			}
660 
661 			/* Clean up aged-off entries. */
662 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
663 			/* remove all related balanced entries if necessary */
664 			if (rth->u.dst.flags & DST_BALANCED) {
665 				rthp = rt_remove_balanced_route(
666 					&rt_hash_table[i].chain,
667 					rth, NULL);
668 				if (!rthp)
669 					break;
670 			} else {
671 				*rthp = rth->u.rt_next;
672 				rt_free(rth);
673 			}
674 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
675  			*rthp = rth->u.rt_next;
676  			rt_free(rth);
677 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
678 		}
679 		spin_unlock(rt_hash_lock_addr(i));
680 
681 		/* Fallback loop breaker. */
682 		if (time_after(jiffies, now))
683 			break;
684 	}
685 	rover = i;
686 	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
687 }
688 
689 /* This can run from both BH and non-BH contexts, the latter
690  * in the case of a forced flush event.
691  */
692 static void rt_run_flush(unsigned long dummy)
693 {
694 	int i;
695 	struct rtable *rth, *next;
696 
697 	rt_deadline = 0;
698 
699 	get_random_bytes(&rt_hash_rnd, 4);
700 
701 	for (i = rt_hash_mask; i >= 0; i--) {
702 		spin_lock_bh(rt_hash_lock_addr(i));
703 		rth = rt_hash_table[i].chain;
704 		if (rth)
705 			rt_hash_table[i].chain = NULL;
706 		spin_unlock_bh(rt_hash_lock_addr(i));
707 
708 		for (; rth; rth = next) {
709 			next = rth->u.rt_next;
710 			rt_free(rth);
711 		}
712 	}
713 }
714 
715 static DEFINE_SPINLOCK(rt_flush_lock);
716 
717 void rt_cache_flush(int delay)
718 {
719 	unsigned long now = jiffies;
720 	int user_mode = !in_softirq();
721 
722 	if (delay < 0)
723 		delay = ip_rt_min_delay;
724 
725 	/* flush existing multipath state */
726 	multipath_flush();
727 
728 	spin_lock_bh(&rt_flush_lock);
729 
730 	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
731 		long tmo = (long)(rt_deadline - now);
732 
733 		/* If the flush timer is already running
734 		   and the flush request is not immediate (delay > 0):
735 
736 		   if the deadline has not been reached yet, prolong the timer to "delay",
737 		   otherwise fire it at the deadline.
738 		 */
739 
740 		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
741 			tmo = 0;
742 
743 		if (delay > tmo)
744 			delay = tmo;
745 	}
746 
747 	if (delay <= 0) {
748 		spin_unlock_bh(&rt_flush_lock);
749 		rt_run_flush(0);
750 		return;
751 	}
752 
753 	if (rt_deadline == 0)
754 		rt_deadline = now + ip_rt_max_delay;
755 
756 	mod_timer(&rt_flush_timer, now+delay);
757 	spin_unlock_bh(&rt_flush_lock);
758 }
759 
760 static void rt_secret_rebuild(unsigned long dummy)
761 {
762 	unsigned long now = jiffies;
763 
764 	rt_cache_flush(0);
765 	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
766 }
767 
768 /*
769    Short description of GC goals.
770 
771    We want an algorithm which keeps the routing cache at an
772    equilibrium point, where the number of aged-off entries is
773    approximately equal to the number of newly generated ones.
774 
775    The current expiration strength is the variable "expire".
776    We adjust it dynamically: when the network is idle, "expire"
777    is large enough to keep plenty of warm entries; when load
778    increases, it shrinks to limit the cache size.
779  */
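/*
 * Concretely (illustrative summary, not original text): rt_garbage_collect()
 * halves "expire" each time a pass over the table misses its goal, and on
 * success adds ip_rt_gc_min_interval back, capping it at ip_rt_gc_timeout,
 * so the effective lifetime of unreferenced entries tracks the current load.
 */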
780 
781 static int rt_garbage_collect(void)
782 {
783 	static unsigned long expire = RT_GC_TIMEOUT;
784 	static unsigned long last_gc;
785 	static int rover;
786 	static int equilibrium;
787 	struct rtable *rth, **rthp;
788 	unsigned long now = jiffies;
789 	int goal;
790 
791 	/*
792 	 * Garbage collection is pretty expensive,
793 	 * do not make it too frequently.
794 	 */
795 
796 	RT_CACHE_STAT_INC(gc_total);
797 
798 	if (now - last_gc < ip_rt_gc_min_interval &&
799 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
800 		RT_CACHE_STAT_INC(gc_ignored);
801 		goto out;
802 	}
803 
804 	/* Calculate the number of entries we want to expire now. */
805 	goal = atomic_read(&ipv4_dst_ops.entries) -
806 		(ip_rt_gc_elasticity << rt_hash_log);
807 	if (goal <= 0) {
808 		if (equilibrium < ipv4_dst_ops.gc_thresh)
809 			equilibrium = ipv4_dst_ops.gc_thresh;
810 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
811 		if (goal > 0) {
812 			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
813 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
814 		}
815 	} else {
816 		/* We are in a dangerous area. Try to reduce the cache really
817 		 * aggressively.
818 		 */
819 		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
820 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
821 	}
822 
823 	if (now - last_gc >= ip_rt_gc_min_interval)
824 		last_gc = now;
825 
826 	if (goal <= 0) {
827 		equilibrium += goal;
828 		goto work_done;
829 	}
830 
831 	do {
832 		int i, k;
833 
834 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
835 			unsigned long tmo = expire;
836 
837 			k = (k + 1) & rt_hash_mask;
838 			rthp = &rt_hash_table[k].chain;
839 			spin_lock_bh(rt_hash_lock_addr(k));
840 			while ((rth = *rthp) != NULL) {
841 				if (!rt_may_expire(rth, tmo, expire)) {
842 					tmo >>= 1;
843 					rthp = &rth->u.rt_next;
844 					continue;
845 				}
846 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
847 				/* remove all related balanced entries
848 				 * if necessary
849 				 */
850 				if (rth->u.dst.flags & DST_BALANCED) {
851 					int r;
852 
853 					rthp = rt_remove_balanced_route(
854 						&rt_hash_table[k].chain,
855 						rth,
856 						&r);
857 					goal -= r;
858 					if (!rthp)
859 						break;
860 				} else {
861 					*rthp = rth->u.rt_next;
862 					rt_free(rth);
863 					goal--;
864 				}
865 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
866 				*rthp = rth->u.rt_next;
867 				rt_free(rth);
868 				goal--;
869 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
870 			}
871 			spin_unlock_bh(rt_hash_lock_addr(k));
872 			if (goal <= 0)
873 				break;
874 		}
875 		rover = k;
876 
877 		if (goal <= 0)
878 			goto work_done;
879 
880 		/* The goal was not achieved. We stop the process if:
881 
882 		   - expire has been reduced to zero; otherwise expire is halved.
883 		   - the table is not full.
884 		   - we are called from softirq context.
885 		   - the jiffies check is just a fallback/debug loop breaker.
886 		     We will not spin here for a long time in any case.
887 		 */
888 
889 		RT_CACHE_STAT_INC(gc_goal_miss);
890 
891 		if (expire == 0)
892 			break;
893 
894 		expire >>= 1;
895 #if RT_CACHE_DEBUG >= 2
896 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
897 				atomic_read(&ipv4_dst_ops.entries), goal, i);
898 #endif
899 
900 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
901 			goto out;
902 	} while (!in_softirq() && time_before_eq(jiffies, now));
903 
904 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
905 		goto out;
906 	if (net_ratelimit())
907 		printk(KERN_WARNING "dst cache overflow\n");
908 	RT_CACHE_STAT_INC(gc_dst_overflow);
909 	return 1;
910 
911 work_done:
912 	expire += ip_rt_gc_min_interval;
913 	if (expire > ip_rt_gc_timeout ||
914 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
915 		expire = ip_rt_gc_timeout;
916 #if RT_CACHE_DEBUG >= 2
917 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
918 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
919 #endif
920 out:	return 0;
921 }
922 
923 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
924 {
925 	struct rtable	*rth, **rthp;
926 	unsigned long	now;
927 	struct rtable *cand, **candp;
928 	u32 		min_score;
929 	int		chain_length;
930 	int attempts = !in_softirq();
931 
932 restart:
933 	chain_length = 0;
934 	min_score = ~(u32)0;
935 	cand = NULL;
936 	candp = NULL;
937 	now = jiffies;
938 
939 	rthp = &rt_hash_table[hash].chain;
940 
941 	spin_lock_bh(rt_hash_lock_addr(hash));
942 	while ((rth = *rthp) != NULL) {
943 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
944 		if (!(rth->u.dst.flags & DST_BALANCED) &&
945 		    compare_keys(&rth->fl, &rt->fl)) {
946 #else
947 		if (compare_keys(&rth->fl, &rt->fl)) {
948 #endif
949 			/* Put it first */
950 			*rthp = rth->u.rt_next;
951 			/*
952 			 * Since lookup is lockfree, the deletion
953 			 * must be visible to another weakly ordered CPU before
954 			 * the insertion at the start of the hash chain.
955 			 */
956 			rcu_assign_pointer(rth->u.rt_next,
957 					   rt_hash_table[hash].chain);
958 			/*
959 			 * Since lookup is lockfree, the update writes
960 			 * must be ordered for consistency on SMP.
961 			 */
962 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
963 
964 			rth->u.dst.__use++;
965 			dst_hold(&rth->u.dst);
966 			rth->u.dst.lastuse = now;
967 			spin_unlock_bh(rt_hash_lock_addr(hash));
968 
969 			rt_drop(rt);
970 			*rp = rth;
971 			return 0;
972 		}
973 
974 		if (!atomic_read(&rth->u.dst.__refcnt)) {
975 			u32 score = rt_score(rth);
976 
977 			if (score <= min_score) {
978 				cand = rth;
979 				candp = rthp;
980 				min_score = score;
981 			}
982 		}
983 
984 		chain_length++;
985 
986 		rthp = &rth->u.rt_next;
987 	}
988 
989 	if (cand) {
990 		/* ip_rt_gc_elasticity used to be the average chain length;
991 		 * when it is exceeded, gc becomes really aggressive.
992 		 *
993 		 * The second limit is less certain. At the moment it allows
994 		 * only 2 entries per bucket. We will see.
995 		 */
996 		if (chain_length > ip_rt_gc_elasticity) {
997 			*candp = cand->u.rt_next;
998 			rt_free(cand);
999 		}
1000 	}
1001 
1002 	/* Try to bind the route to an arp entry only if it is an output
1003 	   route or a unicast forwarding path.
1004 	 */
1005 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1006 		int err = arp_bind_neighbour(&rt->u.dst);
1007 		if (err) {
1008 			spin_unlock_bh(rt_hash_lock_addr(hash));
1009 
1010 			if (err != -ENOBUFS) {
1011 				rt_drop(rt);
1012 				return err;
1013 			}
1014 
1015 			/* The neighbour tables are full and nothing
1016 			   can be released. Try to shrink the route cache;
1017 			   it most likely holds some neighbour records.
1018 			 */
1019 			if (attempts-- > 0) {
1020 				int saved_elasticity = ip_rt_gc_elasticity;
1021 				int saved_int = ip_rt_gc_min_interval;
1022 				ip_rt_gc_elasticity	= 1;
1023 				ip_rt_gc_min_interval	= 0;
1024 				rt_garbage_collect();
1025 				ip_rt_gc_min_interval	= saved_int;
1026 				ip_rt_gc_elasticity	= saved_elasticity;
1027 				goto restart;
1028 			}
1029 
1030 			if (net_ratelimit())
1031 				printk(KERN_WARNING "Neighbour table overflow.\n");
1032 			rt_drop(rt);
1033 			return -ENOBUFS;
1034 		}
1035 	}
1036 
1037 	rt->u.rt_next = rt_hash_table[hash].chain;
1038 #if RT_CACHE_DEBUG >= 2
1039 	if (rt->u.rt_next) {
1040 		struct rtable *trt;
1041 		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1042 		       NIPQUAD(rt->rt_dst));
1043 		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1044 			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1045 		printk("\n");
1046 	}
1047 #endif
1048 	rt_hash_table[hash].chain = rt;
1049 	spin_unlock_bh(rt_hash_lock_addr(hash));
1050 	*rp = rt;
1051 	return 0;
1052 }
1053 
1054 void rt_bind_peer(struct rtable *rt, int create)
1055 {
1056 	static DEFINE_SPINLOCK(rt_peer_lock);
1057 	struct inet_peer *peer;
1058 
1059 	peer = inet_getpeer(rt->rt_dst, create);
1060 
1061 	spin_lock_bh(&rt_peer_lock);
1062 	if (rt->peer == NULL) {
1063 		rt->peer = peer;
1064 		peer = NULL;
1065 	}
1066 	spin_unlock_bh(&rt_peer_lock);
1067 	if (peer)
1068 		inet_putpeer(peer);
1069 }
1070 
1071 /*
1072  * Peer allocation may fail only in serious out-of-memory conditions.  However
1073  * we can still generate some output.
1074  * Random ID selection looks a bit dangerous because we have no chance of
1075  * selecting an ID that stays unique for a reasonable period of time.
1076  * But a broken packet identifier may be better than no packet at all.
1077  */
1078 static void ip_select_fb_ident(struct iphdr *iph)
1079 {
1080 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1081 	static u32 ip_fallback_id;
1082 	u32 salt;
1083 
1084 	spin_lock_bh(&ip_fb_id_lock);
1085 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1086 	iph->id = htons(salt & 0xFFFF);
1087 	ip_fallback_id = salt;
1088 	spin_unlock_bh(&ip_fb_id_lock);
1089 }
1090 
1091 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1092 {
1093 	struct rtable *rt = (struct rtable *) dst;
1094 
1095 	if (rt) {
1096 		if (rt->peer == NULL)
1097 			rt_bind_peer(rt, 1);
1098 
1099 		/* If a peer is attached to the destination, it is never detached,
1100 		   so we need not grab a lock to dereference it.
1101 		 */
1102 		if (rt->peer) {
1103 			iph->id = htons(inet_getid(rt->peer, more));
1104 			return;
1105 		}
1106 	} else
1107 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1108 		       __builtin_return_address(0));
1109 
1110 	ip_select_fb_ident(iph);
1111 }
1112 
1113 static void rt_del(unsigned hash, struct rtable *rt)
1114 {
1115 	struct rtable **rthp;
1116 
1117 	spin_lock_bh(rt_hash_lock_addr(hash));
1118 	ip_rt_put(rt);
1119 	for (rthp = &rt_hash_table[hash].chain; *rthp;
1120 	     rthp = &(*rthp)->u.rt_next)
1121 		if (*rthp == rt) {
1122 			*rthp = rt->u.rt_next;
1123 			rt_free(rt);
1124 			break;
1125 		}
1126 	spin_unlock_bh(rt_hash_lock_addr(hash));
1127 }
1128 
1129 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1130 		    __be32 saddr, struct net_device *dev)
1131 {
1132 	int i, k;
1133 	struct in_device *in_dev = in_dev_get(dev);
1134 	struct rtable *rth, **rthp;
1135 	__be32  skeys[2] = { saddr, 0 };
1136 	int  ikeys[2] = { dev->ifindex, 0 };
1137 	struct netevent_redirect netevent;
1138 
1139 	if (!in_dev)
1140 		return;
1141 
1142 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1143 	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1144 		goto reject_redirect;
1145 
1146 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1147 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1148 			goto reject_redirect;
1149 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1150 			goto reject_redirect;
1151 	} else {
1152 		if (inet_addr_type(new_gw) != RTN_UNICAST)
1153 			goto reject_redirect;
1154 	}
1155 
1156 	for (i = 0; i < 2; i++) {
1157 		for (k = 0; k < 2; k++) {
1158 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1159 
1160 			rthp=&rt_hash_table[hash].chain;
1161 
1162 			rcu_read_lock();
1163 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1164 				struct rtable *rt;
1165 
1166 				if (rth->fl.fl4_dst != daddr ||
1167 				    rth->fl.fl4_src != skeys[i] ||
1168 				    rth->fl.oif != ikeys[k] ||
1169 				    rth->fl.iif != 0) {
1170 					rthp = &rth->u.rt_next;
1171 					continue;
1172 				}
1173 
1174 				if (rth->rt_dst != daddr ||
1175 				    rth->rt_src != saddr ||
1176 				    rth->u.dst.error ||
1177 				    rth->rt_gateway != old_gw ||
1178 				    rth->u.dst.dev != dev)
1179 					break;
1180 
1181 				dst_hold(&rth->u.dst);
1182 				rcu_read_unlock();
1183 
1184 				rt = dst_alloc(&ipv4_dst_ops);
1185 				if (rt == NULL) {
1186 					ip_rt_put(rth);
1187 					in_dev_put(in_dev);
1188 					return;
1189 				}
1190 
1191 				/* Copy all the information. */
1192 				*rt = *rth;
1193  				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1194 				rt->u.dst.__use		= 1;
1195 				atomic_set(&rt->u.dst.__refcnt, 1);
1196 				rt->u.dst.child		= NULL;
1197 				if (rt->u.dst.dev)
1198 					dev_hold(rt->u.dst.dev);
1199 				if (rt->idev)
1200 					in_dev_hold(rt->idev);
1201 				rt->u.dst.obsolete	= 0;
1202 				rt->u.dst.lastuse	= jiffies;
1203 				rt->u.dst.path		= &rt->u.dst;
1204 				rt->u.dst.neighbour	= NULL;
1205 				rt->u.dst.hh		= NULL;
1206 				rt->u.dst.xfrm		= NULL;
1207 
1208 				rt->rt_flags		|= RTCF_REDIRECTED;
1209 
1210 				/* Gateway is different ... */
1211 				rt->rt_gateway		= new_gw;
1212 
1213 				/* Redirect received -> path was valid */
1214 				dst_confirm(&rth->u.dst);
1215 
1216 				if (rt->peer)
1217 					atomic_inc(&rt->peer->refcnt);
1218 
1219 				if (arp_bind_neighbour(&rt->u.dst) ||
1220 				    !(rt->u.dst.neighbour->nud_state &
1221 					    NUD_VALID)) {
1222 					if (rt->u.dst.neighbour)
1223 						neigh_event_send(rt->u.dst.neighbour, NULL);
1224 					ip_rt_put(rth);
1225 					rt_drop(rt);
1226 					goto do_next;
1227 				}
1228 
1229 				netevent.old = &rth->u.dst;
1230 				netevent.new = &rt->u.dst;
1231 				call_netevent_notifiers(NETEVENT_REDIRECT,
1232 						        &netevent);
1233 
1234 				rt_del(hash, rth);
1235 				if (!rt_intern_hash(hash, rt, &rt))
1236 					ip_rt_put(rt);
1237 				goto do_next;
1238 			}
1239 			rcu_read_unlock();
1240 		do_next:
1241 			;
1242 		}
1243 	}
1244 	in_dev_put(in_dev);
1245 	return;
1246 
1247 reject_redirect:
1248 #ifdef CONFIG_IP_ROUTE_VERBOSE
1249 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1250 		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1251 			"%u.%u.%u.%u ignored.\n"
1252 			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1253 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1254 		       NIPQUAD(saddr), NIPQUAD(daddr));
1255 #endif
1256 	in_dev_put(in_dev);
1257 }
1258 
1259 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1260 {
1261 	struct rtable *rt = (struct rtable*)dst;
1262 	struct dst_entry *ret = dst;
1263 
1264 	if (rt) {
1265 		if (dst->obsolete) {
1266 			ip_rt_put(rt);
1267 			ret = NULL;
1268 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1269 			   rt->u.dst.expires) {
1270 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1271 						rt->fl.oif);
1272 #if RT_CACHE_DEBUG >= 1
1273 			printk(KERN_DEBUG "ip_rt_advice: redirect to "
1274 					  "%u.%u.%u.%u/%02x dropped\n",
1275 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1276 #endif
1277 			rt_del(hash, rt);
1278 			ret = NULL;
1279 		}
1280 	}
1281 	return ret;
1282 }
1283 
1284 /*
1285  * Algorithm:
1286  *	1. The first ip_rt_redirect_number redirects are sent
1287  *	   with exponential backoff, then we stop sending them at all,
1288  *	   assuming that the host ignores our redirects.
1289  *	2. If we did not see packets requiring redirects
1290  *	   during ip_rt_redirect_silence, we assume that the host
1291  *	   has forgotten the redirected route, and we start sending redirects again.
1292  *
1293  * This algorithm is much cheaper and more intelligent than dumb load limiting
1294  * in icmp.c.
1295  *
1296  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1297  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1298  */
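/*
 * Worked example (illustrative note, not original text): after n redirects
 * have been sent rt->u.dst.rate_tokens == n, so the next one is emitted only
 * once jiffies passes rate_last + (ip_rt_redirect_load << n); the gap thus
 * doubles with every redirect until ip_rt_redirect_number is reached, and the
 * counter resets only after ip_rt_redirect_silence without further packets
 * that would need a redirect.
 */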
1299 
1300 void ip_rt_send_redirect(struct sk_buff *skb)
1301 {
1302 	struct rtable *rt = (struct rtable*)skb->dst;
1303 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1304 
1305 	if (!in_dev)
1306 		return;
1307 
1308 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1309 		goto out;
1310 
1311 	/* No redirected packets during ip_rt_redirect_silence;
1312 	 * reset the algorithm.
1313 	 */
1314 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1315 		rt->u.dst.rate_tokens = 0;
1316 
1317 	/* Too many ignored redirects; do not send anything, just
1318 	 * set u.dst.rate_last to the time of the last seen redirected packet.
1319 	 */
1320 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1321 		rt->u.dst.rate_last = jiffies;
1322 		goto out;
1323 	}
1324 
1325 	/* Check for load limit; set rate_last to the latest sent
1326 	 * redirect.
1327 	 */
1328 	if (rt->u.dst.rate_tokens == 0 ||
1329 	    time_after(jiffies,
1330 		       (rt->u.dst.rate_last +
1331 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1332 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1333 		rt->u.dst.rate_last = jiffies;
1334 		++rt->u.dst.rate_tokens;
1335 #ifdef CONFIG_IP_ROUTE_VERBOSE
1336 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1337 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1338 		    net_ratelimit())
1339 			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1340 				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1341 				NIPQUAD(rt->rt_src), rt->rt_iif,
1342 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1343 #endif
1344 	}
1345 out:
1346         in_dev_put(in_dev);
1347 }
1348 
1349 static int ip_error(struct sk_buff *skb)
1350 {
1351 	struct rtable *rt = (struct rtable*)skb->dst;
1352 	unsigned long now;
1353 	int code;
1354 
1355 	switch (rt->u.dst.error) {
1356 		case EINVAL:
1357 		default:
1358 			goto out;
1359 		case EHOSTUNREACH:
1360 			code = ICMP_HOST_UNREACH;
1361 			break;
1362 		case ENETUNREACH:
1363 			code = ICMP_NET_UNREACH;
1364 			break;
1365 		case EACCES:
1366 			code = ICMP_PKT_FILTERED;
1367 			break;
1368 	}
1369 
1370 	now = jiffies;
1371 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1372 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1373 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1374 	rt->u.dst.rate_last = now;
1375 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1376 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1377 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1378 	}
1379 
1380 out:	kfree_skb(skb);
1381 	return 0;
1382 }
1383 
1384 /*
1385  *	The last two values are not from the RFC but
1386  *	are needed for AMPRnet AX.25 paths.
1387  */
1388 
1389 static const unsigned short mtu_plateau[] =
1390 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1391 
1392 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1393 {
1394 	int i;
1395 
1396 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1397 		if (old_mtu > mtu_plateau[i])
1398 			return mtu_plateau[i];
1399 	return 68;
1400 }
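/*
 * Worked example (illustrative, not from the original source): when a
 * "fragmentation needed" message carries no usable next-hop MTU,
 * guess_mtu(1500) returns 1492 and guess_mtu(296) returns 216, i.e. the next
 * plateau strictly below the original packet length; 128 bytes or less falls
 * back to the minimum IPv4 MTU of 68.
 */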
1401 
1402 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1403 {
1404 	int i;
1405 	unsigned short old_mtu = ntohs(iph->tot_len);
1406 	struct rtable *rth;
1407 	__be32  skeys[2] = { iph->saddr, 0, };
1408 	__be32  daddr = iph->daddr;
1409 	unsigned short est_mtu = 0;
1410 
1411 	if (ipv4_config.no_pmtu_disc)
1412 		return 0;
1413 
1414 	for (i = 0; i < 2; i++) {
1415 		unsigned hash = rt_hash(daddr, skeys[i], 0);
1416 
1417 		rcu_read_lock();
1418 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1419 		     rth = rcu_dereference(rth->u.rt_next)) {
1420 			if (rth->fl.fl4_dst == daddr &&
1421 			    rth->fl.fl4_src == skeys[i] &&
1422 			    rth->rt_dst  == daddr &&
1423 			    rth->rt_src  == iph->saddr &&
1424 			    rth->fl.iif == 0 &&
1425 			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1426 				unsigned short mtu = new_mtu;
1427 
1428 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1429 
1430 					/* BSD 4.2 compatibility hack :-( */
1431 					if (mtu == 0 &&
1432 					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1433 					    old_mtu >= 68 + (iph->ihl << 2))
1434 						old_mtu -= iph->ihl << 2;
1435 
1436 					mtu = guess_mtu(old_mtu);
1437 				}
1438 				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1439 					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1440 						dst_confirm(&rth->u.dst);
1441 						if (mtu < ip_rt_min_pmtu) {
1442 							mtu = ip_rt_min_pmtu;
1443 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1444 								(1 << RTAX_MTU);
1445 						}
1446 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1447 						dst_set_expires(&rth->u.dst,
1448 							ip_rt_mtu_expires);
1449 					}
1450 					est_mtu = mtu;
1451 				}
1452 			}
1453 		}
1454 		rcu_read_unlock();
1455 	}
1456 	return est_mtu ? : new_mtu;
1457 }
1458 
1459 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1460 {
1461 	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1462 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1463 		if (mtu < ip_rt_min_pmtu) {
1464 			mtu = ip_rt_min_pmtu;
1465 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1466 		}
1467 		dst->metrics[RTAX_MTU-1] = mtu;
1468 		dst_set_expires(dst, ip_rt_mtu_expires);
1469 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1470 	}
1471 }
1472 
1473 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1474 {
1475 	return NULL;
1476 }
1477 
1478 static void ipv4_dst_destroy(struct dst_entry *dst)
1479 {
1480 	struct rtable *rt = (struct rtable *) dst;
1481 	struct inet_peer *peer = rt->peer;
1482 	struct in_device *idev = rt->idev;
1483 
1484 	if (peer) {
1485 		rt->peer = NULL;
1486 		inet_putpeer(peer);
1487 	}
1488 
1489 	if (idev) {
1490 		rt->idev = NULL;
1491 		in_dev_put(idev);
1492 	}
1493 }
1494 
1495 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1496 			    int how)
1497 {
1498 	struct rtable *rt = (struct rtable *) dst;
1499 	struct in_device *idev = rt->idev;
1500 	if (dev != &loopback_dev && idev && idev->dev == dev) {
1501 		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1502 		if (loopback_idev) {
1503 			rt->idev = loopback_idev;
1504 			in_dev_put(idev);
1505 		}
1506 	}
1507 }
1508 
1509 static void ipv4_link_failure(struct sk_buff *skb)
1510 {
1511 	struct rtable *rt;
1512 
1513 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1514 
1515 	rt = (struct rtable *) skb->dst;
1516 	if (rt)
1517 		dst_set_expires(&rt->u.dst, 0);
1518 }
1519 
1520 static int ip_rt_bug(struct sk_buff *skb)
1521 {
1522 	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1523 		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1524 		skb->dev ? skb->dev->name : "?");
1525 	kfree_skb(skb);
1526 	return 0;
1527 }
1528 
1529 /*
1530    We do not cache the source address of the outgoing interface,
1531    because it is used only by the IP RR, TS and SRR options,
1532    so it is out of the fast path.
1533 
1534    BTW remember: "addr" is allowed to be unaligned
1535    in IP options!
1536  */
1537 
1538 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1539 {
1540 	__be32 src;
1541 	struct fib_result res;
1542 
1543 	if (rt->fl.iif == 0)
1544 		src = rt->rt_src;
1545 	else if (fib_lookup(&rt->fl, &res) == 0) {
1546 		src = FIB_RES_PREFSRC(res);
1547 		fib_res_put(&res);
1548 	} else
1549 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1550 					RT_SCOPE_UNIVERSE);
1551 	memcpy(addr, &src, 4);
1552 }
1553 
1554 #ifdef CONFIG_NET_CLS_ROUTE
1555 static void set_class_tag(struct rtable *rt, u32 tag)
1556 {
1557 	if (!(rt->u.dst.tclassid & 0xFFFF))
1558 		rt->u.dst.tclassid |= tag & 0xFFFF;
1559 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1560 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1561 }
1562 #endif
1563 
1564 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1565 {
1566 	struct fib_info *fi = res->fi;
1567 
1568 	if (fi) {
1569 		if (FIB_RES_GW(*res) &&
1570 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1571 			rt->rt_gateway = FIB_RES_GW(*res);
1572 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1573 		       sizeof(rt->u.dst.metrics));
1574 		if (fi->fib_mtu == 0) {
1575 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1576 			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1577 			    rt->rt_gateway != rt->rt_dst &&
1578 			    rt->u.dst.dev->mtu > 576)
1579 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1580 		}
1581 #ifdef CONFIG_NET_CLS_ROUTE
1582 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1583 #endif
1584 	} else
1585 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1586 
1587 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1588 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1589 	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1590 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1591 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1592 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1593 				       ip_rt_min_advmss);
1594 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1595 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1596 
1597 #ifdef CONFIG_NET_CLS_ROUTE
1598 #ifdef CONFIG_IP_MULTIPLE_TABLES
1599 	set_class_tag(rt, fib_rules_tclass(res));
1600 #endif
1601 	set_class_tag(rt, itag);
1602 #endif
1603         rt->rt_type = res->type;
1604 }
1605 
1606 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1607 				u8 tos, struct net_device *dev, int our)
1608 {
1609 	unsigned hash;
1610 	struct rtable *rth;
1611 	__be32 spec_dst;
1612 	struct in_device *in_dev = in_dev_get(dev);
1613 	u32 itag = 0;
1614 
1615 	/* Primary sanity checks. */
1616 
1617 	if (in_dev == NULL)
1618 		return -EINVAL;
1619 
1620 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1621 	    skb->protocol != htons(ETH_P_IP))
1622 		goto e_inval;
1623 
1624 	if (ZERONET(saddr)) {
1625 		if (!LOCAL_MCAST(daddr))
1626 			goto e_inval;
1627 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1628 	} else if (fib_validate_source(saddr, 0, tos, 0,
1629 					dev, &spec_dst, &itag) < 0)
1630 		goto e_inval;
1631 
1632 	rth = dst_alloc(&ipv4_dst_ops);
1633 	if (!rth)
1634 		goto e_nobufs;
1635 
1636 	rth->u.dst.output= ip_rt_bug;
1637 
1638 	atomic_set(&rth->u.dst.__refcnt, 1);
1639 	rth->u.dst.flags= DST_HOST;
1640 	if (in_dev->cnf.no_policy)
1641 		rth->u.dst.flags |= DST_NOPOLICY;
1642 	rth->fl.fl4_dst	= daddr;
1643 	rth->rt_dst	= daddr;
1644 	rth->fl.fl4_tos	= tos;
1645 	rth->fl.mark    = skb->mark;
1646 	rth->fl.fl4_src	= saddr;
1647 	rth->rt_src	= saddr;
1648 #ifdef CONFIG_NET_CLS_ROUTE
1649 	rth->u.dst.tclassid = itag;
1650 #endif
1651 	rth->rt_iif	=
1652 	rth->fl.iif	= dev->ifindex;
1653 	rth->u.dst.dev	= &loopback_dev;
1654 	dev_hold(rth->u.dst.dev);
1655 	rth->idev	= in_dev_get(rth->u.dst.dev);
1656 	rth->fl.oif	= 0;
1657 	rth->rt_gateway	= daddr;
1658 	rth->rt_spec_dst= spec_dst;
1659 	rth->rt_type	= RTN_MULTICAST;
1660 	rth->rt_flags	= RTCF_MULTICAST;
1661 	if (our) {
1662 		rth->u.dst.input= ip_local_deliver;
1663 		rth->rt_flags |= RTCF_LOCAL;
1664 	}
1665 
1666 #ifdef CONFIG_IP_MROUTE
1667 	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1668 		rth->u.dst.input = ip_mr_input;
1669 #endif
1670 	RT_CACHE_STAT_INC(in_slow_mc);
1671 
1672 	in_dev_put(in_dev);
1673 	hash = rt_hash(daddr, saddr, dev->ifindex);
1674 	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1675 
1676 e_nobufs:
1677 	in_dev_put(in_dev);
1678 	return -ENOBUFS;
1679 
1680 e_inval:
1681 	in_dev_put(in_dev);
1682 	return -EINVAL;
1683 }
1684 
1685 
1686 static void ip_handle_martian_source(struct net_device *dev,
1687 				     struct in_device *in_dev,
1688 				     struct sk_buff *skb,
1689 				     __be32 daddr,
1690 				     __be32 saddr)
1691 {
1692 	RT_CACHE_STAT_INC(in_martian_src);
1693 #ifdef CONFIG_IP_ROUTE_VERBOSE
1694 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1695 		/*
1696 		 *	RFC1812 recommendation, if source is martian,
1697 		 *	the only hint is MAC header.
1698 		 */
1699 		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1700 			"%u.%u.%u.%u, on dev %s\n",
1701 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1702 		if (dev->hard_header_len && skb->mac.raw) {
1703 			int i;
1704 			unsigned char *p = skb->mac.raw;
1705 			printk(KERN_WARNING "ll header: ");
1706 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1707 				printk("%02x", *p);
1708 				if (i < (dev->hard_header_len - 1))
1709 					printk(":");
1710 			}
1711 			printk("\n");
1712 		}
1713 	}
1714 #endif
1715 }
1716 
1717 static inline int __mkroute_input(struct sk_buff *skb,
1718 				  struct fib_result* res,
1719 				  struct in_device *in_dev,
1720 				  __be32 daddr, __be32 saddr, u32 tos,
1721 				  struct rtable **result)
1722 {
1723 
1724 	struct rtable *rth;
1725 	int err;
1726 	struct in_device *out_dev;
1727 	unsigned flags = 0;
1728 	__be32 spec_dst;
1729 	u32 itag;
1730 
1731 	/* get a working reference to the output device */
1732 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1733 	if (out_dev == NULL) {
1734 		if (net_ratelimit())
1735 			printk(KERN_CRIT "Bug in ip_route_input" \
1736 			       "_slow(). Please, report\n");
1737 		return -EINVAL;
1738 	}
1739 
1740 
1741 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1742 				  in_dev->dev, &spec_dst, &itag);
1743 	if (err < 0) {
1744 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1745 					 saddr);
1746 
1747 		err = -EINVAL;
1748 		goto cleanup;
1749 	}
1750 
1751 	if (err)
1752 		flags |= RTCF_DIRECTSRC;
1753 
1754 	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1755 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1756 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1757 		flags |= RTCF_DOREDIRECT;
1758 
1759 	if (skb->protocol != htons(ETH_P_IP)) {
1760 		/* Not IP (i.e. ARP). Do not create a route if it is
1761 		 * invalid for proxy arp. DNAT routes are always valid.
1762 		 */
1763 		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1764 			err = -EINVAL;
1765 			goto cleanup;
1766 		}
1767 	}
1768 
1769 
1770 	rth = dst_alloc(&ipv4_dst_ops);
1771 	if (!rth) {
1772 		err = -ENOBUFS;
1773 		goto cleanup;
1774 	}
1775 
1776 	atomic_set(&rth->u.dst.__refcnt, 1);
1777 	rth->u.dst.flags= DST_HOST;
1778 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1779 	if (res->fi->fib_nhs > 1)
1780 		rth->u.dst.flags |= DST_BALANCED;
1781 #endif
1782 	if (in_dev->cnf.no_policy)
1783 		rth->u.dst.flags |= DST_NOPOLICY;
1784 	if (out_dev->cnf.no_xfrm)
1785 		rth->u.dst.flags |= DST_NOXFRM;
1786 	rth->fl.fl4_dst	= daddr;
1787 	rth->rt_dst	= daddr;
1788 	rth->fl.fl4_tos	= tos;
1789 	rth->fl.mark    = skb->mark;
1790 	rth->fl.fl4_src	= saddr;
1791 	rth->rt_src	= saddr;
1792 	rth->rt_gateway	= daddr;
1793 	rth->rt_iif 	=
1794 		rth->fl.iif	= in_dev->dev->ifindex;
1795 	rth->u.dst.dev	= (out_dev)->dev;
1796 	dev_hold(rth->u.dst.dev);
1797 	rth->idev	= in_dev_get(rth->u.dst.dev);
1798 	rth->fl.oif 	= 0;
1799 	rth->rt_spec_dst= spec_dst;
1800 
1801 	rth->u.dst.input = ip_forward;
1802 	rth->u.dst.output = ip_output;
1803 
1804 	rt_set_nexthop(rth, res, itag);
1805 
1806 	rth->rt_flags = flags;
1807 
1808 	*result = rth;
1809 	err = 0;
1810  cleanup:
1811 	/* release the working reference to the output device */
1812 	in_dev_put(out_dev);
1813 	return err;
1814 }
1815 
1816 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1817 				       struct fib_result* res,
1818 				       const struct flowi *fl,
1819 				       struct in_device *in_dev,
1820 				       __be32 daddr, __be32 saddr, u32 tos)
1821 {
1822 	struct rtable* rth = NULL;
1823 	int err;
1824 	unsigned hash;
1825 
1826 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1827 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1828 		fib_select_multipath(fl, res);
1829 #endif
1830 
1831 	/* create a routing cache entry */
1832 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1833 	if (err)
1834 		return err;
1835 
1836 	/* put it into the cache */
1837 	hash = rt_hash(daddr, saddr, fl->iif);
1838 	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1839 }
1840 
1841 static inline int ip_mkroute_input(struct sk_buff *skb,
1842 				   struct fib_result* res,
1843 				   const struct flowi *fl,
1844 				   struct in_device *in_dev,
1845 				   __be32 daddr, __be32 saddr, u32 tos)
1846 {
1847 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1848 	struct rtable* rth = NULL, *rtres;
1849 	unsigned char hop, hopcount;
1850 	int err = -EINVAL;
1851 	unsigned int hash;
1852 
1853 	if (res->fi)
1854 		hopcount = res->fi->fib_nhs;
1855 	else
1856 		hopcount = 1;
1857 
1858 	/* distinguish between multipath and singlepath */
1859 	if (hopcount < 2)
1860 		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1861 					    saddr, tos);
1862 
1863 	/* add all alternatives to the routing cache */
1864 	for (hop = 0; hop < hopcount; hop++) {
1865 		res->nh_sel = hop;
1866 
1867 		/* put reference to previous result */
1868 		if (hop)
1869 			ip_rt_put(rtres);
1870 
1871 		/* create a routing cache entry */
1872 		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1873 				      &rth);
1874 		if (err)
1875 			return err;
1876 
1877 		/* put it into the cache */
1878 		hash = rt_hash(daddr, saddr, fl->iif);
1879 		err = rt_intern_hash(hash, rth, &rtres);
1880 		if (err)
1881 			return err;
1882 
1883 		/* forward hop information to multipath impl. */
1884 		multipath_set_nhinfo(rth,
1885 				     FIB_RES_NETWORK(*res),
1886 				     FIB_RES_NETMASK(*res),
1887 				     res->prefixlen,
1888 				     &FIB_RES_NH(*res));
1889 	}
1890 	skb->dst = &rtres->u.dst;
1891 	return err;
1892 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1893 	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1894 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1895 }
1896 
1897 
1898 /*
1899  *	NOTE. We drop all packets that have local source
1900  *	addresses, because every properly looped-back packet
1901  *	must already have the correct destination attached by the output routine.
1902  *
1903  *	This approach solves two big problems:
1904  *	1. Non-simplex devices are handled properly.
1905  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1906  */
1907 
1908 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1909 			       u8 tos, struct net_device *dev)
1910 {
1911 	struct fib_result res;
1912 	struct in_device *in_dev = in_dev_get(dev);
1913 	struct flowi fl = { .nl_u = { .ip4_u =
1914 				      { .daddr = daddr,
1915 					.saddr = saddr,
1916 					.tos = tos,
1917 					.scope = RT_SCOPE_UNIVERSE,
1918 				      } },
1919 			    .mark = skb->mark,
1920 			    .iif = dev->ifindex };
1921 	unsigned	flags = 0;
1922 	u32		itag = 0;
1923 	struct rtable * rth;
1924 	unsigned	hash;
1925 	__be32		spec_dst;
1926 	int		err = -EINVAL;
1927 	int		free_res = 0;
1928 
1929 	/* IP on this device is disabled. */
1930 
1931 	if (!in_dev)
1932 		goto out;
1933 
1934 	/* Check for the weirdest martians, which cannot be detected
1935 	   by fib_lookup.
1936 	 */
1937 
1938 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1939 		goto martian_source;
1940 
1941 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1942 		goto brd_input;
1943 
1944 	/* Accept zero addresses only for limited broadcast;
1945 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
1946 	 */
1947 	if (ZERONET(saddr))
1948 		goto martian_source;
1949 
1950 	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1951 		goto martian_destination;
1952 
1953 	/*
1954 	 *	Now we are ready to route the packet.
1955 	 */
1956 	if ((err = fib_lookup(&fl, &res)) != 0) {
1957 		if (!IN_DEV_FORWARD(in_dev))
1958 			goto e_hostunreach;
1959 		goto no_route;
1960 	}
1961 	free_res = 1;
1962 
1963 	RT_CACHE_STAT_INC(in_slow_tot);
1964 
1965 	if (res.type == RTN_BROADCAST)
1966 		goto brd_input;
1967 
1968 	if (res.type == RTN_LOCAL) {
1969 		int result;
1970 		result = fib_validate_source(saddr, daddr, tos,
1971 					     loopback_dev.ifindex,
1972 					     dev, &spec_dst, &itag);
1973 		if (result < 0)
1974 			goto martian_source;
1975 		if (result)
1976 			flags |= RTCF_DIRECTSRC;
1977 		spec_dst = daddr;
1978 		goto local_input;
1979 	}
1980 
1981 	if (!IN_DEV_FORWARD(in_dev))
1982 		goto e_hostunreach;
1983 	if (res.type != RTN_UNICAST)
1984 		goto martian_destination;
1985 
1986 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1987 	if (err == -ENOBUFS)
1988 		goto e_nobufs;
1989 	if (err == -EINVAL)
1990 		goto e_inval;
1991 
1992 done:
1993 	in_dev_put(in_dev);
1994 	if (free_res)
1995 		fib_res_put(&res);
1996 out:	return err;
1997 
1998 brd_input:
1999 	if (skb->protocol != htons(ETH_P_IP))
2000 		goto e_inval;
2001 
2002 	if (ZERONET(saddr))
2003 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2004 	else {
2005 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2006 					  &itag);
2007 		if (err < 0)
2008 			goto martian_source;
2009 		if (err)
2010 			flags |= RTCF_DIRECTSRC;
2011 	}
2012 	flags |= RTCF_BROADCAST;
2013 	res.type = RTN_BROADCAST;
2014 	RT_CACHE_STAT_INC(in_brd);
2015 
2016 local_input:
2017 	rth = dst_alloc(&ipv4_dst_ops);
2018 	if (!rth)
2019 		goto e_nobufs;
2020 
2021 	rth->u.dst.output= ip_rt_bug;
2022 
2023 	atomic_set(&rth->u.dst.__refcnt, 1);
2024 	rth->u.dst.flags= DST_HOST;
2025 	if (in_dev->cnf.no_policy)
2026 		rth->u.dst.flags |= DST_NOPOLICY;
2027 	rth->fl.fl4_dst	= daddr;
2028 	rth->rt_dst	= daddr;
2029 	rth->fl.fl4_tos	= tos;
2030 	rth->fl.mark    = skb->mark;
2031 	rth->fl.fl4_src	= saddr;
2032 	rth->rt_src	= saddr;
2033 #ifdef CONFIG_NET_CLS_ROUTE
2034 	rth->u.dst.tclassid = itag;
2035 #endif
2036 	rth->rt_iif	=
2037 	rth->fl.iif	= dev->ifindex;
2038 	rth->u.dst.dev	= &loopback_dev;
2039 	dev_hold(rth->u.dst.dev);
2040 	rth->idev	= in_dev_get(rth->u.dst.dev);
2041 	rth->rt_gateway	= daddr;
2042 	rth->rt_spec_dst= spec_dst;
2043 	rth->u.dst.input= ip_local_deliver;
2044 	rth->rt_flags 	= flags|RTCF_LOCAL;
2045 	if (res.type == RTN_UNREACHABLE) {
2046 		rth->u.dst.input= ip_error;
2047 		rth->u.dst.error= -err;
2048 		rth->rt_flags 	&= ~RTCF_LOCAL;
2049 	}
2050 	rth->rt_type	= res.type;
2051 	hash = rt_hash(daddr, saddr, fl.iif);
2052 	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2053 	goto done;
2054 
2055 no_route:
2056 	RT_CACHE_STAT_INC(in_no_route);
2057 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2058 	res.type = RTN_UNREACHABLE;
2059 	goto local_input;
2060 
2061 	/*
2062 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2063 	 */
2064 martian_destination:
2065 	RT_CACHE_STAT_INC(in_martian_dst);
2066 #ifdef CONFIG_IP_ROUTE_VERBOSE
2067 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2068 		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2069 			"%u.%u.%u.%u, dev %s\n",
2070 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2071 #endif
2072 
2073 e_hostunreach:
2074 	err = -EHOSTUNREACH;
2075 	goto done;
2076 
2077 e_inval:
2078 	err = -EINVAL;
2079 	goto done;
2080 
2081 e_nobufs:
2082 	err = -ENOBUFS;
2083 	goto done;
2084 
2085 martian_source:
2086 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2087 	goto e_inval;
2088 }
2089 
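/*
 * ip_route_input(): the input fast path.  Probe the route cache under RCU for
 * an entry matching (daddr, saddr, iif, mark, tos) and attach it to the skb on
 * a hit.  Multicast destinations are recognised here (see the comment below);
 * all other misses fall through to ip_route_input_slow().
 */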
2090 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2091 		   u8 tos, struct net_device *dev)
2092 {
2093 	struct rtable * rth;
2094 	unsigned	hash;
2095 	int iif = dev->ifindex;
2096 
2097 	tos &= IPTOS_RT_MASK;
2098 	hash = rt_hash(daddr, saddr, iif);
2099 
2100 	rcu_read_lock();
2101 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2102 	     rth = rcu_dereference(rth->u.rt_next)) {
2103 		if (rth->fl.fl4_dst == daddr &&
2104 		    rth->fl.fl4_src == saddr &&
2105 		    rth->fl.iif == iif &&
2106 		    rth->fl.oif == 0 &&
2107 		    rth->fl.mark == skb->mark &&
2108 		    rth->fl.fl4_tos == tos) {
2109 			rth->u.dst.lastuse = jiffies;
2110 			dst_hold(&rth->u.dst);
2111 			rth->u.dst.__use++;
2112 			RT_CACHE_STAT_INC(in_hit);
2113 			rcu_read_unlock();
2114 			skb->dst = (struct dst_entry*)rth;
2115 			return 0;
2116 		}
2117 		RT_CACHE_STAT_INC(in_hlist_search);
2118 	}
2119 	rcu_read_unlock();
2120 
2121 	/* Multicast recognition logic is moved from the route cache to here.
2122 	   The problem was that too many Ethernet cards have broken/missing
2123 	   hardware multicast filters :-( As a result, a host on a multicast
2124 	   network acquires a lot of useless route cache entries, e.g. from
2125 	   SDR messages from all over the world. Now we try to get rid of them.
2126 	   Really, provided the software IP multicast filter is organized
2127 	   reasonably (at least, hashed), it does not result in a slowdown
2128 	   compared with route cache reject entries.
2129 	   Note that multicast routers are not affected, because a
2130 	   route cache entry is created eventually.
2131 	 */
2132 	if (MULTICAST(daddr)) {
2133 		struct in_device *in_dev;
2134 
2135 		rcu_read_lock();
2136 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2137 			int our = ip_check_mc(in_dev, daddr, saddr,
2138 				skb->nh.iph->protocol);
2139 			if (our
2140 #ifdef CONFIG_IP_MROUTE
2141 			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2142 #endif
2143 			    ) {
2144 				rcu_read_unlock();
2145 				return ip_route_input_mc(skb, daddr, saddr,
2146 							 tos, dev, our);
2147 			}
2148 		}
2149 		rcu_read_unlock();
2150 		return -EINVAL;
2151 	}
2152 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2153 }
2154 
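/*
 * __mkroute_output(): allocate and fill an output route cache entry for an
 * already resolved fib_result, classifying the destination as broadcast,
 * multicast or unicast and wiring up the matching input/output handlers.
 */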
2155 static inline int __mkroute_output(struct rtable **result,
2156 				   struct fib_result* res,
2157 				   const struct flowi *fl,
2158 				   const struct flowi *oldflp,
2159 				   struct net_device *dev_out,
2160 				   unsigned flags)
2161 {
2162 	struct rtable *rth;
2163 	struct in_device *in_dev;
2164 	u32 tos = RT_FL_TOS(oldflp);
2165 	int err = 0;
2166 
2167 	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2168 		return -EINVAL;
2169 
2170 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2171 		res->type = RTN_BROADCAST;
2172 	else if (MULTICAST(fl->fl4_dst))
2173 		res->type = RTN_MULTICAST;
2174 	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2175 		return -EINVAL;
2176 
2177 	if (dev_out->flags & IFF_LOOPBACK)
2178 		flags |= RTCF_LOCAL;
2179 
2180 	/* get a working reference to the inet device */
2181 	in_dev = in_dev_get(dev_out);
2182 	if (!in_dev)
2183 		return -EINVAL;
2184 
2185 	if (res->type == RTN_BROADCAST) {
2186 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2187 		if (res->fi) {
2188 			fib_info_put(res->fi);
2189 			res->fi = NULL;
2190 		}
2191 	} else if (res->type == RTN_MULTICAST) {
2192 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2193 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2194 				 oldflp->proto))
2195 			flags &= ~RTCF_LOCAL;
2196 		/* If a multicast route does not exist, use the
2197 		   default one, but do not gateway in this case.
2198 		   Yes, it is a hack.
2199 		 */
2200 		if (res->fi && res->prefixlen < 4) {
2201 			fib_info_put(res->fi);
2202 			res->fi = NULL;
2203 		}
2204 	}
2205 
2206 
2207 	rth = dst_alloc(&ipv4_dst_ops);
2208 	if (!rth) {
2209 		err = -ENOBUFS;
2210 		goto cleanup;
2211 	}
2212 
2213 	atomic_set(&rth->u.dst.__refcnt, 1);
2214 	rth->u.dst.flags= DST_HOST;
2215 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2216 	if (res->fi) {
2217 		rth->rt_multipath_alg = res->fi->fib_mp_alg;
2218 		if (res->fi->fib_nhs > 1)
2219 			rth->u.dst.flags |= DST_BALANCED;
2220 	}
2221 #endif
2222 	if (in_dev->cnf.no_xfrm)
2223 		rth->u.dst.flags |= DST_NOXFRM;
2224 	if (in_dev->cnf.no_policy)
2225 		rth->u.dst.flags |= DST_NOPOLICY;
2226 
2227 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2228 	rth->fl.fl4_tos	= tos;
2229 	rth->fl.fl4_src	= oldflp->fl4_src;
2230 	rth->fl.oif	= oldflp->oif;
2231 	rth->fl.mark    = oldflp->mark;
2232 	rth->rt_dst	= fl->fl4_dst;
2233 	rth->rt_src	= fl->fl4_src;
2234 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2235 	/* get references to the devices that are to be held by the routing
2236 	   cache entry */
2237 	rth->u.dst.dev	= dev_out;
2238 	dev_hold(dev_out);
2239 	rth->idev	= in_dev_get(dev_out);
2240 	rth->rt_gateway = fl->fl4_dst;
2241 	rth->rt_spec_dst= fl->fl4_src;
2242 
2243 	rth->u.dst.output=ip_output;
2244 
2245 	RT_CACHE_STAT_INC(out_slow_tot);
2246 
2247 	if (flags & RTCF_LOCAL) {
2248 		rth->u.dst.input = ip_local_deliver;
2249 		rth->rt_spec_dst = fl->fl4_dst;
2250 	}
2251 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2252 		rth->rt_spec_dst = fl->fl4_src;
2253 		if (flags & RTCF_LOCAL &&
2254 		    !(dev_out->flags & IFF_LOOPBACK)) {
2255 			rth->u.dst.output = ip_mc_output;
2256 			RT_CACHE_STAT_INC(out_slow_mc);
2257 		}
2258 #ifdef CONFIG_IP_MROUTE
2259 		if (res->type == RTN_MULTICAST) {
2260 			if (IN_DEV_MFORWARD(in_dev) &&
2261 			    !LOCAL_MCAST(oldflp->fl4_dst)) {
2262 				rth->u.dst.input = ip_mr_input;
2263 				rth->u.dst.output = ip_mc_output;
2264 			}
2265 		}
2266 #endif
2267 	}
2268 
2269 	rt_set_nexthop(rth, res, 0);
2270 
2271 	rth->rt_flags = flags;
2272 
2273 	*result = rth;
2274  cleanup:
2275 	/* release work reference to inet device */
2276 	in_dev_put(in_dev);
2277 
2278 	return err;
2279 }
2280 
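/*
 * ip_mkroute_output_def()/ip_mkroute_output(): create the output cache entry
 * (or, with CONFIG_IP_ROUTE_MULTIPATH_CACHED, one entry per next hop) and
 * intern it into the hash chain keyed by the original flow's
 * (daddr, saddr, oif).
 */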
2281 static inline int ip_mkroute_output_def(struct rtable **rp,
2282 					struct fib_result* res,
2283 					const struct flowi *fl,
2284 					const struct flowi *oldflp,
2285 					struct net_device *dev_out,
2286 					unsigned flags)
2287 {
2288 	struct rtable *rth = NULL;
2289 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2290 	unsigned hash;
2291 	if (err == 0) {
2292 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2293 		err = rt_intern_hash(hash, rth, rp);
2294 	}
2295 
2296 	return err;
2297 }
2298 
2299 static inline int ip_mkroute_output(struct rtable** rp,
2300 				    struct fib_result* res,
2301 				    const struct flowi *fl,
2302 				    const struct flowi *oldflp,
2303 				    struct net_device *dev_out,
2304 				    unsigned flags)
2305 {
2306 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2307 	unsigned char hop;
2308 	unsigned hash;
2309 	int err = -EINVAL;
2310 	struct rtable *rth = NULL;
2311 
2312 	if (res->fi && res->fi->fib_nhs > 1) {
2313 		unsigned char hopcount = res->fi->fib_nhs;
2314 
2315 		for (hop = 0; hop < hopcount; hop++) {
2316 			struct net_device *dev2nexthop;
2317 
2318 			res->nh_sel = hop;
2319 
2320 			/* hold a work reference to the output device */
2321 			dev2nexthop = FIB_RES_DEV(*res);
2322 			dev_hold(dev2nexthop);
2323 
2324 			/* put reference to previous result */
2325 			if (hop)
2326 				ip_rt_put(*rp);
2327 
2328 			err = __mkroute_output(&rth, res, fl, oldflp,
2329 					       dev2nexthop, flags);
2330 
2331 			if (err != 0)
2332 				goto cleanup;
2333 
2334 			hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
2335 					oldflp->oif);
2336 			err = rt_intern_hash(hash, rth, rp);
2337 
2338 			/* forward hop information to multipath impl. */
2339 			multipath_set_nhinfo(rth,
2340 					     FIB_RES_NETWORK(*res),
2341 					     FIB_RES_NETMASK(*res),
2342 					     res->prefixlen,
2343 					     &FIB_RES_NH(*res));
2344 		cleanup:
2345 			/* release work reference to output device */
2346 			dev_put(dev2nexthop);
2347 
2348 			if (err != 0)
2349 				return err;
2350 		}
2351 		return err;
2352 	} else {
2353 		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2354 					     flags);
2355 	}
2356 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2357 	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2358 #endif
2359 }
2360 
2361 /*
2362  * Major route resolver routine.
2363  */
2364 
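/*
 * In outline, ip_route_output_slow() validates an explicit source address,
 * resolves an explicit oif to a device, short-circuits local and loopback
 * destinations, consults the FIB, selects a multipath next hop or the default
 * route where appropriate, and finally builds the cache entry via
 * ip_mkroute_output().
 */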
2365 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2366 {
2367 	u32 tos	= RT_FL_TOS(oldflp);
2368 	struct flowi fl = { .nl_u = { .ip4_u =
2369 				      { .daddr = oldflp->fl4_dst,
2370 					.saddr = oldflp->fl4_src,
2371 					.tos = tos & IPTOS_RT_MASK,
2372 					.scope = ((tos & RTO_ONLINK) ?
2373 						  RT_SCOPE_LINK :
2374 						  RT_SCOPE_UNIVERSE),
2375 				      } },
2376 			    .mark = oldflp->mark,
2377 			    .iif = loopback_dev.ifindex,
2378 			    .oif = oldflp->oif };
2379 	struct fib_result res;
2380 	unsigned flags = 0;
2381 	struct net_device *dev_out = NULL;
2382 	int free_res = 0;
2383 	int err;
2384 
2385 
2386 	res.fi		= NULL;
2387 #ifdef CONFIG_IP_MULTIPLE_TABLES
2388 	res.r		= NULL;
2389 #endif
2390 
2391 	if (oldflp->fl4_src) {
2392 		err = -EINVAL;
2393 		if (MULTICAST(oldflp->fl4_src) ||
2394 		    BADCLASS(oldflp->fl4_src) ||
2395 		    ZERONET(oldflp->fl4_src))
2396 			goto out;
2397 
2398 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2399 		dev_out = ip_dev_find(oldflp->fl4_src);
2400 		if (dev_out == NULL)
2401 			goto out;
2402 
2403 		/* I removed the check for oif == dev_out->oif here.
2404 		   It was wrong for two reasons:
2405 		   1. ip_dev_find(saddr) can return the wrong iface if saddr is
2406 		      assigned to multiple interfaces.
2407 		   2. Moreover, we are allowed to send packets with the saddr
2408 		      of another iface. --ANK
2409 		 */
2410 
2411 		if (oldflp->oif == 0
2412 		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2413 			/* Special hack: the user can direct multicasts
2414 			   and limited broadcast via the necessary interface
2415 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2416 			   This hack is not just for fun, it allows
2417 			   vic, vat and friends to work.
2418 			   They bind a socket to loopback, set the ttl to zero
2419 			   and expect that it will work.
2420 			   From the viewpoint of the routing cache they are broken,
2421 			   because we are not allowed to build a multicast path
2422 			   with a loopback source addr (look, the routing cache
2423 			   cannot know that the ttl is zero, so the packet
2424 			   will not leave this host and the route is valid).
2425 			   Luckily, this hack is a good workaround.
2426 			 */
2427 
2428 			fl.oif = dev_out->ifindex;
2429 			goto make_route;
2430 		}
2431 		if (dev_out)
2432 			dev_put(dev_out);
2433 		dev_out = NULL;
2434 	}
2435 
2436 
2437 	if (oldflp->oif) {
2438 		dev_out = dev_get_by_index(oldflp->oif);
2439 		err = -ENODEV;
2440 		if (dev_out == NULL)
2441 			goto out;
2442 
2443 		/* RACE: Check return value of inet_select_addr instead. */
2444 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2445 			dev_put(dev_out);
2446 			goto out;	/* Wrong error code */
2447 		}
2448 
2449 		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2450 			if (!fl.fl4_src)
2451 				fl.fl4_src = inet_select_addr(dev_out, 0,
2452 							      RT_SCOPE_LINK);
2453 			goto make_route;
2454 		}
2455 		if (!fl.fl4_src) {
2456 			if (MULTICAST(oldflp->fl4_dst))
2457 				fl.fl4_src = inet_select_addr(dev_out, 0,
2458 							      fl.fl4_scope);
2459 			else if (!oldflp->fl4_dst)
2460 				fl.fl4_src = inet_select_addr(dev_out, 0,
2461 							      RT_SCOPE_HOST);
2462 		}
2463 	}
2464 
2465 	if (!fl.fl4_dst) {
2466 		fl.fl4_dst = fl.fl4_src;
2467 		if (!fl.fl4_dst)
2468 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2469 		if (dev_out)
2470 			dev_put(dev_out);
2471 		dev_out = &loopback_dev;
2472 		dev_hold(dev_out);
2473 		fl.oif = loopback_dev.ifindex;
2474 		res.type = RTN_LOCAL;
2475 		flags |= RTCF_LOCAL;
2476 		goto make_route;
2477 	}
2478 
2479 	if (fib_lookup(&fl, &res)) {
2480 		res.fi = NULL;
2481 		if (oldflp->oif) {
2482 			/* Apparently, the routing tables are wrong. Assume
2483 			   that the destination is on-link.
2484 
2485 			   WHY? DW.
2486 			   Because we are allowed to send to an iface
2487 			   even if it has NO routes and NO assigned
2488 			   addresses. When oif is specified, the routing
2489 			   tables are looked up with only one purpose:
2490 			   to catch whether the destination is gatewayed, rather than
2491 			   direct. Moreover, if MSG_DONTROUTE is set,
2492 			   we send the packet, ignoring both routing tables
2493 			   and ifaddr state. --ANK
2494 
2495 
2496 			   We could make it even if oif is unknown,
2497 			   likely IPv6, but we do not.
2498 			 */
2499 
2500 			if (fl.fl4_src == 0)
2501 				fl.fl4_src = inet_select_addr(dev_out, 0,
2502 							      RT_SCOPE_LINK);
2503 			res.type = RTN_UNICAST;
2504 			goto make_route;
2505 		}
2506 		if (dev_out)
2507 			dev_put(dev_out);
2508 		err = -ENETUNREACH;
2509 		goto out;
2510 	}
2511 	free_res = 1;
2512 
2513 	if (res.type == RTN_LOCAL) {
2514 		if (!fl.fl4_src)
2515 			fl.fl4_src = fl.fl4_dst;
2516 		if (dev_out)
2517 			dev_put(dev_out);
2518 		dev_out = &loopback_dev;
2519 		dev_hold(dev_out);
2520 		fl.oif = dev_out->ifindex;
2521 		if (res.fi)
2522 			fib_info_put(res.fi);
2523 		res.fi = NULL;
2524 		flags |= RTCF_LOCAL;
2525 		goto make_route;
2526 	}
2527 
2528 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2529 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2530 		fib_select_multipath(&fl, &res);
2531 	else
2532 #endif
2533 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2534 		fib_select_default(&fl, &res);
2535 
2536 	if (!fl.fl4_src)
2537 		fl.fl4_src = FIB_RES_PREFSRC(res);
2538 
2539 	if (dev_out)
2540 		dev_put(dev_out);
2541 	dev_out = FIB_RES_DEV(res);
2542 	dev_hold(dev_out);
2543 	fl.oif = dev_out->ifindex;
2544 
2545 
2546 make_route:
2547 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2548 
2549 
2550 	if (free_res)
2551 		fib_res_put(&res);
2552 	if (dev_out)
2553 		dev_put(dev_out);
2554 out:	return err;
2555 }
2556 
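/*
 * __ip_route_output_key(): the output fast path.  Scan the hash chain under
 * rcu_read_lock_bh() for an entry matching the flow key (multipath-aware) and
 * fall back to ip_route_output_slow() on a miss.
 */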
2557 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2558 {
2559 	unsigned hash;
2560 	struct rtable *rth;
2561 
2562 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2563 
2564 	rcu_read_lock_bh();
2565 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2566 		rth = rcu_dereference(rth->u.rt_next)) {
2567 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2568 		    rth->fl.fl4_src == flp->fl4_src &&
2569 		    rth->fl.iif == 0 &&
2570 		    rth->fl.oif == flp->oif &&
2571 		    rth->fl.mark == flp->mark &&
2572 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2573 			    (IPTOS_RT_MASK | RTO_ONLINK))) {
2574 
2575 			/* check for multipath routes and choose one if
2576 			 * necessary
2577 			 */
2578 			if (multipath_select_route(flp, rth, rp)) {
2579 				dst_hold(&(*rp)->u.dst);
2580 				RT_CACHE_STAT_INC(out_hit);
2581 				rcu_read_unlock_bh();
2582 				return 0;
2583 			}
2584 
2585 			rth->u.dst.lastuse = jiffies;
2586 			dst_hold(&rth->u.dst);
2587 			rth->u.dst.__use++;
2588 			RT_CACHE_STAT_INC(out_hit);
2589 			rcu_read_unlock_bh();
2590 			*rp = rth;
2591 			return 0;
2592 		}
2593 		RT_CACHE_STAT_INC(out_hlist_search);
2594 	}
2595 	rcu_read_unlock_bh();
2596 
2597 	return ip_route_output_slow(rp, flp);
2598 }
2599 
2600 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2601 
2602 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2603 {
2604 	int err;
2605 
2606 	if ((err = __ip_route_output_key(rp, flp)) != 0)
2607 		return err;
2608 
2609 	if (flp->proto) {
2610 		if (!flp->fl4_src)
2611 			flp->fl4_src = (*rp)->rt_src;
2612 		if (!flp->fl4_dst)
2613 			flp->fl4_dst = (*rp)->rt_dst;
2614 		return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2615 	}
2616 
2617 	return 0;
2618 }
2619 
2620 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2621 
2622 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2623 {
2624 	return ip_route_output_flow(rp, flp, NULL, 0);
2625 }
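/*
 * A minimal caller sketch (hypothetical, not taken from this file): resolve an
 * output route for some destination dst_addr and drop the reference when done:
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst_addr } } };
 *	struct rtable *rt;
 *	int err = ip_route_output_key(&rt, &fl);
 *
 *	if (!err) {
 *		... use rt->rt_gateway, rt->u.dst.dev, and friends ...
 *		ip_rt_put(rt);
 *	}
 */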
2626 
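/*
 * rt_fill_info(): encode the route cached in skb->dst as an RTM_NEWROUTE
 * netlink message (destination, source, oif, gateway, metrics and cache
 * information).
 */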
2627 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2628 			int nowait, unsigned int flags)
2629 {
2630 	struct rtable *rt = (struct rtable*)skb->dst;
2631 	struct rtmsg *r;
2632 	struct nlmsghdr *nlh;
2633 	long expires;
2634 	u32 id = 0, ts = 0, tsage = 0, error;
2635 
2636 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2637 	if (nlh == NULL)
2638 		return -ENOBUFS;
2639 
2640 	r = nlmsg_data(nlh);
2641 	r->rtm_family	 = AF_INET;
2642 	r->rtm_dst_len	= 32;
2643 	r->rtm_src_len	= 0;
2644 	r->rtm_tos	= rt->fl.fl4_tos;
2645 	r->rtm_table	= RT_TABLE_MAIN;
2646 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2647 	r->rtm_type	= rt->rt_type;
2648 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2649 	r->rtm_protocol = RTPROT_UNSPEC;
2650 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2651 	if (rt->rt_flags & RTCF_NOTIFY)
2652 		r->rtm_flags |= RTM_F_NOTIFY;
2653 
2654 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2655 
2656 	if (rt->fl.fl4_src) {
2657 		r->rtm_src_len = 32;
2658 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2659 	}
2660 	if (rt->u.dst.dev)
2661 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2662 #ifdef CONFIG_NET_CLS_ROUTE
2663 	if (rt->u.dst.tclassid)
2664 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2665 #endif
2666 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2667 	if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2668 		NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
2669 #endif
2670 	if (rt->fl.iif)
2671 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2672 	else if (rt->rt_src != rt->fl.fl4_src)
2673 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2674 
2675 	if (rt->rt_dst != rt->rt_gateway)
2676 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2677 
2678 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2679 		goto nla_put_failure;
2680 
2681 	error = rt->u.dst.error;
2682 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2683 	if (rt->peer) {
2684 		id = rt->peer->ip_id_count;
2685 		if (rt->peer->tcp_ts_stamp) {
2686 			ts = rt->peer->tcp_ts;
2687 			tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2688 		}
2689 	}
2690 
2691 	if (rt->fl.iif) {
2692 #ifdef CONFIG_IP_MROUTE
2693 		__be32 dst = rt->rt_dst;
2694 
2695 		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2696 		    ipv4_devconf.mc_forwarding) {
2697 			int err = ipmr_get_route(skb, r, nowait);
2698 			if (err <= 0) {
2699 				if (!nowait) {
2700 					if (err == 0)
2701 						return 0;
2702 					goto nla_put_failure;
2703 				} else {
2704 					if (err == -EMSGSIZE)
2705 						goto nla_put_failure;
2706 					error = err;
2707 				}
2708 			}
2709 		} else
2710 #endif
2711 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2712 	}
2713 
2714 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2715 			       expires, error) < 0)
2716 		goto nla_put_failure;
2717 
2718 	return nlmsg_end(skb, nlh);
2719 
2720 nla_put_failure:
2721 	return nlmsg_cancel(skb, nlh);
2722 }
2723 
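/*
 * inet_rtm_getroute(): RTM_GETROUTE handler.  Build a dummy skb, resolve the
 * requested flow through ip_route_input() (when an input interface is given)
 * or ip_route_output_key(), and unicast the rt_fill_info() reply back to the
 * requester.
 */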
2724 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2725 {
2726 	struct rtmsg *rtm;
2727 	struct nlattr *tb[RTA_MAX+1];
2728 	struct rtable *rt = NULL;
2729 	__be32 dst = 0;
2730 	__be32 src = 0;
2731 	u32 iif;
2732 	int err;
2733 	struct sk_buff *skb;
2734 
2735 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2736 	if (err < 0)
2737 		goto errout;
2738 
2739 	rtm = nlmsg_data(nlh);
2740 
2741 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2742 	if (skb == NULL) {
2743 		err = -ENOBUFS;
2744 		goto errout;
2745 	}
2746 
2747 	/* Reserve room for dummy headers; this skb can pass
2748 	   through a good chunk of the routing engine.
2749 	 */
2750 	skb->mac.raw = skb->nh.raw = skb->data;
2751 
2752 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2753 	skb->nh.iph->protocol = IPPROTO_ICMP;
2754 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2755 
2756 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2757 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2758 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2759 
2760 	if (iif) {
2761 		struct net_device *dev;
2762 
2763 		dev = __dev_get_by_index(iif);
2764 		if (dev == NULL) {
2765 			err = -ENODEV;
2766 			goto errout_free;
2767 		}
2768 
2769 		skb->protocol	= htons(ETH_P_IP);
2770 		skb->dev	= dev;
2771 		local_bh_disable();
2772 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2773 		local_bh_enable();
2774 
2775 		rt = (struct rtable*) skb->dst;
2776 		if (err == 0 && rt->u.dst.error)
2777 			err = -rt->u.dst.error;
2778 	} else {
2779 		struct flowi fl = {
2780 			.nl_u = {
2781 				.ip4_u = {
2782 					.daddr = dst,
2783 					.saddr = src,
2784 					.tos = rtm->rtm_tos,
2785 				},
2786 			},
2787 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2788 		};
2789 		err = ip_route_output_key(&rt, &fl);
2790 	}
2791 
2792 	if (err)
2793 		goto errout_free;
2794 
2795 	skb->dst = &rt->u.dst;
2796 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2797 		rt->rt_flags |= RTCF_NOTIFY;
2798 
2799 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2800 				RTM_NEWROUTE, 0, 0);
2801 	if (err <= 0)
2802 		goto errout_free;
2803 
2804 	err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2805 errout:
2806 	return err;
2807 
2808 errout_free:
2809 	kfree_skb(skb);
2810 	goto errout;
2811 }
2812 
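/*
 * ip_rt_dump(): netlink dump callback.  Walk every hash bucket of the route
 * cache and emit one RTM_NEWROUTE message per entry, resuming from cb->args[]
 * across successive invocations.
 */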
2813 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2814 {
2815 	struct rtable *rt;
2816 	int h, s_h;
2817 	int idx, s_idx;
2818 
2819 	s_h = cb->args[0];
2820 	s_idx = idx = cb->args[1];
2821 	for (h = 0; h <= rt_hash_mask; h++) {
2822 		if (h < s_h) continue;
2823 		if (h > s_h)
2824 			s_idx = 0;
2825 		rcu_read_lock_bh();
2826 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2827 		     rt = rcu_dereference(rt->u.rt_next), idx++) {
2828 			if (idx < s_idx)
2829 				continue;
2830 			skb->dst = dst_clone(&rt->u.dst);
2831 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2832 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2833 					 1, NLM_F_MULTI) <= 0) {
2834 				dst_release(xchg(&skb->dst, NULL));
2835 				rcu_read_unlock_bh();
2836 				goto done;
2837 			}
2838 			dst_release(xchg(&skb->dst, NULL));
2839 		}
2840 		rcu_read_unlock_bh();
2841 	}
2842 
2843 done:
2844 	cb->args[0] = h;
2845 	cb->args[1] = idx;
2846 	return skb->len;
2847 }
2848 
2849 void ip_rt_multicast_event(struct in_device *in_dev)
2850 {
2851 	rt_cache_flush(0);
2852 }
2853 
2854 #ifdef CONFIG_SYSCTL
2855 static int flush_delay;
2856 
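/*
 * Writing an integer delay to the write-only "flush" entry below triggers
 * rt_cache_flush().  With the usual sysctl layout this is reachable from
 * userspace as, for example:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 */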
2857 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2858 					struct file *filp, void __user *buffer,
2859 					size_t *lenp, loff_t *ppos)
2860 {
2861 	if (write) {
2862 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2863 		rt_cache_flush(flush_delay);
2864 		return 0;
2865 	}
2866 
2867 	return -EINVAL;
2868 }
2869 
2870 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2871 						int __user *name,
2872 						int nlen,
2873 						void __user *oldval,
2874 						size_t __user *oldlenp,
2875 						void __user *newval,
2876 						size_t newlen)
2877 {
2878 	int delay;
2879 	if (newlen != sizeof(int))
2880 		return -EINVAL;
2881 	if (get_user(delay, (int __user *)newval))
2882 		return -EFAULT;
2883 	rt_cache_flush(delay);
2884 	return 0;
2885 }
2886 
2887 ctl_table ipv4_route_table[] = {
2888 	{
2889 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2890 		.procname	= "flush",
2891 		.data		= &flush_delay,
2892 		.maxlen		= sizeof(int),
2893 		.mode		= 0200,
2894 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2895 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2896 	},
2897 	{
2898 		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
2899 		.procname	= "min_delay",
2900 		.data		= &ip_rt_min_delay,
2901 		.maxlen		= sizeof(int),
2902 		.mode		= 0644,
2903 		.proc_handler	= &proc_dointvec_jiffies,
2904 		.strategy	= &sysctl_jiffies,
2905 	},
2906 	{
2907 		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
2908 		.procname	= "max_delay",
2909 		.data		= &ip_rt_max_delay,
2910 		.maxlen		= sizeof(int),
2911 		.mode		= 0644,
2912 		.proc_handler	= &proc_dointvec_jiffies,
2913 		.strategy	= &sysctl_jiffies,
2914 	},
2915 	{
2916 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2917 		.procname	= "gc_thresh",
2918 		.data		= &ipv4_dst_ops.gc_thresh,
2919 		.maxlen		= sizeof(int),
2920 		.mode		= 0644,
2921 		.proc_handler	= &proc_dointvec,
2922 	},
2923 	{
2924 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2925 		.procname	= "max_size",
2926 		.data		= &ip_rt_max_size,
2927 		.maxlen		= sizeof(int),
2928 		.mode		= 0644,
2929 		.proc_handler	= &proc_dointvec,
2930 	},
2931 	{
2932 		/*  Deprecated. Use gc_min_interval_ms */
2933 
2934 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2935 		.procname	= "gc_min_interval",
2936 		.data		= &ip_rt_gc_min_interval,
2937 		.maxlen		= sizeof(int),
2938 		.mode		= 0644,
2939 		.proc_handler	= &proc_dointvec_jiffies,
2940 		.strategy	= &sysctl_jiffies,
2941 	},
2942 	{
2943 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2944 		.procname	= "gc_min_interval_ms",
2945 		.data		= &ip_rt_gc_min_interval,
2946 		.maxlen		= sizeof(int),
2947 		.mode		= 0644,
2948 		.proc_handler	= &proc_dointvec_ms_jiffies,
2949 		.strategy	= &sysctl_ms_jiffies,
2950 	},
2951 	{
2952 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2953 		.procname	= "gc_timeout",
2954 		.data		= &ip_rt_gc_timeout,
2955 		.maxlen		= sizeof(int),
2956 		.mode		= 0644,
2957 		.proc_handler	= &proc_dointvec_jiffies,
2958 		.strategy	= &sysctl_jiffies,
2959 	},
2960 	{
2961 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2962 		.procname	= "gc_interval",
2963 		.data		= &ip_rt_gc_interval,
2964 		.maxlen		= sizeof(int),
2965 		.mode		= 0644,
2966 		.proc_handler	= &proc_dointvec_jiffies,
2967 		.strategy	= &sysctl_jiffies,
2968 	},
2969 	{
2970 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2971 		.procname	= "redirect_load",
2972 		.data		= &ip_rt_redirect_load,
2973 		.maxlen		= sizeof(int),
2974 		.mode		= 0644,
2975 		.proc_handler	= &proc_dointvec,
2976 	},
2977 	{
2978 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2979 		.procname	= "redirect_number",
2980 		.data		= &ip_rt_redirect_number,
2981 		.maxlen		= sizeof(int),
2982 		.mode		= 0644,
2983 		.proc_handler	= &proc_dointvec,
2984 	},
2985 	{
2986 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2987 		.procname	= "redirect_silence",
2988 		.data		= &ip_rt_redirect_silence,
2989 		.maxlen		= sizeof(int),
2990 		.mode		= 0644,
2991 		.proc_handler	= &proc_dointvec,
2992 	},
2993 	{
2994 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
2995 		.procname	= "error_cost",
2996 		.data		= &ip_rt_error_cost,
2997 		.maxlen		= sizeof(int),
2998 		.mode		= 0644,
2999 		.proc_handler	= &proc_dointvec,
3000 	},
3001 	{
3002 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
3003 		.procname	= "error_burst",
3004 		.data		= &ip_rt_error_burst,
3005 		.maxlen		= sizeof(int),
3006 		.mode		= 0644,
3007 		.proc_handler	= &proc_dointvec,
3008 	},
3009 	{
3010 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
3011 		.procname	= "gc_elasticity",
3012 		.data		= &ip_rt_gc_elasticity,
3013 		.maxlen		= sizeof(int),
3014 		.mode		= 0644,
3015 		.proc_handler	= &proc_dointvec,
3016 	},
3017 	{
3018 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
3019 		.procname	= "mtu_expires",
3020 		.data		= &ip_rt_mtu_expires,
3021 		.maxlen		= sizeof(int),
3022 		.mode		= 0644,
3023 		.proc_handler	= &proc_dointvec_jiffies,
3024 		.strategy	= &sysctl_jiffies,
3025 	},
3026 	{
3027 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
3028 		.procname	= "min_pmtu",
3029 		.data		= &ip_rt_min_pmtu,
3030 		.maxlen		= sizeof(int),
3031 		.mode		= 0644,
3032 		.proc_handler	= &proc_dointvec,
3033 	},
3034 	{
3035 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
3036 		.procname	= "min_adv_mss",
3037 		.data		= &ip_rt_min_advmss,
3038 		.maxlen		= sizeof(int),
3039 		.mode		= 0644,
3040 		.proc_handler	= &proc_dointvec,
3041 	},
3042 	{
3043 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3044 		.procname	= "secret_interval",
3045 		.data		= &ip_rt_secret_interval,
3046 		.maxlen		= sizeof(int),
3047 		.mode		= 0644,
3048 		.proc_handler	= &proc_dointvec_jiffies,
3049 		.strategy	= &sysctl_jiffies,
3050 	},
3051 	{ .ctl_name = 0 }
3052 };
3053 #endif
3054 
3055 #ifdef CONFIG_NET_CLS_ROUTE
3056 struct ip_rt_acct *ip_rt_acct;
3057 
3058 /* This code sucks.  But you should have seen it before! --RR */
3059 
3060 /* IP route accounting ptr for this logical cpu number. */
3061 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3062 
3063 #ifdef CONFIG_PROC_FS
3064 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3065 			   int length, int *eof, void *data)
3066 {
3067 	unsigned int i;
3068 
3069 	if ((offset & 3) || (length & 3))
3070 		return -EIO;
3071 
3072 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
3073 		*eof = 1;
3074 		return 0;
3075 	}
3076 
3077 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3078 		length = sizeof(struct ip_rt_acct) * 256 - offset;
3079 		*eof = 1;
3080 	}
3081 
3082 	offset /= sizeof(u32);
3083 
3084 	if (length > 0) {
3085 		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3086 		u32 *dst = (u32 *) buffer;
3087 
3088 		/* Copy first cpu. */
3089 		*start = buffer;
3090 		memcpy(dst, src, length);
3091 
3092 		/* Add the other cpus in, one int at a time */
3093 		for_each_possible_cpu(i) {
3094 			unsigned int j;
3095 
3096 			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3097 
3098 			for (j = 0; j < length/4; j++)
3099 				dst[j] += src[j];
3100 		}
3101 	}
3102 	return length;
3103 }
3104 #endif /* CONFIG_PROC_FS */
3105 #endif /* CONFIG_NET_CLS_ROUTE */
3106 
3107 static __initdata unsigned long rhash_entries;
3108 static int __init set_rhash_entries(char *str)
3109 {
3110 	if (!str)
3111 		return 0;
3112 	rhash_entries = simple_strtoul(str, &str, 0);
3113 	return 1;
3114 }
3115 __setup("rhash_entries=", set_rhash_entries);
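/*
 * The route cache hash size can be forced at boot time, e.g. by appending
 * "rhash_entries=65536" to the kernel command line; the parsed value is
 * passed to alloc_large_system_hash() in ip_rt_init() below.
 */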
3116 
3117 int __init ip_rt_init(void)
3118 {
3119 	int rc = 0;
3120 
3121 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3122 			     (jiffies ^ (jiffies >> 7)));
3123 
3124 #ifdef CONFIG_NET_CLS_ROUTE
3125 	{
3126 	int order;
3127 	for (order = 0;
3128 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3129 		/* NOTHING */;
3130 	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3131 	if (!ip_rt_acct)
3132 		panic("IP: failed to allocate ip_rt_acct\n");
3133 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
3134 	}
3135 #endif
3136 
3137 	ipv4_dst_ops.kmem_cachep =
3138 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3139 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
3140 
3141 	rt_hash_table = (struct rt_hash_bucket *)
3142 		alloc_large_system_hash("IP route cache",
3143 					sizeof(struct rt_hash_bucket),
3144 					rhash_entries,
3145 					(num_physpages >= 128 * 1024) ?
3146 					15 : 17,
3147 					0,
3148 					&rt_hash_log,
3149 					&rt_hash_mask,
3150 					0);
3151 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3152 	rt_hash_lock_init();
3153 
3154 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3155 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3156 
3157 	devinet_init();
3158 	ip_fib_init();
3159 
3160 	init_timer(&rt_flush_timer);
3161 	rt_flush_timer.function = rt_run_flush;
3162 	init_timer(&rt_periodic_timer);
3163 	rt_periodic_timer.function = rt_check_expire;
3164 	init_timer(&rt_secret_timer);
3165 	rt_secret_timer.function = rt_secret_rebuild;
3166 
3167 	/* All the timers started at system startup tend
3168 	   to synchronize. Perturb them a bit.
3169 	 */
3170 	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3171 					ip_rt_gc_interval;
3172 	add_timer(&rt_periodic_timer);
3173 
3174 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3175 		ip_rt_secret_interval;
3176 	add_timer(&rt_secret_timer);
3177 
3178 #ifdef CONFIG_PROC_FS
3179 	{
3180 	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3181 	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3182 	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3183 			    		     proc_net_stat))) {
3184 		return -ENOMEM;
3185 	}
3186 	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3187 	}
3188 #ifdef CONFIG_NET_CLS_ROUTE
3189 	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3190 #endif
3191 #endif
3192 #ifdef CONFIG_XFRM
3193 	xfrm_init();
3194 	xfrm4_init();
3195 #endif
3196 	return rc;
3197 }
3198 
3199 EXPORT_SYMBOL(__ip_select_ident);
3200 EXPORT_SYMBOL(ip_route_input);
3201 EXPORT_SYMBOL(ip_route_output_key);
3202