xref: /linux/net/ipv4/route.c (revision f3d9478b2ce468c3115b02ecae7e975990697f15)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *		Alan Cox	:	Verify area fixes.
18  *		Alan Cox	:	cli() protects routing changes
19  *		Rui Oliveira	:	ICMP routing table updates
20  *		(rco@di.uminho.pt)	Routing table insertion and update
21  *		Linus Torvalds	:	Rewrote bits to be sensible
22  *		Alan Cox	:	Added BSD route gw semantics
23  *		Alan Cox	:	Super /proc >4K
24  *		Alan Cox	:	MTU in route table
25  *		Alan Cox	: 	MSS actually. Also added the window
26  *					clamper.
27  *		Sam Lantinga	:	Fixed route matching in rt_del()
28  *		Alan Cox	:	Routing cache support.
29  *		Alan Cox	:	Removed compatibility cruft.
30  *		Alan Cox	:	RTF_REJECT support.
31  *		Alan Cox	:	TCP irtt support.
32  *		Jonathan Naylor	:	Added Metric support.
33  *	Miquel van Smoorenburg	:	BSD API fixes.
34  *	Miquel van Smoorenburg	:	Metrics.
35  *		Alan Cox	:	Use __u32 properly
 36  *		Alan Cox	:	Aligned routing errors more closely with BSD;
37  *					our system is still very different.
38  *		Alan Cox	:	Faster /proc handling
39  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40  *					routing caches and better behaviour.
41  *
42  *		Olaf Erb	:	irtt wasn't being copied right.
43  *		Bjorn Ekwall	:	Kerneld route support.
44  *		Alan Cox	:	Multicast fixed (I hope)
45  * 		Pavel Krauz	:	Limited broadcast fixed
46  *		Mike McLagan	:	Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
48  *					route.c and rewritten from scratch.
49  *		Andi Kleen	:	Load-limit warning messages.
 50  *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
51  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54  *		Marc Boucher	:	routing by fwmark
55  *	Robert Olsson		:	Added rt_cache statistics
56  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
57  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
58  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
59  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
60  *
61  *		This program is free software; you can redistribute it and/or
62  *		modify it under the terms of the GNU General Public License
63  *		as published by the Free Software Foundation; either version
64  *		2 of the License, or (at your option) any later version.
65  */
66 
67 #include <linux/config.h>
68 #include <linux/module.h>
69 #include <asm/uaccess.h>
70 #include <asm/system.h>
71 #include <linux/bitops.h>
72 #include <linux/types.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
75 #include <linux/mm.h>
76 #include <linux/bootmem.h>
77 #include <linux/string.h>
78 #include <linux/socket.h>
79 #include <linux/sockios.h>
80 #include <linux/errno.h>
81 #include <linux/in.h>
82 #include <linux/inet.h>
83 #include <linux/netdevice.h>
84 #include <linux/proc_fs.h>
85 #include <linux/init.h>
86 #include <linux/skbuff.h>
87 #include <linux/rtnetlink.h>
88 #include <linux/inetdevice.h>
89 #include <linux/igmp.h>
90 #include <linux/pkt_sched.h>
91 #include <linux/mroute.h>
92 #include <linux/netfilter_ipv4.h>
93 #include <linux/random.h>
94 #include <linux/jhash.h>
95 #include <linux/rcupdate.h>
96 #include <linux/times.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/ip_mp_alg.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 
112 #define RT_FL_TOS(oldflp) \
113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 
115 #define IP_MAX_MTU	0xFFF0
116 
117 #define RT_GC_TIMEOUT (300*HZ)
118 
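/*
 * The tunables below are expressed in jiffies unless they are obviously
 * byte counts: for example ip_rt_min_pmtu = 512 + 20 + 20 = 552 bytes
 * (payload plus IP and TCP headers), while ip_rt_mtu_expires = 10 * 60 * HZ
 * is ten minutes.  Most of them correspond to the /proc/sys/net/ipv4/route/
 * sysctls.
 */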
119 static int ip_rt_min_delay		= 2 * HZ;
120 static int ip_rt_max_delay		= 10 * HZ;
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval		= 60 * HZ;
124 static int ip_rt_gc_min_interval	= HZ / 2;
125 static int ip_rt_redirect_number	= 9;
126 static int ip_rt_redirect_load		= HZ / 50;
127 static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost		= HZ;
129 static int ip_rt_error_burst		= 5 * HZ;
130 static int ip_rt_gc_elasticity		= 8;
131 static int ip_rt_mtu_expires		= 10 * 60 * HZ;
132 static int ip_rt_min_pmtu		= 512 + 20 + 20;
133 static int ip_rt_min_advmss		= 256;
134 static int ip_rt_secret_interval	= 10 * 60 * HZ;
135 static unsigned long rt_deadline;
136 
137 #define RTprint(a...)	printk(KERN_DEBUG a)
138 
139 static struct timer_list rt_flush_timer;
140 static struct timer_list rt_periodic_timer;
141 static struct timer_list rt_secret_timer;
142 
143 /*
144  *	Interface to generic destination cache.
145  */
146 
147 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
148 static void		 ipv4_dst_destroy(struct dst_entry *dst);
149 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
150 					 struct net_device *dev, int how);
151 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
152 static void		 ipv4_link_failure(struct sk_buff *skb);
153 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
154 static int rt_garbage_collect(void);
155 
156 
157 static struct dst_ops ipv4_dst_ops = {
158 	.family =		AF_INET,
159 	.protocol =		__constant_htons(ETH_P_IP),
160 	.gc =			rt_garbage_collect,
161 	.check =		ipv4_dst_check,
162 	.destroy =		ipv4_dst_destroy,
163 	.ifdown =		ipv4_dst_ifdown,
164 	.negative_advice =	ipv4_negative_advice,
165 	.link_failure =		ipv4_link_failure,
166 	.update_pmtu =		ip_rt_update_pmtu,
167 	.entry_size =		sizeof(struct rtable),
168 };
169 
170 #define ECN_OR_COST(class)	TC_PRIO_##class
171 
172 __u8 ip_tos2prio[16] = {
173 	TC_PRIO_BESTEFFORT,
174 	ECN_OR_COST(FILLER),
175 	TC_PRIO_BESTEFFORT,
176 	ECN_OR_COST(BESTEFFORT),
177 	TC_PRIO_BULK,
178 	ECN_OR_COST(BULK),
179 	TC_PRIO_BULK,
180 	ECN_OR_COST(BULK),
181 	TC_PRIO_INTERACTIVE,
182 	ECN_OR_COST(INTERACTIVE),
183 	TC_PRIO_INTERACTIVE,
184 	ECN_OR_COST(INTERACTIVE),
185 	TC_PRIO_INTERACTIVE_BULK,
186 	ECN_OR_COST(INTERACTIVE_BULK),
187 	TC_PRIO_INTERACTIVE_BULK,
188 	ECN_OR_COST(INTERACTIVE_BULK)
189 };
190 
191 
192 /*
193  * Route cache.
194  */
195 
196 /* The locking scheme is rather straightforward:
197  *
198  * 1) Read-Copy Update protects the buckets of the central route hash.
199  * 2) Only writers remove entries, and they hold the lock
200  *    as they look at rtable reference counts.
201  * 3) Only readers acquire references to rtable entries,
202  *    they do so with atomic increments and with the
203  *    lock held.
204  */
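/*
 * Concretely (see rt_intern_hash() and rt_check_expire() below): a writer
 * takes the per-chain spinlock via rt_hash_lock_addr(hash) before unlinking
 * entries, while readers walk a chain under rcu_read_lock()/rcu_read_lock_bh()
 * and take a reference with dst_hold() on any entry they hand out.
 */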
205 
206 struct rt_hash_bucket {
207 	struct rtable	*chain;
208 };
209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
210 /*
211  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
212  * The size of this table is a power of two and depends on the number of CPUs.
213  */
214 #if NR_CPUS >= 32
215 #define RT_HASH_LOCK_SZ	4096
216 #elif NR_CPUS >= 16
217 #define RT_HASH_LOCK_SZ	2048
218 #elif NR_CPUS >= 8
219 #define RT_HASH_LOCK_SZ	1024
220 #elif NR_CPUS >= 4
221 #define RT_HASH_LOCK_SZ	512
222 #else
223 #define RT_HASH_LOCK_SZ	256
224 #endif
225 
226 static spinlock_t	*rt_hash_locks;
227 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
228 # define rt_hash_lock_init()	{ \
229 		int i; \
230 		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
231 		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
232 		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
233 			spin_lock_init(&rt_hash_locks[i]); \
234 		}
235 #else
236 # define rt_hash_lock_addr(slot) NULL
237 # define rt_hash_lock_init()
238 #endif
239 
240 static struct rt_hash_bucket 	*rt_hash_table;
241 static unsigned			rt_hash_mask;
242 static int			rt_hash_log;
243 static unsigned int		rt_hash_rnd;
244 
245 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
246 #define RT_CACHE_STAT_INC(field) \
247 	(per_cpu(rt_cache_stat, raw_smp_processor_id()).field++)
248 
249 static int rt_intern_hash(unsigned hash, struct rtable *rth,
250 				struct rtable **res);
251 
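/*
 * Hash a (daddr, saddr) pair into a chain index.  jhash_2words() mixes in
 * rt_hash_rnd, which rt_secret_rebuild() re-randomizes periodically so that
 * the bucket layout cannot easily be predicted and attacked from outside.
 */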
252 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
253 {
254 	return (jhash_2words(daddr, saddr, rt_hash_rnd)
255 		& rt_hash_mask);
256 }
257 
258 #ifdef CONFIG_PROC_FS
259 struct rt_cache_iter_state {
260 	int bucket;
261 };
262 
263 static struct rtable *rt_cache_get_first(struct seq_file *seq)
264 {
265 	struct rtable *r = NULL;
266 	struct rt_cache_iter_state *st = seq->private;
267 
268 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
269 		rcu_read_lock_bh();
270 		r = rt_hash_table[st->bucket].chain;
271 		if (r)
272 			break;
273 		rcu_read_unlock_bh();
274 	}
275 	return r;
276 }
277 
278 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
279 {
280 	struct rt_cache_iter_state *st = rcu_dereference(seq->private);
281 
282 	r = r->u.rt_next;
283 	while (!r) {
284 		rcu_read_unlock_bh();
285 		if (--st->bucket < 0)
286 			break;
287 		rcu_read_lock_bh();
288 		r = rt_hash_table[st->bucket].chain;
289 	}
290 	return r;
291 }
292 
293 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
294 {
295 	struct rtable *r = rt_cache_get_first(seq);
296 
297 	if (r)
298 		while (pos && (r = rt_cache_get_next(seq, r)))
299 			--pos;
300 	return pos ? NULL : r;
301 }
302 
303 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
304 {
305 	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
306 }
307 
308 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
309 {
310 	struct rtable *r = NULL;
311 
312 	if (v == SEQ_START_TOKEN)
313 		r = rt_cache_get_first(seq);
314 	else
315 		r = rt_cache_get_next(seq, v);
316 	++*pos;
317 	return r;
318 }
319 
320 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
321 {
322 	if (v && v != SEQ_START_TOKEN)
323 		rcu_read_unlock_bh();
324 }
325 
326 static int rt_cache_seq_show(struct seq_file *seq, void *v)
327 {
328 	if (v == SEQ_START_TOKEN)
329 		seq_printf(seq, "%-127s\n",
330 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
331 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
332 			   "HHUptod\tSpecDst");
333 	else {
334 		struct rtable *r = v;
335 		char temp[256];
336 
337 		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
338 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
339 			r->u.dst.dev ? r->u.dst.dev->name : "*",
340 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
341 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
342 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
343 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
344 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
345 			dst_metric(&r->u.dst, RTAX_WINDOW),
346 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
347 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
348 			r->fl.fl4_tos,
349 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
350 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
351 				       dev_queue_xmit) : 0,
352 			r->rt_spec_dst);
353 		seq_printf(seq, "%-127s\n", temp);
354         }
355   	return 0;
356 }
357 
358 static struct seq_operations rt_cache_seq_ops = {
359 	.start  = rt_cache_seq_start,
360 	.next   = rt_cache_seq_next,
361 	.stop   = rt_cache_seq_stop,
362 	.show   = rt_cache_seq_show,
363 };
364 
365 static int rt_cache_seq_open(struct inode *inode, struct file *file)
366 {
367 	struct seq_file *seq;
368 	int rc = -ENOMEM;
369 	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
370 
371 	if (!s)
372 		goto out;
373 	rc = seq_open(file, &rt_cache_seq_ops);
374 	if (rc)
375 		goto out_kfree;
376 	seq          = file->private_data;
377 	seq->private = s;
378 	memset(s, 0, sizeof(*s));
379 out:
380 	return rc;
381 out_kfree:
382 	kfree(s);
383 	goto out;
384 }
385 
386 static struct file_operations rt_cache_seq_fops = {
387 	.owner	 = THIS_MODULE,
388 	.open	 = rt_cache_seq_open,
389 	.read	 = seq_read,
390 	.llseek	 = seq_lseek,
391 	.release = seq_release_private,
392 };
393 
394 
395 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
396 {
397 	int cpu;
398 
399 	if (*pos == 0)
400 		return SEQ_START_TOKEN;
401 
402 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
403 		if (!cpu_possible(cpu))
404 			continue;
405 		*pos = cpu+1;
406 		return &per_cpu(rt_cache_stat, cpu);
407 	}
408 	return NULL;
409 }
410 
411 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
412 {
413 	int cpu;
414 
415 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
416 		if (!cpu_possible(cpu))
417 			continue;
418 		*pos = cpu+1;
419 		return &per_cpu(rt_cache_stat, cpu);
420 	}
421 	return NULL;
422 
423 }
424 
425 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
426 {
427 
428 }
429 
430 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
431 {
432 	struct rt_cache_stat *st = v;
433 
434 	if (v == SEQ_START_TOKEN) {
435 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
436 		return 0;
437 	}
438 
439 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
440 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
441 		   atomic_read(&ipv4_dst_ops.entries),
442 		   st->in_hit,
443 		   st->in_slow_tot,
444 		   st->in_slow_mc,
445 		   st->in_no_route,
446 		   st->in_brd,
447 		   st->in_martian_dst,
448 		   st->in_martian_src,
449 
450 		   st->out_hit,
451 		   st->out_slow_tot,
452 		   st->out_slow_mc,
453 
454 		   st->gc_total,
455 		   st->gc_ignored,
456 		   st->gc_goal_miss,
457 		   st->gc_dst_overflow,
458 		   st->in_hlist_search,
459 		   st->out_hlist_search
460 		);
461 	return 0;
462 }
463 
464 static struct seq_operations rt_cpu_seq_ops = {
465 	.start  = rt_cpu_seq_start,
466 	.next   = rt_cpu_seq_next,
467 	.stop   = rt_cpu_seq_stop,
468 	.show   = rt_cpu_seq_show,
469 };
470 
471 
472 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
473 {
474 	return seq_open(file, &rt_cpu_seq_ops);
475 }
476 
477 static struct file_operations rt_cpu_seq_fops = {
478 	.owner	 = THIS_MODULE,
479 	.open	 = rt_cpu_seq_open,
480 	.read	 = seq_read,
481 	.llseek	 = seq_lseek,
482 	.release = seq_release,
483 };
484 
485 #endif /* CONFIG_PROC_FS */
486 
487 static __inline__ void rt_free(struct rtable *rt)
488 {
489 	multipath_remove(rt);
490 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
491 }
492 
493 static __inline__ void rt_drop(struct rtable *rt)
494 {
495 	multipath_remove(rt);
496 	ip_rt_put(rt);
497 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
498 }
499 
500 static __inline__ int rt_fast_clean(struct rtable *rth)
501 {
502 	/* Kill broadcast/multicast entries very aggressively, if they
503 	   collide in the hash table with more useful entries */
504 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
505 		rth->fl.iif && rth->u.rt_next;
506 }
507 
508 static __inline__ int rt_valuable(struct rtable *rth)
509 {
510 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
511 		rth->u.dst.expires;
512 }
513 
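/*
 * May an unreferenced cache entry be evicted?  Entries that are still held
 * (nonzero __refcnt) never expire here, and entries whose hard expiry time
 * has passed always may.  Otherwise an entry survives while it is younger
 * than tmo1 (tmo2 for "valuable" entries), except that broadcast/multicast
 * collision victims (rt_fast_clean()) get no tmo1 grace at all.
 */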
514 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
515 {
516 	unsigned long age;
517 	int ret = 0;
518 
519 	if (atomic_read(&rth->u.dst.__refcnt))
520 		goto out;
521 
522 	ret = 1;
523 	if (rth->u.dst.expires &&
524 	    time_after_eq(jiffies, rth->u.dst.expires))
525 		goto out;
526 
527 	age = jiffies - rth->u.dst.lastuse;
528 	ret = 0;
529 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
530 	    (age <= tmo2 && rt_valuable(rth)))
531 		goto out;
532 	ret = 1;
533 out:	return ret;
534 }
535 
536 /* Bits of score are:
537  * 31: very valuable
538  * 30: not quite useless
539  * 29..0: usage counter
540  */
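/*
 * rt_intern_hash() uses this score to pick an eviction candidate: while
 * scanning a chain it remembers the unreferenced entry with the lowest
 * score, i.e. the least valuable, least recently used one.
 */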
541 static inline u32 rt_score(struct rtable *rt)
542 {
543 	u32 score = jiffies - rt->u.dst.lastuse;
544 
545 	score = ~score & ~(3<<30);
546 
547 	if (rt_valuable(rt))
548 		score |= (1<<31);
549 
550 	if (!rt->fl.iif ||
551 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
552 		score |= (1<<30);
553 
554 	return score;
555 }
556 
557 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
558 {
559 	return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
560 	       fl1->oif     == fl2->oif &&
561 	       fl1->iif     == fl2->iif;
562 }
563 
564 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
565 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
566 						struct rtable *expentry,
567 						int *removed_count)
568 {
569 	int passedexpired = 0;
570 	struct rtable **nextstep = NULL;
571 	struct rtable **rthp = chain_head;
572 	struct rtable *rth;
573 
574 	if (removed_count)
575 		*removed_count = 0;
576 
577 	while ((rth = *rthp) != NULL) {
578 		if (rth == expentry)
579 			passedexpired = 1;
580 
581 		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
582 		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
583 			if (*rthp == expentry) {
584 				*rthp = rth->u.rt_next;
585 				continue;
586 			} else {
587 				*rthp = rth->u.rt_next;
588 				rt_free(rth);
589 				if (removed_count)
590 					++(*removed_count);
591 			}
592 		} else {
593 			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
594 			    passedexpired && !nextstep)
595 				nextstep = &rth->u.rt_next;
596 
597 			rthp = &rth->u.rt_next;
598 		}
599 	}
600 
601 	rt_free(expentry);
602 	if (removed_count)
603 		++(*removed_count);
604 
605 	return nextstep;
606 }
607 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
608 
609 
610 /* This runs via a timer and thus is always in BH context. */
611 static void rt_check_expire(unsigned long dummy)
612 {
613 	static unsigned int rover;
614 	unsigned int i = rover, goal;
615 	struct rtable *rth, **rthp;
616 	unsigned long now = jiffies;
617 	u64 mult;
618 
619 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
620 	if (ip_rt_gc_timeout > 1)
621 		do_div(mult, ip_rt_gc_timeout);
622 	goal = (unsigned int)mult;
623 	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
624 	for (; goal > 0; goal--) {
625 		unsigned long tmo = ip_rt_gc_timeout;
626 
627 		i = (i + 1) & rt_hash_mask;
628 		rthp = &rt_hash_table[i].chain;
629 
630 		if (*rthp == 0)
631 			continue;
632 		spin_lock(rt_hash_lock_addr(i));
633 		while ((rth = *rthp) != NULL) {
634 			if (rth->u.dst.expires) {
635 				/* Entry is expired even if it is in use */
636 				if (time_before_eq(now, rth->u.dst.expires)) {
637 					tmo >>= 1;
638 					rthp = &rth->u.rt_next;
639 					continue;
640 				}
641 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
642 				tmo >>= 1;
643 				rthp = &rth->u.rt_next;
644 				continue;
645 			}
646 
647 			/* Clean up aged-off entries. */
648 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
649 			/* remove all related balanced entries if necessary */
650 			if (rth->u.dst.flags & DST_BALANCED) {
651 				rthp = rt_remove_balanced_route(
652 					&rt_hash_table[i].chain,
653 					rth, NULL);
654 				if (!rthp)
655 					break;
656 			} else {
657 				*rthp = rth->u.rt_next;
658 				rt_free(rth);
659 			}
660 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
661  			*rthp = rth->u.rt_next;
662  			rt_free(rth);
663 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
664 		}
665 		spin_unlock(rt_hash_lock_addr(i));
666 
667 		/* Fallback loop breaker. */
668 		if (time_after(jiffies, now))
669 			break;
670 	}
671 	rover = i;
672 	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
673 }
674 
675 /* This can run from both BH and non-BH contexts, the latter
676  * in the case of a forced flush event.
677  */
678 static void rt_run_flush(unsigned long dummy)
679 {
680 	int i;
681 	struct rtable *rth, *next;
682 
683 	rt_deadline = 0;
684 
685 	get_random_bytes(&rt_hash_rnd, 4);
686 
687 	for (i = rt_hash_mask; i >= 0; i--) {
688 		spin_lock_bh(rt_hash_lock_addr(i));
689 		rth = rt_hash_table[i].chain;
690 		if (rth)
691 			rt_hash_table[i].chain = NULL;
692 		spin_unlock_bh(rt_hash_lock_addr(i));
693 
694 		for (; rth; rth = next) {
695 			next = rth->u.rt_next;
696 			rt_free(rth);
697 		}
698 	}
699 }
700 
701 static DEFINE_SPINLOCK(rt_flush_lock);
702 
703 void rt_cache_flush(int delay)
704 {
705 	unsigned long now = jiffies;
706 	int user_mode = !in_softirq();
707 
708 	if (delay < 0)
709 		delay = ip_rt_min_delay;
710 
711 	/* flush existing multipath state */
712 	multipath_flush();
713 
714 	spin_lock_bh(&rt_flush_lock);
715 
716 	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
717 		long tmo = (long)(rt_deadline - now);
718 
719 		/* If the flush timer is already running
720 		   and the flush request is not immediate (delay > 0):
721 
722 		   if the deadline has not been reached, prolong the timer to "delay",
723 		   otherwise fire it at the deadline time.
724 		 */
725 
726 		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
727 			tmo = 0;
728 
729 		if (delay > tmo)
730 			delay = tmo;
731 	}
732 
733 	if (delay <= 0) {
734 		spin_unlock_bh(&rt_flush_lock);
735 		rt_run_flush(0);
736 		return;
737 	}
738 
739 	if (rt_deadline == 0)
740 		rt_deadline = now + ip_rt_max_delay;
741 
742 	mod_timer(&rt_flush_timer, now+delay);
743 	spin_unlock_bh(&rt_flush_lock);
744 }
745 
746 static void rt_secret_rebuild(unsigned long dummy)
747 {
748 	unsigned long now = jiffies;
749 
750 	rt_cache_flush(0);
751 	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
752 }
753 
754 /*
755    Short description of GC goals.
756 
757    We want an algorithm that keeps the routing cache at some
758    equilibrium point, where the number of aged-off entries stays
759    approximately equal to the number of newly generated ones.
760 
761    The current expiration strength is the variable "expire".
762    We try to adjust it dynamically, so that when the network is idle
763    "expire" is large enough to keep plenty of warm entries,
764    and when load increases it shrinks to limit the cache size.
765  */
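/*
 * A rough worked example (rt_hash_log depends on how much memory the hash
 * table was given at boot): with rt_hash_log = 17 the comfortable size is
 * ip_rt_gc_elasticity << rt_hash_log = 8 * 131072 = 1048576 entries; "goal"
 * below is the excess over that, and "expire" is halved on every pass that
 * misses the goal, then grown again by ip_rt_gc_min_interval once it is met.
 */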
766 
767 static int rt_garbage_collect(void)
768 {
769 	static unsigned long expire = RT_GC_TIMEOUT;
770 	static unsigned long last_gc;
771 	static int rover;
772 	static int equilibrium;
773 	struct rtable *rth, **rthp;
774 	unsigned long now = jiffies;
775 	int goal;
776 
777 	/*
778 	 * Garbage collection is pretty expensive,
779 	 * so do not run it too frequently.
780 	 */
781 
782 	RT_CACHE_STAT_INC(gc_total);
783 
784 	if (now - last_gc < ip_rt_gc_min_interval &&
785 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
786 		RT_CACHE_STAT_INC(gc_ignored);
787 		goto out;
788 	}
789 
790 	/* Calculate the number of entries we want to expire now. */
791 	goal = atomic_read(&ipv4_dst_ops.entries) -
792 		(ip_rt_gc_elasticity << rt_hash_log);
793 	if (goal <= 0) {
794 		if (equilibrium < ipv4_dst_ops.gc_thresh)
795 			equilibrium = ipv4_dst_ops.gc_thresh;
796 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
797 		if (goal > 0) {
798 			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
799 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
800 		}
801 	} else {
802 		/* We are in a dangerous area. Try to reduce the cache really
803 		 * aggressively.
804 		 */
805 		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
806 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
807 	}
808 
809 	if (now - last_gc >= ip_rt_gc_min_interval)
810 		last_gc = now;
811 
812 	if (goal <= 0) {
813 		equilibrium += goal;
814 		goto work_done;
815 	}
816 
817 	do {
818 		int i, k;
819 
820 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
821 			unsigned long tmo = expire;
822 
823 			k = (k + 1) & rt_hash_mask;
824 			rthp = &rt_hash_table[k].chain;
825 			spin_lock_bh(rt_hash_lock_addr(k));
826 			while ((rth = *rthp) != NULL) {
827 				if (!rt_may_expire(rth, tmo, expire)) {
828 					tmo >>= 1;
829 					rthp = &rth->u.rt_next;
830 					continue;
831 				}
832 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
833 				/* remove all related balanced entries
834 				 * if necessary
835 				 */
836 				if (rth->u.dst.flags & DST_BALANCED) {
837 					int r;
838 
839 					rthp = rt_remove_balanced_route(
840 						&rt_hash_table[k].chain,
841 						rth,
842 						&r);
843 					goal -= r;
844 					if (!rthp)
845 						break;
846 				} else {
847 					*rthp = rth->u.rt_next;
848 					rt_free(rth);
849 					goal--;
850 				}
851 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
852 				*rthp = rth->u.rt_next;
853 				rt_free(rth);
854 				goal--;
855 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
856 			}
857 			spin_unlock_bh(rt_hash_lock_addr(k));
858 			if (goal <= 0)
859 				break;
860 		}
861 		rover = k;
862 
863 		if (goal <= 0)
864 			goto work_done;
865 
866 		/* The goal was not achieved. We stop the process if:
867 
868 		   - expire has been reduced to zero (otherwise expire is halved);
869 		   - the table is not full;
870 		   - we are called from interrupt context;
871 		   - the jiffies check is just a fallback/debug loop breaker.
872 		     We will not spin here for a long time in any case.
873 		 */
874 
875 		RT_CACHE_STAT_INC(gc_goal_miss);
876 
877 		if (expire == 0)
878 			break;
879 
880 		expire >>= 1;
881 #if RT_CACHE_DEBUG >= 2
882 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
883 				atomic_read(&ipv4_dst_ops.entries), goal, i);
884 #endif
885 
886 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
887 			goto out;
888 	} while (!in_softirq() && time_before_eq(jiffies, now));
889 
890 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
891 		goto out;
892 	if (net_ratelimit())
893 		printk(KERN_WARNING "dst cache overflow\n");
894 	RT_CACHE_STAT_INC(gc_dst_overflow);
895 	return 1;
896 
897 work_done:
898 	expire += ip_rt_gc_min_interval;
899 	if (expire > ip_rt_gc_timeout ||
900 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
901 		expire = ip_rt_gc_timeout;
902 #if RT_CACHE_DEBUG >= 2
903 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
904 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
905 #endif
906 out:	return 0;
907 }
908 
909 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
910 {
911 	struct rtable	*rth, **rthp;
912 	unsigned long	now;
913 	struct rtable *cand, **candp;
914 	u32 		min_score;
915 	int		chain_length;
916 	int attempts = !in_softirq();
917 
918 restart:
919 	chain_length = 0;
920 	min_score = ~(u32)0;
921 	cand = NULL;
922 	candp = NULL;
923 	now = jiffies;
924 
925 	rthp = &rt_hash_table[hash].chain;
926 
927 	spin_lock_bh(rt_hash_lock_addr(hash));
928 	while ((rth = *rthp) != NULL) {
929 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
930 		if (!(rth->u.dst.flags & DST_BALANCED) &&
931 		    compare_keys(&rth->fl, &rt->fl)) {
932 #else
933 		if (compare_keys(&rth->fl, &rt->fl)) {
934 #endif
935 			/* Put it first */
936 			*rthp = rth->u.rt_next;
937 			/*
938 			 * Since lookup is lockfree, the deletion
939 			 * must be visible to another weakly ordered CPU before
940 			 * the insertion at the start of the hash chain.
941 			 */
942 			rcu_assign_pointer(rth->u.rt_next,
943 					   rt_hash_table[hash].chain);
944 			/*
945 			 * Since lookup is lockfree, the update writes
946 			 * must be ordered for consistency on SMP.
947 			 */
948 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
949 
950 			rth->u.dst.__use++;
951 			dst_hold(&rth->u.dst);
952 			rth->u.dst.lastuse = now;
953 			spin_unlock_bh(rt_hash_lock_addr(hash));
954 
955 			rt_drop(rt);
956 			*rp = rth;
957 			return 0;
958 		}
959 
960 		if (!atomic_read(&rth->u.dst.__refcnt)) {
961 			u32 score = rt_score(rth);
962 
963 			if (score <= min_score) {
964 				cand = rth;
965 				candp = rthp;
966 				min_score = score;
967 			}
968 		}
969 
970 		chain_length++;
971 
972 		rthp = &rth->u.rt_next;
973 	}
974 
975 	if (cand) {
976 		/* ip_rt_gc_elasticity used to be the average chain length;
977 		 * when it is exceeded, gc becomes really aggressive.
978 		 *
979 		 * The second limit is less certain. At the moment it allows
980 		 * only 2 entries per bucket. We will see.
981 		 */
982 		if (chain_length > ip_rt_gc_elasticity) {
983 			*candp = cand->u.rt_next;
984 			rt_free(cand);
985 		}
986 	}
987 
988 	/* Try to bind the route to an ARP neighbour only if it is an
989 	   output route or a unicast forwarding path.
990 	 */
991 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
992 		int err = arp_bind_neighbour(&rt->u.dst);
993 		if (err) {
994 			spin_unlock_bh(rt_hash_lock_addr(hash));
995 
996 			if (err != -ENOBUFS) {
997 				rt_drop(rt);
998 				return err;
999 			}
1000 
1001 			/* Neighbour tables are full and nothing
1002 			   can be released. Try to shrink the route cache,
1003 			   as it most likely holds some neighbour records.
1004 			 */
1005 			if (attempts-- > 0) {
1006 				int saved_elasticity = ip_rt_gc_elasticity;
1007 				int saved_int = ip_rt_gc_min_interval;
1008 				ip_rt_gc_elasticity	= 1;
1009 				ip_rt_gc_min_interval	= 0;
1010 				rt_garbage_collect();
1011 				ip_rt_gc_min_interval	= saved_int;
1012 				ip_rt_gc_elasticity	= saved_elasticity;
1013 				goto restart;
1014 			}
1015 
1016 			if (net_ratelimit())
1017 				printk(KERN_WARNING "Neighbour table overflow.\n");
1018 			rt_drop(rt);
1019 			return -ENOBUFS;
1020 		}
1021 	}
1022 
1023 	rt->u.rt_next = rt_hash_table[hash].chain;
1024 #if RT_CACHE_DEBUG >= 2
1025 	if (rt->u.rt_next) {
1026 		struct rtable *trt;
1027 		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1028 		       NIPQUAD(rt->rt_dst));
1029 		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1030 			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1031 		printk("\n");
1032 	}
1033 #endif
1034 	rt_hash_table[hash].chain = rt;
1035 	spin_unlock_bh(rt_hash_lock_addr(hash));
1036 	*rp = rt;
1037 	return 0;
1038 }
1039 
1040 void rt_bind_peer(struct rtable *rt, int create)
1041 {
1042 	static DEFINE_SPINLOCK(rt_peer_lock);
1043 	struct inet_peer *peer;
1044 
1045 	peer = inet_getpeer(rt->rt_dst, create);
1046 
1047 	spin_lock_bh(&rt_peer_lock);
1048 	if (rt->peer == NULL) {
1049 		rt->peer = peer;
1050 		peer = NULL;
1051 	}
1052 	spin_unlock_bh(&rt_peer_lock);
1053 	if (peer)
1054 		inet_putpeer(peer);
1055 }
1056 
1057 /*
1058  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1059  * we can still generate some output.
1060  * Random ID selection looks a bit dangerous because we have no chance of
1061  * selecting an ID that is unique over a reasonable period of time.
1062  * But a broken packet identifier may be better than no packet at all.
1063  */
1064 static void ip_select_fb_ident(struct iphdr *iph)
1065 {
1066 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1067 	static u32 ip_fallback_id;
1068 	u32 salt;
1069 
1070 	spin_lock_bh(&ip_fb_id_lock);
1071 	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1072 	iph->id = htons(salt & 0xFFFF);
1073 	ip_fallback_id = salt;
1074 	spin_unlock_bh(&ip_fb_id_lock);
1075 }
1076 
1077 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1078 {
1079 	struct rtable *rt = (struct rtable *) dst;
1080 
1081 	if (rt) {
1082 		if (rt->peer == NULL)
1083 			rt_bind_peer(rt, 1);
1084 
1085 		/* If a peer is attached to the destination, it is never detached,
1086 		   so we need not grab a lock to dereference it.
1087 		 */
1088 		if (rt->peer) {
1089 			iph->id = htons(inet_getid(rt->peer, more));
1090 			return;
1091 		}
1092 	} else
1093 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1094 		       __builtin_return_address(0));
1095 
1096 	ip_select_fb_ident(iph);
1097 }
1098 
1099 static void rt_del(unsigned hash, struct rtable *rt)
1100 {
1101 	struct rtable **rthp;
1102 
1103 	spin_lock_bh(rt_hash_lock_addr(hash));
1104 	ip_rt_put(rt);
1105 	for (rthp = &rt_hash_table[hash].chain; *rthp;
1106 	     rthp = &(*rthp)->u.rt_next)
1107 		if (*rthp == rt) {
1108 			*rthp = rt->u.rt_next;
1109 			rt_free(rt);
1110 			break;
1111 		}
1112 	spin_unlock_bh(rt_hash_lock_addr(hash));
1113 }
1114 
1115 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1116 		    u32 saddr, struct net_device *dev)
1117 {
1118 	int i, k;
1119 	struct in_device *in_dev = in_dev_get(dev);
1120 	struct rtable *rth, **rthp;
1121 	u32  skeys[2] = { saddr, 0 };
1122 	int  ikeys[2] = { dev->ifindex, 0 };
1123 
1124 	if (!in_dev)
1125 		return;
1126 
1127 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1128 	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1129 		goto reject_redirect;
1130 
1131 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1132 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1133 			goto reject_redirect;
1134 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1135 			goto reject_redirect;
1136 	} else {
1137 		if (inet_addr_type(new_gw) != RTN_UNICAST)
1138 			goto reject_redirect;
1139 	}
1140 
1141 	for (i = 0; i < 2; i++) {
1142 		for (k = 0; k < 2; k++) {
1143 			unsigned hash = rt_hash_code(daddr,
1144 						     skeys[i] ^ (ikeys[k] << 5));
1145 
1146 			rthp=&rt_hash_table[hash].chain;
1147 
1148 			rcu_read_lock();
1149 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1150 				struct rtable *rt;
1151 
1152 				if (rth->fl.fl4_dst != daddr ||
1153 				    rth->fl.fl4_src != skeys[i] ||
1154 				    rth->fl.oif != ikeys[k] ||
1155 				    rth->fl.iif != 0) {
1156 					rthp = &rth->u.rt_next;
1157 					continue;
1158 				}
1159 
1160 				if (rth->rt_dst != daddr ||
1161 				    rth->rt_src != saddr ||
1162 				    rth->u.dst.error ||
1163 				    rth->rt_gateway != old_gw ||
1164 				    rth->u.dst.dev != dev)
1165 					break;
1166 
1167 				dst_hold(&rth->u.dst);
1168 				rcu_read_unlock();
1169 
1170 				rt = dst_alloc(&ipv4_dst_ops);
1171 				if (rt == NULL) {
1172 					ip_rt_put(rth);
1173 					in_dev_put(in_dev);
1174 					return;
1175 				}
1176 
1177 				/* Copy all the information. */
1178 				*rt = *rth;
1179  				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1180 				rt->u.dst.__use		= 1;
1181 				atomic_set(&rt->u.dst.__refcnt, 1);
1182 				rt->u.dst.child		= NULL;
1183 				if (rt->u.dst.dev)
1184 					dev_hold(rt->u.dst.dev);
1185 				if (rt->idev)
1186 					in_dev_hold(rt->idev);
1187 				rt->u.dst.obsolete	= 0;
1188 				rt->u.dst.lastuse	= jiffies;
1189 				rt->u.dst.path		= &rt->u.dst;
1190 				rt->u.dst.neighbour	= NULL;
1191 				rt->u.dst.hh		= NULL;
1192 				rt->u.dst.xfrm		= NULL;
1193 
1194 				rt->rt_flags		|= RTCF_REDIRECTED;
1195 
1196 				/* Gateway is different ... */
1197 				rt->rt_gateway		= new_gw;
1198 
1199 				/* Redirect received -> path was valid */
1200 				dst_confirm(&rth->u.dst);
1201 
1202 				if (rt->peer)
1203 					atomic_inc(&rt->peer->refcnt);
1204 
1205 				if (arp_bind_neighbour(&rt->u.dst) ||
1206 				    !(rt->u.dst.neighbour->nud_state &
1207 					    NUD_VALID)) {
1208 					if (rt->u.dst.neighbour)
1209 						neigh_event_send(rt->u.dst.neighbour, NULL);
1210 					ip_rt_put(rth);
1211 					rt_drop(rt);
1212 					goto do_next;
1213 				}
1214 
1215 				rt_del(hash, rth);
1216 				if (!rt_intern_hash(hash, rt, &rt))
1217 					ip_rt_put(rt);
1218 				goto do_next;
1219 			}
1220 			rcu_read_unlock();
1221 		do_next:
1222 			;
1223 		}
1224 	}
1225 	in_dev_put(in_dev);
1226 	return;
1227 
1228 reject_redirect:
1229 #ifdef CONFIG_IP_ROUTE_VERBOSE
1230 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1231 		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1232 			"%u.%u.%u.%u ignored.\n"
1233 			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1234 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1235 		       NIPQUAD(saddr), NIPQUAD(daddr));
1236 #endif
1237 	in_dev_put(in_dev);
1238 }
1239 
1240 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1241 {
1242 	struct rtable *rt = (struct rtable*)dst;
1243 	struct dst_entry *ret = dst;
1244 
1245 	if (rt) {
1246 		if (dst->obsolete) {
1247 			ip_rt_put(rt);
1248 			ret = NULL;
1249 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1250 			   rt->u.dst.expires) {
1251 			unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1252 						     rt->fl.fl4_src ^
1253 							(rt->fl.oif << 5));
1254 #if RT_CACHE_DEBUG >= 1
1255 			printk(KERN_DEBUG "ip_rt_advice: redirect to "
1256 					  "%u.%u.%u.%u/%02x dropped\n",
1257 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1258 #endif
1259 			rt_del(hash, rt);
1260 			ret = NULL;
1261 		}
1262 	}
1263 	return ret;
1264 }
1265 
1266 /*
1267  * Algorithm:
1268  *	1. The first ip_rt_redirect_number redirects are sent
1269  *	   with exponential backoff, then we stop sending them at all,
1270  *	   assuming that the host ignores our redirects.
1271  *	2. If we did not see packets requiring redirects
1272  *	   during ip_rt_redirect_silence, we assume that the host
1273  *	   forgot the redirected route and start sending redirects again.
1274  *
1275  * This algorithm is much cheaper and more intelligent than dumb load limiting
1276  * in icmp.c.
1277  *
1278  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1279  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1280  */
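/*
 * With the defaults above: successive redirects to the same host are spaced
 * at least ip_rt_redirect_load << rate_tokens apart (HZ/50 = 20 ms, then
 * 40 ms, 80 ms, ...); after ip_rt_redirect_number (9) of them we give up,
 * and only ip_rt_redirect_silence ((HZ/50) << 10, roughly 20 seconds)
 * without offending packets resets the counter.
 */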
1281 
1282 void ip_rt_send_redirect(struct sk_buff *skb)
1283 {
1284 	struct rtable *rt = (struct rtable*)skb->dst;
1285 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1286 
1287 	if (!in_dev)
1288 		return;
1289 
1290 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1291 		goto out;
1292 
1293 	/* No redirected packets during ip_rt_redirect_silence;
1294 	 * reset the algorithm.
1295 	 */
1296 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1297 		rt->u.dst.rate_tokens = 0;
1298 
1299 	/* Too many ignored redirects; do not send anything.
1300 	 * Set u.dst.rate_last to the last seen redirected packet.
1301 	 */
1302 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1303 		rt->u.dst.rate_last = jiffies;
1304 		goto out;
1305 	}
1306 
1307 	/* Check for load limit; set rate_last to the latest sent
1308 	 * redirect.
1309 	 */
1310 	if (time_after(jiffies,
1311 		       (rt->u.dst.rate_last +
1312 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1313 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1314 		rt->u.dst.rate_last = jiffies;
1315 		++rt->u.dst.rate_tokens;
1316 #ifdef CONFIG_IP_ROUTE_VERBOSE
1317 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1318 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1319 		    net_ratelimit())
1320 			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1321 				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1322 				NIPQUAD(rt->rt_src), rt->rt_iif,
1323 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1324 #endif
1325 	}
1326 out:
1327         in_dev_put(in_dev);
1328 }
1329 
1330 static int ip_error(struct sk_buff *skb)
1331 {
1332 	struct rtable *rt = (struct rtable*)skb->dst;
1333 	unsigned long now;
1334 	int code;
1335 
1336 	switch (rt->u.dst.error) {
1337 		case EINVAL:
1338 		default:
1339 			goto out;
1340 		case EHOSTUNREACH:
1341 			code = ICMP_HOST_UNREACH;
1342 			break;
1343 		case ENETUNREACH:
1344 			code = ICMP_NET_UNREACH;
1345 			break;
1346 		case EACCES:
1347 			code = ICMP_PKT_FILTERED;
1348 			break;
1349 	}
1350 
1351 	now = jiffies;
1352 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1353 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1354 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1355 	rt->u.dst.rate_last = now;
1356 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1357 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1358 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1359 	}
1360 
1361 out:	kfree_skb(skb);
1362 	return 0;
1363 }
1364 
1365 /*
1366  *	The last two values are not from the RFC but
1367  *	are needed for AMPRnet AX.25 paths.
1368  */
1369 
1370 static const unsigned short mtu_plateau[] =
1371 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1372 
1373 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1374 {
1375 	int i;
1376 
1377 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1378 		if (old_mtu > mtu_plateau[i])
1379 			return mtu_plateau[i];
1380 	return 68;
1381 }
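/*
 * guess_mtu() returns the largest plateau strictly below the failing MTU:
 * for example an old_mtu of 1500 yields 1492, 296 yields 216, and anything
 * at or below 128 falls through to the IPv4 minimum of 68.
 */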
1382 
1383 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1384 {
1385 	int i;
1386 	unsigned short old_mtu = ntohs(iph->tot_len);
1387 	struct rtable *rth;
1388 	u32  skeys[2] = { iph->saddr, 0, };
1389 	u32  daddr = iph->daddr;
1390 	unsigned short est_mtu = 0;
1391 
1392 	if (ipv4_config.no_pmtu_disc)
1393 		return 0;
1394 
1395 	for (i = 0; i < 2; i++) {
1396 		unsigned hash = rt_hash_code(daddr, skeys[i]);
1397 
1398 		rcu_read_lock();
1399 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1400 		     rth = rcu_dereference(rth->u.rt_next)) {
1401 			if (rth->fl.fl4_dst == daddr &&
1402 			    rth->fl.fl4_src == skeys[i] &&
1403 			    rth->rt_dst  == daddr &&
1404 			    rth->rt_src  == iph->saddr &&
1405 			    rth->fl.iif == 0 &&
1406 			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1407 				unsigned short mtu = new_mtu;
1408 
1409 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1410 
1411 					/* BSD 4.2 compatibility hack :-( */
1412 					if (mtu == 0 &&
1413 					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1414 					    old_mtu >= 68 + (iph->ihl << 2))
1415 						old_mtu -= iph->ihl << 2;
1416 
1417 					mtu = guess_mtu(old_mtu);
1418 				}
1419 				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1420 					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1421 						dst_confirm(&rth->u.dst);
1422 						if (mtu < ip_rt_min_pmtu) {
1423 							mtu = ip_rt_min_pmtu;
1424 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1425 								(1 << RTAX_MTU);
1426 						}
1427 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1428 						dst_set_expires(&rth->u.dst,
1429 							ip_rt_mtu_expires);
1430 					}
1431 					est_mtu = mtu;
1432 				}
1433 			}
1434 		}
1435 		rcu_read_unlock();
1436 	}
1437 	return est_mtu ? : new_mtu;
1438 }
1439 
1440 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1441 {
1442 	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1443 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1444 		if (mtu < ip_rt_min_pmtu) {
1445 			mtu = ip_rt_min_pmtu;
1446 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1447 		}
1448 		dst->metrics[RTAX_MTU-1] = mtu;
1449 		dst_set_expires(dst, ip_rt_mtu_expires);
1450 	}
1451 }
1452 
1453 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1454 {
1455 	return NULL;
1456 }
1457 
1458 static void ipv4_dst_destroy(struct dst_entry *dst)
1459 {
1460 	struct rtable *rt = (struct rtable *) dst;
1461 	struct inet_peer *peer = rt->peer;
1462 	struct in_device *idev = rt->idev;
1463 
1464 	if (peer) {
1465 		rt->peer = NULL;
1466 		inet_putpeer(peer);
1467 	}
1468 
1469 	if (idev) {
1470 		rt->idev = NULL;
1471 		in_dev_put(idev);
1472 	}
1473 }
1474 
1475 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1476 			    int how)
1477 {
1478 	struct rtable *rt = (struct rtable *) dst;
1479 	struct in_device *idev = rt->idev;
1480 	if (dev != &loopback_dev && idev && idev->dev == dev) {
1481 		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1482 		if (loopback_idev) {
1483 			rt->idev = loopback_idev;
1484 			in_dev_put(idev);
1485 		}
1486 	}
1487 }
1488 
1489 static void ipv4_link_failure(struct sk_buff *skb)
1490 {
1491 	struct rtable *rt;
1492 
1493 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1494 
1495 	rt = (struct rtable *) skb->dst;
1496 	if (rt)
1497 		dst_set_expires(&rt->u.dst, 0);
1498 }
1499 
1500 static int ip_rt_bug(struct sk_buff *skb)
1501 {
1502 	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1503 		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1504 		skb->dev ? skb->dev->name : "?");
1505 	kfree_skb(skb);
1506 	return 0;
1507 }
1508 
1509 /*
1510    We do not cache the source address of the outgoing interface,
1511    because it is used only by the IP RR, TS and SRR options,
1512    so it is out of the fast path.
1513 
1514    BTW remember: "addr" may be unaligned
1515    in IP options!
1516  */
1517 
1518 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1519 {
1520 	u32 src;
1521 	struct fib_result res;
1522 
1523 	if (rt->fl.iif == 0)
1524 		src = rt->rt_src;
1525 	else if (fib_lookup(&rt->fl, &res) == 0) {
1526 		src = FIB_RES_PREFSRC(res);
1527 		fib_res_put(&res);
1528 	} else
1529 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1530 					RT_SCOPE_UNIVERSE);
1531 	memcpy(addr, &src, 4);
1532 }
1533 
1534 #ifdef CONFIG_NET_CLS_ROUTE
1535 static void set_class_tag(struct rtable *rt, u32 tag)
1536 {
1537 	if (!(rt->u.dst.tclassid & 0xFFFF))
1538 		rt->u.dst.tclassid |= tag & 0xFFFF;
1539 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1540 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1541 }
1542 #endif
1543 
1544 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1545 {
1546 	struct fib_info *fi = res->fi;
1547 
1548 	if (fi) {
1549 		if (FIB_RES_GW(*res) &&
1550 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1551 			rt->rt_gateway = FIB_RES_GW(*res);
1552 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1553 		       sizeof(rt->u.dst.metrics));
1554 		if (fi->fib_mtu == 0) {
1555 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1556 			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1557 			    rt->rt_gateway != rt->rt_dst &&
1558 			    rt->u.dst.dev->mtu > 576)
1559 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1560 		}
1561 #ifdef CONFIG_NET_CLS_ROUTE
1562 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1563 #endif
1564 	} else
1565 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1566 
1567 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1568 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1569 	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1570 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1571 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1572 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1573 				       ip_rt_min_advmss);
1574 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1575 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1576 
1577 #ifdef CONFIG_NET_CLS_ROUTE
1578 #ifdef CONFIG_IP_MULTIPLE_TABLES
1579 	set_class_tag(rt, fib_rules_tclass(res));
1580 #endif
1581 	set_class_tag(rt, itag);
1582 #endif
1583         rt->rt_type = res->type;
1584 }
1585 
1586 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1587 				u8 tos, struct net_device *dev, int our)
1588 {
1589 	unsigned hash;
1590 	struct rtable *rth;
1591 	u32 spec_dst;
1592 	struct in_device *in_dev = in_dev_get(dev);
1593 	u32 itag = 0;
1594 
1595 	/* Primary sanity checks. */
1596 
1597 	if (in_dev == NULL)
1598 		return -EINVAL;
1599 
1600 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1601 	    skb->protocol != htons(ETH_P_IP))
1602 		goto e_inval;
1603 
1604 	if (ZERONET(saddr)) {
1605 		if (!LOCAL_MCAST(daddr))
1606 			goto e_inval;
1607 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1608 	} else if (fib_validate_source(saddr, 0, tos, 0,
1609 					dev, &spec_dst, &itag) < 0)
1610 		goto e_inval;
1611 
1612 	rth = dst_alloc(&ipv4_dst_ops);
1613 	if (!rth)
1614 		goto e_nobufs;
1615 
1616 	rth->u.dst.output= ip_rt_bug;
1617 
1618 	atomic_set(&rth->u.dst.__refcnt, 1);
1619 	rth->u.dst.flags= DST_HOST;
1620 	if (in_dev->cnf.no_policy)
1621 		rth->u.dst.flags |= DST_NOPOLICY;
1622 	rth->fl.fl4_dst	= daddr;
1623 	rth->rt_dst	= daddr;
1624 	rth->fl.fl4_tos	= tos;
1625 #ifdef CONFIG_IP_ROUTE_FWMARK
1626 	rth->fl.fl4_fwmark= skb->nfmark;
1627 #endif
1628 	rth->fl.fl4_src	= saddr;
1629 	rth->rt_src	= saddr;
1630 #ifdef CONFIG_NET_CLS_ROUTE
1631 	rth->u.dst.tclassid = itag;
1632 #endif
1633 	rth->rt_iif	=
1634 	rth->fl.iif	= dev->ifindex;
1635 	rth->u.dst.dev	= &loopback_dev;
1636 	dev_hold(rth->u.dst.dev);
1637 	rth->idev	= in_dev_get(rth->u.dst.dev);
1638 	rth->fl.oif	= 0;
1639 	rth->rt_gateway	= daddr;
1640 	rth->rt_spec_dst= spec_dst;
1641 	rth->rt_type	= RTN_MULTICAST;
1642 	rth->rt_flags	= RTCF_MULTICAST;
1643 	if (our) {
1644 		rth->u.dst.input= ip_local_deliver;
1645 		rth->rt_flags |= RTCF_LOCAL;
1646 	}
1647 
1648 #ifdef CONFIG_IP_MROUTE
1649 	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1650 		rth->u.dst.input = ip_mr_input;
1651 #endif
1652 	RT_CACHE_STAT_INC(in_slow_mc);
1653 
1654 	in_dev_put(in_dev);
1655 	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5));
1656 	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1657 
1658 e_nobufs:
1659 	in_dev_put(in_dev);
1660 	return -ENOBUFS;
1661 
1662 e_inval:
1663 	in_dev_put(in_dev);
1664 	return -EINVAL;
1665 }
1666 
1667 
1668 static void ip_handle_martian_source(struct net_device *dev,
1669 				     struct in_device *in_dev,
1670 				     struct sk_buff *skb,
1671 				     u32 daddr,
1672 				     u32 saddr)
1673 {
1674 	RT_CACHE_STAT_INC(in_martian_src);
1675 #ifdef CONFIG_IP_ROUTE_VERBOSE
1676 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1677 		/*
1678 		 *	RFC 1812 recommendation: if the source is martian,
1679 		 *	the only hint is the MAC header.
1680 		 */
1681 		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1682 			"%u.%u.%u.%u, on dev %s\n",
1683 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1684 		if (dev->hard_header_len && skb->mac.raw) {
1685 			int i;
1686 			unsigned char *p = skb->mac.raw;
1687 			printk(KERN_WARNING "ll header: ");
1688 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1689 				printk("%02x", *p);
1690 				if (i < (dev->hard_header_len - 1))
1691 					printk(":");
1692 			}
1693 			printk("\n");
1694 		}
1695 	}
1696 #endif
1697 }
1698 
1699 static inline int __mkroute_input(struct sk_buff *skb,
1700 				  struct fib_result* res,
1701 				  struct in_device *in_dev,
1702 				  u32 daddr, u32 saddr, u32 tos,
1703 				  struct rtable **result)
1704 {
1705 
1706 	struct rtable *rth;
1707 	int err;
1708 	struct in_device *out_dev;
1709 	unsigned flags = 0;
1710 	u32 spec_dst, itag;
1711 
1712 	/* get a working reference to the output device */
1713 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1714 	if (out_dev == NULL) {
1715 		if (net_ratelimit())
1716 			printk(KERN_CRIT "Bug in ip_route_input" \
1717 			       "_slow(). Please, report\n");
1718 		return -EINVAL;
1719 	}
1720 
1721 
1722 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1723 				  in_dev->dev, &spec_dst, &itag);
1724 	if (err < 0) {
1725 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1726 					 saddr);
1727 
1728 		err = -EINVAL;
1729 		goto cleanup;
1730 	}
1731 
1732 	if (err)
1733 		flags |= RTCF_DIRECTSRC;
1734 
1735 	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1736 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1737 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1738 		flags |= RTCF_DOREDIRECT;
1739 
1740 	if (skb->protocol != htons(ETH_P_IP)) {
1741 		/* Not IP (i.e. ARP). Do not create a route if it is
1742 		 * invalid for proxy ARP. DNAT routes are always valid.
1743 		 */
1744 		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1745 			err = -EINVAL;
1746 			goto cleanup;
1747 		}
1748 	}
1749 
1750 
1751 	rth = dst_alloc(&ipv4_dst_ops);
1752 	if (!rth) {
1753 		err = -ENOBUFS;
1754 		goto cleanup;
1755 	}
1756 
1757 	atomic_set(&rth->u.dst.__refcnt, 1);
1758 	rth->u.dst.flags= DST_HOST;
1759 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1760 	if (res->fi->fib_nhs > 1)
1761 		rth->u.dst.flags |= DST_BALANCED;
1762 #endif
1763 	if (in_dev->cnf.no_policy)
1764 		rth->u.dst.flags |= DST_NOPOLICY;
1765 	if (in_dev->cnf.no_xfrm)
1766 		rth->u.dst.flags |= DST_NOXFRM;
1767 	rth->fl.fl4_dst	= daddr;
1768 	rth->rt_dst	= daddr;
1769 	rth->fl.fl4_tos	= tos;
1770 #ifdef CONFIG_IP_ROUTE_FWMARK
1771 	rth->fl.fl4_fwmark= skb->nfmark;
1772 #endif
1773 	rth->fl.fl4_src	= saddr;
1774 	rth->rt_src	= saddr;
1775 	rth->rt_gateway	= daddr;
1776 	rth->rt_iif 	=
1777 		rth->fl.iif	= in_dev->dev->ifindex;
1778 	rth->u.dst.dev	= (out_dev)->dev;
1779 	dev_hold(rth->u.dst.dev);
1780 	rth->idev	= in_dev_get(rth->u.dst.dev);
1781 	rth->fl.oif 	= 0;
1782 	rth->rt_spec_dst= spec_dst;
1783 
1784 	rth->u.dst.input = ip_forward;
1785 	rth->u.dst.output = ip_output;
1786 
1787 	rt_set_nexthop(rth, res, itag);
1788 
1789 	rth->rt_flags = flags;
1790 
1791 	*result = rth;
1792 	err = 0;
1793  cleanup:
1794 	/* release the working reference to the output device */
1795 	in_dev_put(out_dev);
1796 	return err;
1797 }
1798 
1799 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1800 				       struct fib_result* res,
1801 				       const struct flowi *fl,
1802 				       struct in_device *in_dev,
1803 				       u32 daddr, u32 saddr, u32 tos)
1804 {
1805 	struct rtable* rth = NULL;
1806 	int err;
1807 	unsigned hash;
1808 
1809 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1810 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1811 		fib_select_multipath(fl, res);
1812 #endif
1813 
1814 	/* create a routing cache entry */
1815 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1816 	if (err)
1817 		return err;
1818 
1819 	/* put it into the cache */
1820 	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
1821 	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1822 }
1823 
1824 static inline int ip_mkroute_input(struct sk_buff *skb,
1825 				   struct fib_result* res,
1826 				   const struct flowi *fl,
1827 				   struct in_device *in_dev,
1828 				   u32 daddr, u32 saddr, u32 tos)
1829 {
1830 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1831 	struct rtable* rth = NULL, *rtres;
1832 	unsigned char hop, hopcount;
1833 	int err = -EINVAL;
1834 	unsigned int hash;
1835 
1836 	if (res->fi)
1837 		hopcount = res->fi->fib_nhs;
1838 	else
1839 		hopcount = 1;
1840 
1841 	/* distinguish between multipath and singlepath */
1842 	if (hopcount < 2)
1843 		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1844 					    saddr, tos);
1845 
1846 	/* add all alternatives to the routing cache */
1847 	for (hop = 0; hop < hopcount; hop++) {
1848 		res->nh_sel = hop;
1849 
1850 		/* put reference to previous result */
1851 		if (hop)
1852 			ip_rt_put(rtres);
1853 
1854 		/* create a routing cache entry */
1855 		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1856 				      &rth);
1857 		if (err)
1858 			return err;
1859 
1860 		/* put it into the cache */
1861 		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
1862 		err = rt_intern_hash(hash, rth, &rtres);
1863 		if (err)
1864 			return err;
1865 
1866 		/* forward hop information to multipath impl. */
1867 		multipath_set_nhinfo(rth,
1868 				     FIB_RES_NETWORK(*res),
1869 				     FIB_RES_NETMASK(*res),
1870 				     res->prefixlen,
1871 				     &FIB_RES_NH(*res));
1872 	}
1873 	skb->dst = &rtres->u.dst;
1874 	return err;
1875 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1876 	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1877 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1878 }
1879 
1880 
1881 /*
1882  *	NOTE. We drop all packets that have a local source
1883  *	address, because every properly looped-back packet must already
1884  *	have the correct destination attached by the output routine.
1885  *
1886  *	This approach solves two big problems:
1887  *	1. Non-simplex devices are handled properly.
1888  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1889  */
1890 
1891 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1892 			       u8 tos, struct net_device *dev)
1893 {
1894 	struct fib_result res;
1895 	struct in_device *in_dev = in_dev_get(dev);
1896 	struct flowi fl = { .nl_u = { .ip4_u =
1897 				      { .daddr = daddr,
1898 					.saddr = saddr,
1899 					.tos = tos,
1900 					.scope = RT_SCOPE_UNIVERSE,
1901 #ifdef CONFIG_IP_ROUTE_FWMARK
1902 					.fwmark = skb->nfmark
1903 #endif
1904 				      } },
1905 			    .iif = dev->ifindex };
1906 	unsigned	flags = 0;
1907 	u32		itag = 0;
1908 	struct rtable * rth;
1909 	unsigned	hash;
1910 	u32		spec_dst;
1911 	int		err = -EINVAL;
1912 	int		free_res = 0;
1913 
1914 	/* IP on this device is disabled. */
1915 
1916 	if (!in_dev)
1917 		goto out;
1918 
1919 	/* Check for the weirdest martians, which may not be detected
1920 	   by fib_lookup.
1921 	 */
1922 
1923 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1924 		goto martian_source;
1925 
1926 	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1927 		goto brd_input;
1928 
1929 	/* Accept zero addresses only for limited broadcast;
1930 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1931 	 */
1932 	if (ZERONET(saddr))
1933 		goto martian_source;
1934 
1935 	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1936 		goto martian_destination;
1937 
1938 	/*
1939 	 *	Now we are ready to route packet.
1940 	 */
1941 	if ((err = fib_lookup(&fl, &res)) != 0) {
1942 		if (!IN_DEV_FORWARD(in_dev))
1943 			goto e_hostunreach;
1944 		goto no_route;
1945 	}
1946 	free_res = 1;
1947 
1948 	RT_CACHE_STAT_INC(in_slow_tot);
1949 
1950 	if (res.type == RTN_BROADCAST)
1951 		goto brd_input;
1952 
1953 	if (res.type == RTN_LOCAL) {
1954 		int result;
1955 		result = fib_validate_source(saddr, daddr, tos,
1956 					     loopback_dev.ifindex,
1957 					     dev, &spec_dst, &itag);
1958 		if (result < 0)
1959 			goto martian_source;
1960 		if (result)
1961 			flags |= RTCF_DIRECTSRC;
1962 		spec_dst = daddr;
1963 		goto local_input;
1964 	}
1965 
1966 	if (!IN_DEV_FORWARD(in_dev))
1967 		goto e_hostunreach;
1968 	if (res.type != RTN_UNICAST)
1969 		goto martian_destination;
1970 
1971 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1972 	if (err == -ENOBUFS)
1973 		goto e_nobufs;
1974 	if (err == -EINVAL)
1975 		goto e_inval;
1976 
1977 done:
1978 	in_dev_put(in_dev);
1979 	if (free_res)
1980 		fib_res_put(&res);
1981 out:	return err;
1982 
1983 brd_input:
1984 	if (skb->protocol != htons(ETH_P_IP))
1985 		goto e_inval;
1986 
1987 	if (ZERONET(saddr))
1988 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1989 	else {
1990 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1991 					  &itag);
1992 		if (err < 0)
1993 			goto martian_source;
1994 		if (err)
1995 			flags |= RTCF_DIRECTSRC;
1996 	}
1997 	flags |= RTCF_BROADCAST;
1998 	res.type = RTN_BROADCAST;
1999 	RT_CACHE_STAT_INC(in_brd);
2000 
2001 local_input:
2002 	rth = dst_alloc(&ipv4_dst_ops);
2003 	if (!rth)
2004 		goto e_nobufs;
2005 
2006 	rth->u.dst.output= ip_rt_bug;
2007 
2008 	atomic_set(&rth->u.dst.__refcnt, 1);
2009 	rth->u.dst.flags= DST_HOST;
2010 	if (in_dev->cnf.no_policy)
2011 		rth->u.dst.flags |= DST_NOPOLICY;
2012 	rth->fl.fl4_dst	= daddr;
2013 	rth->rt_dst	= daddr;
2014 	rth->fl.fl4_tos	= tos;
2015 #ifdef CONFIG_IP_ROUTE_FWMARK
2016 	rth->fl.fl4_fwmark= skb->nfmark;
2017 #endif
2018 	rth->fl.fl4_src	= saddr;
2019 	rth->rt_src	= saddr;
2020 #ifdef CONFIG_NET_CLS_ROUTE
2021 	rth->u.dst.tclassid = itag;
2022 #endif
2023 	rth->rt_iif	=
2024 	rth->fl.iif	= dev->ifindex;
2025 	rth->u.dst.dev	= &loopback_dev;
2026 	dev_hold(rth->u.dst.dev);
2027 	rth->idev	= in_dev_get(rth->u.dst.dev);
2028 	rth->rt_gateway	= daddr;
2029 	rth->rt_spec_dst= spec_dst;
2030 	rth->u.dst.input= ip_local_deliver;
2031 	rth->rt_flags 	= flags|RTCF_LOCAL;
2032 	if (res.type == RTN_UNREACHABLE) {
2033 		rth->u.dst.input= ip_error;
2034 		rth->u.dst.error= -err;
2035 		rth->rt_flags 	&= ~RTCF_LOCAL;
2036 	}
2037 	rth->rt_type	= res.type;
2038 	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5));
2039 	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2040 	goto done;
2041 
2042 no_route:
2043 	RT_CACHE_STAT_INC(in_no_route);
2044 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2045 	res.type = RTN_UNREACHABLE;
2046 	goto local_input;
2047 
2048 	/*
2049 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2050 	 */
2051 martian_destination:
2052 	RT_CACHE_STAT_INC(in_martian_dst);
2053 #ifdef CONFIG_IP_ROUTE_VERBOSE
2054 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2055 		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2056 			"%u.%u.%u.%u, dev %s\n",
2057 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2058 #endif
2059 
2060 e_hostunreach:
2061 	err = -EHOSTUNREACH;
2062 	goto done;
2063 
2064 e_inval:
2065 	err = -EINVAL;
2066 	goto done;
2067 
2068 e_nobufs:
2069 	err = -ENOBUFS;
2070 	goto done;
2071 
2072 martian_source:
2073 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2074 	goto e_inval;
2075 }
2076 
2077 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2078 		   u8 tos, struct net_device *dev)
2079 {
2080 	struct rtable * rth;
2081 	unsigned	hash;
2082 	int iif = dev->ifindex;
2083 
2084 	tos &= IPTOS_RT_MASK;
2085 	hash = rt_hash_code(daddr, saddr ^ (iif << 5));
2086 
2087 	rcu_read_lock();
2088 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2089 	     rth = rcu_dereference(rth->u.rt_next)) {
2090 		if (rth->fl.fl4_dst == daddr &&
2091 		    rth->fl.fl4_src == saddr &&
2092 		    rth->fl.iif == iif &&
2093 		    rth->fl.oif == 0 &&
2094 #ifdef CONFIG_IP_ROUTE_FWMARK
2095 		    rth->fl.fl4_fwmark == skb->nfmark &&
2096 #endif
2097 		    rth->fl.fl4_tos == tos) {
2098 			rth->u.dst.lastuse = jiffies;
2099 			dst_hold(&rth->u.dst);
2100 			rth->u.dst.__use++;
2101 			RT_CACHE_STAT_INC(in_hit);
2102 			rcu_read_unlock();
2103 			skb->dst = (struct dst_entry*)rth;
2104 			return 0;
2105 		}
2106 		RT_CACHE_STAT_INC(in_hlist_search);
2107 	}
2108 	rcu_read_unlock();
2109 
2110 	/* Multicast recognition logic is moved from the route cache to here.
2111 	   The problem was that too many Ethernet cards have broken/missing
2112 	   hardware multicast filters :-( As a result, a host on a multicast
2113 	   network acquires a lot of useless route cache entries, e.g. for
2114 	   SDR messages from all over the world. Now we try to get rid of them.
2115 	   Really, provided the software IP multicast filter is organized
2116 	   reasonably (at least, hashed), it does not cause a slowdown
2117 	   compared with route cache reject entries.
2118 	   Note that multicast routers are not affected, because a
2119 	   route cache entry is created eventually.
2120 	 */
2121 	if (MULTICAST(daddr)) {
2122 		struct in_device *in_dev;
2123 
2124 		rcu_read_lock();
2125 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2126 			int our = ip_check_mc(in_dev, daddr, saddr,
2127 				skb->nh.iph->protocol);
2128 			if (our
2129 #ifdef CONFIG_IP_MROUTE
2130 			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2131 #endif
2132 			    ) {
2133 				rcu_read_unlock();
2134 				return ip_route_input_mc(skb, daddr, saddr,
2135 							 tos, dev, our);
2136 			}
2137 		}
2138 		rcu_read_unlock();
2139 		return -EINVAL;
2140 	}
2141 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2142 }
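/*
 * Illustrative sketch (not part of the original route.c): roughly how a
 * receive-path caller such as ip_rcv_finish() is expected to use
 * ip_route_input().  The helper name and the exact error handling are
 * hypothetical; only the ip_route_input()/dst_input() calls mirror the
 * real input path of this kernel generation.
 */
#if 0	/* example only, never compiled */
static int example_route_incoming_skb(struct sk_buff *skb)
{
	struct iphdr *iph = skb->nh.iph;

	/* Attach a route (cache hit above, or ip_route_input_slow) to the skb. */
	if (skb->dst == NULL &&
	    ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev))
		return NET_RX_DROP;

	/* Dispatch via rth->u.dst.input: ip_local_deliver or ip_forward. */
	return dst_input(skb);
}
#endif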
2143 
2144 static inline int __mkroute_output(struct rtable **result,
2145 				   struct fib_result* res,
2146 				   const struct flowi *fl,
2147 				   const struct flowi *oldflp,
2148 				   struct net_device *dev_out,
2149 				   unsigned flags)
2150 {
2151 	struct rtable *rth;
2152 	struct in_device *in_dev;
2153 	u32 tos = RT_FL_TOS(oldflp);
2154 	int err = 0;
2155 
2156 	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2157 		return -EINVAL;
2158 
2159 	if (fl->fl4_dst == 0xFFFFFFFF)
2160 		res->type = RTN_BROADCAST;
2161 	else if (MULTICAST(fl->fl4_dst))
2162 		res->type = RTN_MULTICAST;
2163 	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2164 		return -EINVAL;
2165 
2166 	if (dev_out->flags & IFF_LOOPBACK)
2167 		flags |= RTCF_LOCAL;
2168 
2169 	/* get work reference to inet device */
2170 	in_dev = in_dev_get(dev_out);
2171 	if (!in_dev)
2172 		return -EINVAL;
2173 
2174 	if (res->type == RTN_BROADCAST) {
2175 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2176 		if (res->fi) {
2177 			fib_info_put(res->fi);
2178 			res->fi = NULL;
2179 		}
2180 	} else if (res->type == RTN_MULTICAST) {
2181 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2182 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2183 				 oldflp->proto))
2184 			flags &= ~RTCF_LOCAL;
2185 		/* If a multicast route does not exist, use the
2186 		   default one, but do not gateway in this case.
2187 		   Yes, it is a hack.
2188 		 */
2189 		if (res->fi && res->prefixlen < 4) {
2190 			fib_info_put(res->fi);
2191 			res->fi = NULL;
2192 		}
2193 	}
2194 
2195 
2196 	rth = dst_alloc(&ipv4_dst_ops);
2197 	if (!rth) {
2198 		err = -ENOBUFS;
2199 		goto cleanup;
2200 	}
2201 
2202 	atomic_set(&rth->u.dst.__refcnt, 1);
2203 	rth->u.dst.flags= DST_HOST;
2204 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2205 	if (res->fi) {
2206 		rth->rt_multipath_alg = res->fi->fib_mp_alg;
2207 		if (res->fi->fib_nhs > 1)
2208 			rth->u.dst.flags |= DST_BALANCED;
2209 	}
2210 #endif
2211 	if (in_dev->cnf.no_xfrm)
2212 		rth->u.dst.flags |= DST_NOXFRM;
2213 	if (in_dev->cnf.no_policy)
2214 		rth->u.dst.flags |= DST_NOPOLICY;
2215 
2216 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2217 	rth->fl.fl4_tos	= tos;
2218 	rth->fl.fl4_src	= oldflp->fl4_src;
2219 	rth->fl.oif	= oldflp->oif;
2220 #ifdef CONFIG_IP_ROUTE_FWMARK
2221 	rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2222 #endif
2223 	rth->rt_dst	= fl->fl4_dst;
2224 	rth->rt_src	= fl->fl4_src;
2225 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2226 	/* get references to the devices that are to be held by the routing
2227 	   cache entry */
2228 	rth->u.dst.dev	= dev_out;
2229 	dev_hold(dev_out);
2230 	rth->idev	= in_dev_get(dev_out);
2231 	rth->rt_gateway = fl->fl4_dst;
2232 	rth->rt_spec_dst= fl->fl4_src;
2233 
2234 	rth->u.dst.output=ip_output;
2235 
2236 	RT_CACHE_STAT_INC(out_slow_tot);
2237 
2238 	if (flags & RTCF_LOCAL) {
2239 		rth->u.dst.input = ip_local_deliver;
2240 		rth->rt_spec_dst = fl->fl4_dst;
2241 	}
2242 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2243 		rth->rt_spec_dst = fl->fl4_src;
2244 		if (flags & RTCF_LOCAL &&
2245 		    !(dev_out->flags & IFF_LOOPBACK)) {
2246 			rth->u.dst.output = ip_mc_output;
2247 			RT_CACHE_STAT_INC(out_slow_mc);
2248 		}
2249 #ifdef CONFIG_IP_MROUTE
2250 		if (res->type == RTN_MULTICAST) {
2251 			if (IN_DEV_MFORWARD(in_dev) &&
2252 			    !LOCAL_MCAST(oldflp->fl4_dst)) {
2253 				rth->u.dst.input = ip_mr_input;
2254 				rth->u.dst.output = ip_mc_output;
2255 			}
2256 		}
2257 #endif
2258 	}
2259 
2260 	rt_set_nexthop(rth, res, 0);
2261 
2262 	rth->rt_flags = flags;
2263 
2264 	*result = rth;
2265  cleanup:
2266 	/* release work reference to inet device */
2267 	in_dev_put(in_dev);
2268 
2269 	return err;
2270 }
2271 
2272 static inline int ip_mkroute_output_def(struct rtable **rp,
2273 					struct fib_result* res,
2274 					const struct flowi *fl,
2275 					const struct flowi *oldflp,
2276 					struct net_device *dev_out,
2277 					unsigned flags)
2278 {
2279 	struct rtable *rth = NULL;
2280 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2281 	unsigned hash;
2282 	if (err == 0) {
2283 		hash = rt_hash_code(oldflp->fl4_dst,
2284 				    oldflp->fl4_src ^ (oldflp->oif << 5));
2285 		err = rt_intern_hash(hash, rth, rp);
2286 	}
2287 
2288 	return err;
2289 }
2290 
2291 static inline int ip_mkroute_output(struct rtable** rp,
2292 				    struct fib_result* res,
2293 				    const struct flowi *fl,
2294 				    const struct flowi *oldflp,
2295 				    struct net_device *dev_out,
2296 				    unsigned flags)
2297 {
2298 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2299 	unsigned char hop;
2300 	unsigned hash;
2301 	int err = -EINVAL;
2302 	struct rtable *rth = NULL;
2303 
2304 	if (res->fi && res->fi->fib_nhs > 1) {
2305 		unsigned char hopcount = res->fi->fib_nhs;
2306 
2307 		for (hop = 0; hop < hopcount; hop++) {
2308 			struct net_device *dev2nexthop;
2309 
2310 			res->nh_sel = hop;
2311 
2312 			/* hold a work reference to the output device */
2313 			dev2nexthop = FIB_RES_DEV(*res);
2314 			dev_hold(dev2nexthop);
2315 
2316 			/* put reference to previous result */
2317 			if (hop)
2318 				ip_rt_put(*rp);
2319 
2320 			err = __mkroute_output(&rth, res, fl, oldflp,
2321 					       dev2nexthop, flags);
2322 
2323 			if (err != 0)
2324 				goto cleanup;
2325 
2326 			hash = rt_hash_code(oldflp->fl4_dst,
2327 					    oldflp->fl4_src ^
2328 					    (oldflp->oif << 5));
2329 			err = rt_intern_hash(hash, rth, rp);
2330 
2331 			/* forward hop information to multipath impl. */
2332 			multipath_set_nhinfo(rth,
2333 					     FIB_RES_NETWORK(*res),
2334 					     FIB_RES_NETMASK(*res),
2335 					     res->prefixlen,
2336 					     &FIB_RES_NH(*res));
2337 		cleanup:
2338 			/* release work reference to output device */
2339 			dev_put(dev2nexthop);
2340 
2341 			if (err != 0)
2342 				return err;
2343 		}
2344 		return err;
2345 	} else {
2346 		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2347 					     flags);
2348 	}
2349 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2350 	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2351 #endif
2352 }
2353 
2354 /*
2355  * Major route resolver routine.
2356  */
2357 
2358 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2359 {
2360 	u32 tos	= RT_FL_TOS(oldflp);
2361 	struct flowi fl = { .nl_u = { .ip4_u =
2362 				      { .daddr = oldflp->fl4_dst,
2363 					.saddr = oldflp->fl4_src,
2364 					.tos = tos & IPTOS_RT_MASK,
2365 					.scope = ((tos & RTO_ONLINK) ?
2366 						  RT_SCOPE_LINK :
2367 						  RT_SCOPE_UNIVERSE),
2368 #ifdef CONFIG_IP_ROUTE_FWMARK
2369 					.fwmark = oldflp->fl4_fwmark
2370 #endif
2371 				      } },
2372 			    .iif = loopback_dev.ifindex,
2373 			    .oif = oldflp->oif };
2374 	struct fib_result res;
2375 	unsigned flags = 0;
2376 	struct net_device *dev_out = NULL;
2377 	int free_res = 0;
2378 	int err;
2379 
2380 
2381 	res.fi		= NULL;
2382 #ifdef CONFIG_IP_MULTIPLE_TABLES
2383 	res.r		= NULL;
2384 #endif
2385 
2386 	if (oldflp->fl4_src) {
2387 		err = -EINVAL;
2388 		if (MULTICAST(oldflp->fl4_src) ||
2389 		    BADCLASS(oldflp->fl4_src) ||
2390 		    ZERONET(oldflp->fl4_src))
2391 			goto out;
2392 
2393 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2394 		dev_out = ip_dev_find(oldflp->fl4_src);
2395 		if (dev_out == NULL)
2396 			goto out;
2397 
2398 		/* I removed the check for oif == dev_out->oif here.
2399 		   It was wrong for two reasons:
2400 		   1. ip_dev_find(saddr) can return the wrong iface if saddr is
2401 		      assigned to multiple interfaces.
2402 		   2. Moreover, we are allowed to send packets with a saddr
2403 		      of another iface. --ANK
2404 		 */
2405 
2406 		if (oldflp->oif == 0
2407 		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2408 			/* Special hack: the user can direct multicasts
2409 			   and limited broadcast via the necessary interface
2410 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2411 			   This hack is not just for fun, it allows
2412 			   vic, vat and friends to work.
2413 			   They bind a socket to loopback, set ttl to zero
2414 			   and expect that it will work.
2415 			   From the viewpoint of the routing cache they are broken,
2416 			   because we are not allowed to build a multicast path
2417 			   with a loopback source addr (the routing cache
2418 			   cannot know that ttl is zero, so the packet
2419 			   will not leave this host and the route is valid).
2420 			   Luckily, this hack is a good workaround.
2421 			 */
2422 
2423 			fl.oif = dev_out->ifindex;
2424 			goto make_route;
2425 		}
2426 		if (dev_out)
2427 			dev_put(dev_out);
2428 		dev_out = NULL;
2429 	}
2430 
2431 
2432 	if (oldflp->oif) {
2433 		dev_out = dev_get_by_index(oldflp->oif);
2434 		err = -ENODEV;
2435 		if (dev_out == NULL)
2436 			goto out;
2437 
2438 		/* RACE: Check return value of inet_select_addr instead. */
2439 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2440 			dev_put(dev_out);
2441 			goto out;	/* Wrong error code */
2442 		}
2443 
2444 		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2445 			if (!fl.fl4_src)
2446 				fl.fl4_src = inet_select_addr(dev_out, 0,
2447 							      RT_SCOPE_LINK);
2448 			goto make_route;
2449 		}
2450 		if (!fl.fl4_src) {
2451 			if (MULTICAST(oldflp->fl4_dst))
2452 				fl.fl4_src = inet_select_addr(dev_out, 0,
2453 							      fl.fl4_scope);
2454 			else if (!oldflp->fl4_dst)
2455 				fl.fl4_src = inet_select_addr(dev_out, 0,
2456 							      RT_SCOPE_HOST);
2457 		}
2458 	}
2459 
2460 	if (!fl.fl4_dst) {
2461 		fl.fl4_dst = fl.fl4_src;
2462 		if (!fl.fl4_dst)
2463 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2464 		if (dev_out)
2465 			dev_put(dev_out);
2466 		dev_out = &loopback_dev;
2467 		dev_hold(dev_out);
2468 		fl.oif = loopback_dev.ifindex;
2469 		res.type = RTN_LOCAL;
2470 		flags |= RTCF_LOCAL;
2471 		goto make_route;
2472 	}
2473 
2474 	if (fib_lookup(&fl, &res)) {
2475 		res.fi = NULL;
2476 		if (oldflp->oif) {
2477 			/* Apparently, the routing tables are wrong. Assume
2478 			   that the destination is on-link.
2479 
2480 			   WHY? DW.
2481 			   Because we are allowed to send to an iface
2482 			   even if it has NO routes and NO assigned
2483 			   addresses. When oif is specified, the routing
2484 			   tables are looked up with only one purpose:
2485 			   to determine whether the destination is gatewayed
2486 			   rather than direct. Moreover, if MSG_DONTROUTE is set,
2487 			   we send the packet, ignoring both the routing tables
2488 			   and the ifaddr state. --ANK
2489 
2490 
2491 			   We could do this even if oif is unknown,
2492 			   as IPv6 likely does, but we do not.
2493 			 */
2494 
2495 			if (fl.fl4_src == 0)
2496 				fl.fl4_src = inet_select_addr(dev_out, 0,
2497 							      RT_SCOPE_LINK);
2498 			res.type = RTN_UNICAST;
2499 			goto make_route;
2500 		}
2501 		if (dev_out)
2502 			dev_put(dev_out);
2503 		err = -ENETUNREACH;
2504 		goto out;
2505 	}
2506 	free_res = 1;
2507 
2508 	if (res.type == RTN_LOCAL) {
2509 		if (!fl.fl4_src)
2510 			fl.fl4_src = fl.fl4_dst;
2511 		if (dev_out)
2512 			dev_put(dev_out);
2513 		dev_out = &loopback_dev;
2514 		dev_hold(dev_out);
2515 		fl.oif = dev_out->ifindex;
2516 		if (res.fi)
2517 			fib_info_put(res.fi);
2518 		res.fi = NULL;
2519 		flags |= RTCF_LOCAL;
2520 		goto make_route;
2521 	}
2522 
2523 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2524 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2525 		fib_select_multipath(&fl, &res);
2526 	else
2527 #endif
2528 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2529 		fib_select_default(&fl, &res);
2530 
2531 	if (!fl.fl4_src)
2532 		fl.fl4_src = FIB_RES_PREFSRC(res);
2533 
2534 	if (dev_out)
2535 		dev_put(dev_out);
2536 	dev_out = FIB_RES_DEV(res);
2537 	dev_hold(dev_out);
2538 	fl.oif = dev_out->ifindex;
2539 
2540 
2541 make_route:
2542 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2543 
2544 
2545 	if (free_res)
2546 		fib_res_put(&res);
2547 	if (dev_out)
2548 		dev_put(dev_out);
2549 out:	return err;
2550 }
2551 
2552 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2553 {
2554 	unsigned hash;
2555 	struct rtable *rth;
2556 
2557 	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5));
2558 
2559 	rcu_read_lock_bh();
2560 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2561 		rth = rcu_dereference(rth->u.rt_next)) {
2562 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2563 		    rth->fl.fl4_src == flp->fl4_src &&
2564 		    rth->fl.iif == 0 &&
2565 		    rth->fl.oif == flp->oif &&
2566 #ifdef CONFIG_IP_ROUTE_FWMARK
2567 		    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2568 #endif
2569 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2570 			    (IPTOS_RT_MASK | RTO_ONLINK))) {
2571 
2572 			/* check for multipath routes and choose one if
2573 			 * necessary
2574 			 */
2575 			if (multipath_select_route(flp, rth, rp)) {
2576 				dst_hold(&(*rp)->u.dst);
2577 				RT_CACHE_STAT_INC(out_hit);
2578 				rcu_read_unlock_bh();
2579 				return 0;
2580 			}
2581 
2582 			rth->u.dst.lastuse = jiffies;
2583 			dst_hold(&rth->u.dst);
2584 			rth->u.dst.__use++;
2585 			RT_CACHE_STAT_INC(out_hit);
2586 			rcu_read_unlock_bh();
2587 			*rp = rth;
2588 			return 0;
2589 		}
2590 		RT_CACHE_STAT_INC(out_hlist_search);
2591 	}
2592 	rcu_read_unlock_bh();
2593 
2594 	return ip_route_output_slow(rp, flp);
2595 }
2596 
2597 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2598 
2599 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2600 {
2601 	int err;
2602 
2603 	if ((err = __ip_route_output_key(rp, flp)) != 0)
2604 		return err;
2605 
2606 	if (flp->proto) {
2607 		if (!flp->fl4_src)
2608 			flp->fl4_src = (*rp)->rt_src;
2609 		if (!flp->fl4_dst)
2610 			flp->fl4_dst = (*rp)->rt_dst;
2611 		return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2612 	}
2613 
2614 	return 0;
2615 }
2616 
2617 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2618 
2619 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2620 {
2621 	return ip_route_output_flow(rp, flp, NULL, 0);
2622 }
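/*
 * Illustrative sketch (not part of the original route.c): how a kernel
 * caller might resolve an output route with ip_route_output_key().  The
 * function name and the debug printout are hypothetical; the flowi
 * fields and the ip_rt_put() release match the API used in this file.
 */
#if 0	/* example only, never compiled */
static int example_resolve_output_route(u32 daddr, u32 saddr, int oif)
{
	struct rtable *rt;
	struct flowi fl = { .oif = oif,
			    .nl_u = { .ip4_u = { .daddr = daddr,
						 .saddr = saddr,
						 .tos   = RT_TOS(0) } } };
	int err = ip_route_output_key(&rt, &fl);

	if (err)
		return err;		/* e.g. -ENETUNREACH */

	printk(KERN_DEBUG "via %u.%u.%u.%u dev %s\n",
	       NIPQUAD(rt->rt_gateway), rt->u.dst.dev->name);

	ip_rt_put(rt);			/* drop the reference taken above */
	return 0;
}
#endif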
2623 
2624 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2625 			int nowait, unsigned int flags)
2626 {
2627 	struct rtable *rt = (struct rtable*)skb->dst;
2628 	struct rtmsg *r;
2629 	struct nlmsghdr  *nlh;
2630 	unsigned char	 *b = skb->tail;
2631 	struct rta_cacheinfo ci;
2632 #ifdef CONFIG_IP_MROUTE
2633 	struct rtattr *eptr;
2634 #endif
2635 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2636 	r = NLMSG_DATA(nlh);
2637 	r->rtm_family	 = AF_INET;
2638 	r->rtm_dst_len	= 32;
2639 	r->rtm_src_len	= 0;
2640 	r->rtm_tos	= rt->fl.fl4_tos;
2641 	r->rtm_table	= RT_TABLE_MAIN;
2642 	r->rtm_type	= rt->rt_type;
2643 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2644 	r->rtm_protocol = RTPROT_UNSPEC;
2645 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2646 	if (rt->rt_flags & RTCF_NOTIFY)
2647 		r->rtm_flags |= RTM_F_NOTIFY;
2648 	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2649 	if (rt->fl.fl4_src) {
2650 		r->rtm_src_len = 32;
2651 		RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2652 	}
2653 	if (rt->u.dst.dev)
2654 		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2655 #ifdef CONFIG_NET_CLS_ROUTE
2656 	if (rt->u.dst.tclassid)
2657 		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2658 #endif
2659 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2660 	if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2661 		__u32 alg = rt->rt_multipath_alg;
2662 
2663 		RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2664 	}
2665 #endif
2666 	if (rt->fl.iif)
2667 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2668 	else if (rt->rt_src != rt->fl.fl4_src)
2669 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2670 	if (rt->rt_dst != rt->rt_gateway)
2671 		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2672 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2673 		goto rtattr_failure;
2674 	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2675 	ci.rta_used	= rt->u.dst.__use;
2676 	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
2677 	if (rt->u.dst.expires)
2678 		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2679 	else
2680 		ci.rta_expires = 0;
2681 	ci.rta_error	= rt->u.dst.error;
2682 	ci.rta_id	= ci.rta_ts = ci.rta_tsage = 0;
2683 	if (rt->peer) {
2684 		ci.rta_id = rt->peer->ip_id_count;
2685 		if (rt->peer->tcp_ts_stamp) {
2686 			ci.rta_ts = rt->peer->tcp_ts;
2687 			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2688 		}
2689 	}
2690 #ifdef CONFIG_IP_MROUTE
2691 	eptr = (struct rtattr*)skb->tail;
2692 #endif
2693 	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2694 	if (rt->fl.iif) {
2695 #ifdef CONFIG_IP_MROUTE
2696 		u32 dst = rt->rt_dst;
2697 
2698 		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2699 		    ipv4_devconf.mc_forwarding) {
2700 			int err = ipmr_get_route(skb, r, nowait);
2701 			if (err <= 0) {
2702 				if (!nowait) {
2703 					if (err == 0)
2704 						return 0;
2705 					goto nlmsg_failure;
2706 				} else {
2707 					if (err == -EMSGSIZE)
2708 						goto nlmsg_failure;
2709 					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2710 				}
2711 			}
2712 		} else
2713 #endif
2714 			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2715 	}
2716 
2717 	nlh->nlmsg_len = skb->tail - b;
2718 	return skb->len;
2719 
2720 nlmsg_failure:
2721 rtattr_failure:
2722 	skb_trim(skb, b - skb->data);
2723 	return -1;
2724 }
2725 
2726 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2727 {
2728 	struct rtattr **rta = arg;
2729 	struct rtmsg *rtm = NLMSG_DATA(nlh);
2730 	struct rtable *rt = NULL;
2731 	u32 dst = 0;
2732 	u32 src = 0;
2733 	int iif = 0;
2734 	int err = -ENOBUFS;
2735 	struct sk_buff *skb;
2736 
2737 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2738 	if (!skb)
2739 		goto out;
2740 
2741 	/* Reserve room for dummy headers; this skb can pass
2742 	   through a good chunk of the routing engine.
2743 	 */
2744 	skb->mac.raw = skb->nh.raw = skb->data;
2745 
2746 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2747 	skb->nh.iph->protocol = IPPROTO_ICMP;
2748 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2749 
2750 	if (rta[RTA_SRC - 1])
2751 		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2752 	if (rta[RTA_DST - 1])
2753 		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2754 	if (rta[RTA_IIF - 1])
2755 		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2756 
2757 	if (iif) {
2758 		struct net_device *dev = __dev_get_by_index(iif);
2759 		err = -ENODEV;
2760 		if (!dev)
2761 			goto out_free;
2762 		skb->protocol	= htons(ETH_P_IP);
2763 		skb->dev	= dev;
2764 		local_bh_disable();
2765 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2766 		local_bh_enable();
2767 		rt = (struct rtable*)skb->dst;
2768 		if (!err && rt->u.dst.error)
2769 			err = -rt->u.dst.error;
2770 	} else {
2771 		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2772 							 .saddr = src,
2773 							 .tos = rtm->rtm_tos } } };
2774 		int oif = 0;
2775 		if (rta[RTA_OIF - 1])
2776 			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2777 		fl.oif = oif;
2778 		err = ip_route_output_key(&rt, &fl);
2779 	}
2780 	if (err)
2781 		goto out_free;
2782 
2783 	skb->dst = &rt->u.dst;
2784 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2785 		rt->rt_flags |= RTCF_NOTIFY;
2786 
2787 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2788 
2789 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2790 				RTM_NEWROUTE, 0, 0);
2791 	if (!err)
2792 		goto out_free;
2793 	if (err < 0) {
2794 		err = -EMSGSIZE;
2795 		goto out_free;
2796 	}
2797 
2798 	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2799 	if (err > 0)
2800 		err = 0;
2801 out:	return err;
2802 
2803 out_free:
2804 	kfree_skb(skb);
2805 	goto out;
2806 }
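/*
 * Illustrative user-space sketch (not part of the original route.c): the
 * RTM_GETROUTE request that inet_rtm_getroute() answers, roughly what
 * "ip route get 192.0.2.1" sends.  The destination address and the lack
 * of reply parsing are simplifications made for the example; the netlink
 * and rtnetlink macros are the standard ones from <linux/rtnetlink.h>.
 */
#if 0	/* user-space example only, never compiled here */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg	rtm;
		char		attrs[64];
	} req;
	struct rtattr *rta;
	in_addr_t dst = inet_addr("192.0.2.1");
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return 1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type  = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family  = AF_INET;
	req.rtm.rtm_dst_len = 32;

	/* append an RTA_DST attribute holding the 4-byte destination */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len  = RTA_LENGTH(4);
	memcpy(RTA_DATA(rta), &dst, 4);
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_LENGTH(4);

	send(fd, &req, req.nlh.nlmsg_len, 0);
	/* the kernel replies with an RTM_NEWROUTE message built by rt_fill_info() */
	close(fd);
	return 0;
}
#endif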
2807 
2808 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2809 {
2810 	struct rtable *rt;
2811 	int h, s_h;
2812 	int idx, s_idx;
2813 
2814 	s_h = cb->args[0];
2815 	s_idx = idx = cb->args[1];
2816 	for (h = 0; h <= rt_hash_mask; h++) {
2817 		if (h < s_h) continue;
2818 		if (h > s_h)
2819 			s_idx = 0;
2820 		rcu_read_lock_bh();
2821 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2822 		     rt = rcu_dereference(rt->u.rt_next), idx++) {
2823 			if (idx < s_idx)
2824 				continue;
2825 			skb->dst = dst_clone(&rt->u.dst);
2826 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2827 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2828 					 1, NLM_F_MULTI) <= 0) {
2829 				dst_release(xchg(&skb->dst, NULL));
2830 				rcu_read_unlock_bh();
2831 				goto done;
2832 			}
2833 			dst_release(xchg(&skb->dst, NULL));
2834 		}
2835 		rcu_read_unlock_bh();
2836 	}
2837 
2838 done:
2839 	cb->args[0] = h;
2840 	cb->args[1] = idx;
2841 	return skb->len;
2842 }
2843 
2844 void ip_rt_multicast_event(struct in_device *in_dev)
2845 {
2846 	rt_cache_flush(0);
2847 }
2848 
2849 #ifdef CONFIG_SYSCTL
2850 static int flush_delay;
2851 
2852 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2853 					struct file *filp, void __user *buffer,
2854 					size_t *lenp, loff_t *ppos)
2855 {
2856 	if (write) {
2857 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2858 		rt_cache_flush(flush_delay);
2859 		return 0;
2860 	}
2861 
2862 	return -EINVAL;
2863 }
2864 
2865 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2866 						int __user *name,
2867 						int nlen,
2868 						void __user *oldval,
2869 						size_t __user *oldlenp,
2870 						void __user *newval,
2871 						size_t newlen,
2872 						void **context)
2873 {
2874 	int delay;
2875 	if (newlen != sizeof(int))
2876 		return -EINVAL;
2877 	if (get_user(delay, (int __user *)newval))
2878 		return -EFAULT;
2879 	rt_cache_flush(delay);
2880 	return 0;
2881 }
2882 
2883 ctl_table ipv4_route_table[] = {
2884 	{
2885 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2886 		.procname	= "flush",
2887 		.data		= &flush_delay,
2888 		.maxlen		= sizeof(int),
2889 		.mode		= 0200,
2890 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2891 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2892 	},
2893 	{
2894 		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
2895 		.procname	= "min_delay",
2896 		.data		= &ip_rt_min_delay,
2897 		.maxlen		= sizeof(int),
2898 		.mode		= 0644,
2899 		.proc_handler	= &proc_dointvec_jiffies,
2900 		.strategy	= &sysctl_jiffies,
2901 	},
2902 	{
2903 		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
2904 		.procname	= "max_delay",
2905 		.data		= &ip_rt_max_delay,
2906 		.maxlen		= sizeof(int),
2907 		.mode		= 0644,
2908 		.proc_handler	= &proc_dointvec_jiffies,
2909 		.strategy	= &sysctl_jiffies,
2910 	},
2911 	{
2912 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2913 		.procname	= "gc_thresh",
2914 		.data		= &ipv4_dst_ops.gc_thresh,
2915 		.maxlen		= sizeof(int),
2916 		.mode		= 0644,
2917 		.proc_handler	= &proc_dointvec,
2918 	},
2919 	{
2920 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2921 		.procname	= "max_size",
2922 		.data		= &ip_rt_max_size,
2923 		.maxlen		= sizeof(int),
2924 		.mode		= 0644,
2925 		.proc_handler	= &proc_dointvec,
2926 	},
2927 	{
2928 		/*  Deprecated. Use gc_min_interval_ms */
2929 
2930 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2931 		.procname	= "gc_min_interval",
2932 		.data		= &ip_rt_gc_min_interval,
2933 		.maxlen		= sizeof(int),
2934 		.mode		= 0644,
2935 		.proc_handler	= &proc_dointvec_jiffies,
2936 		.strategy	= &sysctl_jiffies,
2937 	},
2938 	{
2939 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2940 		.procname	= "gc_min_interval_ms",
2941 		.data		= &ip_rt_gc_min_interval,
2942 		.maxlen		= sizeof(int),
2943 		.mode		= 0644,
2944 		.proc_handler	= &proc_dointvec_ms_jiffies,
2945 		.strategy	= &sysctl_ms_jiffies,
2946 	},
2947 	{
2948 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2949 		.procname	= "gc_timeout",
2950 		.data		= &ip_rt_gc_timeout,
2951 		.maxlen		= sizeof(int),
2952 		.mode		= 0644,
2953 		.proc_handler	= &proc_dointvec_jiffies,
2954 		.strategy	= &sysctl_jiffies,
2955 	},
2956 	{
2957 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2958 		.procname	= "gc_interval",
2959 		.data		= &ip_rt_gc_interval,
2960 		.maxlen		= sizeof(int),
2961 		.mode		= 0644,
2962 		.proc_handler	= &proc_dointvec_jiffies,
2963 		.strategy	= &sysctl_jiffies,
2964 	},
2965 	{
2966 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2967 		.procname	= "redirect_load",
2968 		.data		= &ip_rt_redirect_load,
2969 		.maxlen		= sizeof(int),
2970 		.mode		= 0644,
2971 		.proc_handler	= &proc_dointvec,
2972 	},
2973 	{
2974 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2975 		.procname	= "redirect_number",
2976 		.data		= &ip_rt_redirect_number,
2977 		.maxlen		= sizeof(int),
2978 		.mode		= 0644,
2979 		.proc_handler	= &proc_dointvec,
2980 	},
2981 	{
2982 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2983 		.procname	= "redirect_silence",
2984 		.data		= &ip_rt_redirect_silence,
2985 		.maxlen		= sizeof(int),
2986 		.mode		= 0644,
2987 		.proc_handler	= &proc_dointvec,
2988 	},
2989 	{
2990 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
2991 		.procname	= "error_cost",
2992 		.data		= &ip_rt_error_cost,
2993 		.maxlen		= sizeof(int),
2994 		.mode		= 0644,
2995 		.proc_handler	= &proc_dointvec,
2996 	},
2997 	{
2998 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
2999 		.procname	= "error_burst",
3000 		.data		= &ip_rt_error_burst,
3001 		.maxlen		= sizeof(int),
3002 		.mode		= 0644,
3003 		.proc_handler	= &proc_dointvec,
3004 	},
3005 	{
3006 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
3007 		.procname	= "gc_elasticity",
3008 		.data		= &ip_rt_gc_elasticity,
3009 		.maxlen		= sizeof(int),
3010 		.mode		= 0644,
3011 		.proc_handler	= &proc_dointvec,
3012 	},
3013 	{
3014 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
3015 		.procname	= "mtu_expires",
3016 		.data		= &ip_rt_mtu_expires,
3017 		.maxlen		= sizeof(int),
3018 		.mode		= 0644,
3019 		.proc_handler	= &proc_dointvec_jiffies,
3020 		.strategy	= &sysctl_jiffies,
3021 	},
3022 	{
3023 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
3024 		.procname	= "min_pmtu",
3025 		.data		= &ip_rt_min_pmtu,
3026 		.maxlen		= sizeof(int),
3027 		.mode		= 0644,
3028 		.proc_handler	= &proc_dointvec,
3029 	},
3030 	{
3031 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
3032 		.procname	= "min_adv_mss",
3033 		.data		= &ip_rt_min_advmss,
3034 		.maxlen		= sizeof(int),
3035 		.mode		= 0644,
3036 		.proc_handler	= &proc_dointvec,
3037 	},
3038 	{
3039 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3040 		.procname	= "secret_interval",
3041 		.data		= &ip_rt_secret_interval,
3042 		.maxlen		= sizeof(int),
3043 		.mode		= 0644,
3044 		.proc_handler	= &proc_dointvec_jiffies,
3045 		.strategy	= &sysctl_jiffies,
3046 	},
3047 	{ .ctl_name = 0 }
3048 };
3049 #endif
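/*
 * Illustrative user-space sketch (not part of the original route.c): the
 * "flush" entry above is write-only (mode 0200); the integer written is
 * stored in flush_delay and handed to rt_cache_flush().  The /proc/sys
 * path is the standard mapping of ipv4_route_table; the rest of this
 * snippet is a hypothetical minimal example.
 */
#if 0	/* user-space example only, never compiled here */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/route/flush", "w");

	if (!f)
		return 1;
	fprintf(f, "0\n");	/* delay 0: flush the routing cache immediately */
	fclose(f);
	return 0;
}
#endif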
3050 
3051 #ifdef CONFIG_NET_CLS_ROUTE
3052 struct ip_rt_acct *ip_rt_acct;
3053 
3054 /* This code sucks.  But you should have seen it before! --RR */
3055 
3056 /* IP route accounting ptr for this logical cpu number. */
3057 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3058 
3059 #ifdef CONFIG_PROC_FS
3060 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3061 			   int length, int *eof, void *data)
3062 {
3063 	unsigned int i;
3064 
3065 	if ((offset & 3) || (length & 3))
3066 		return -EIO;
3067 
3068 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
3069 		*eof = 1;
3070 		return 0;
3071 	}
3072 
3073 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3074 		length = sizeof(struct ip_rt_acct) * 256 - offset;
3075 		*eof = 1;
3076 	}
3077 
3078 	offset /= sizeof(u32);
3079 
3080 	if (length > 0) {
3081 		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3082 		u32 *dst = (u32 *) buffer;
3083 
3084 		/* Copy first cpu. */
3085 		*start = buffer;
3086 		memcpy(dst, src, length);
3087 
3088 		/* Add the other cpus in, one int at a time */
3089 		for_each_possible_cpu(i) {
3090 			unsigned int j;
3091 
3092 			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3093 
3094 			for (j = 0; j < length/4; j++)
3095 				dst[j] += src[j];
3096 		}
3097 	}
3098 	return length;
3099 }
3100 #endif /* CONFIG_PROC_FS */
3101 #endif /* CONFIG_NET_CLS_ROUTE */
3102 
3103 static __initdata unsigned long rhash_entries;
3104 static int __init set_rhash_entries(char *str)
3105 {
3106 	if (!str)
3107 		return 0;
3108 	rhash_entries = simple_strtoul(str, &str, 0);
3109 	return 1;
3110 }
3111 __setup("rhash_entries=", set_rhash_entries);
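/*
 * "rhash_entries" is a kernel boot parameter: booting with, for example,
 * "rhash_entries=65536" passes that value to alloc_large_system_hash()
 * in ip_rt_init() below as the requested number of route cache hash
 * buckets, instead of the memory-scaled default.  The example value is
 * arbitrary.
 */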
3112 
3113 int __init ip_rt_init(void)
3114 {
3115 	int rc = 0;
3116 
3117 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3118 			     (jiffies ^ (jiffies >> 7)));
3119 
3120 #ifdef CONFIG_NET_CLS_ROUTE
3121 	{
3122 	int order;
3123 	for (order = 0;
3124 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3125 		/* NOTHING */;
3126 	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3127 	if (!ip_rt_acct)
3128 		panic("IP: failed to allocate ip_rt_acct\n");
3129 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
3130 	}
3131 #endif
3132 
3133 	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3134 						     sizeof(struct rtable),
3135 						     0, SLAB_HWCACHE_ALIGN,
3136 						     NULL, NULL);
3137 
3138 	if (!ipv4_dst_ops.kmem_cachep)
3139 		panic("IP: failed to allocate ip_dst_cache\n");
3140 
3141 	rt_hash_table = (struct rt_hash_bucket *)
3142 		alloc_large_system_hash("IP route cache",
3143 					sizeof(struct rt_hash_bucket),
3144 					rhash_entries,
3145 					(num_physpages >= 128 * 1024) ?
3146 					15 : 17,
3147 					HASH_HIGHMEM,
3148 					&rt_hash_log,
3149 					&rt_hash_mask,
3150 					0);
3151 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3152 	rt_hash_lock_init();
3153 
3154 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3155 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3156 
3157 	devinet_init();
3158 	ip_fib_init();
3159 
3160 	init_timer(&rt_flush_timer);
3161 	rt_flush_timer.function = rt_run_flush;
3162 	init_timer(&rt_periodic_timer);
3163 	rt_periodic_timer.function = rt_check_expire;
3164 	init_timer(&rt_secret_timer);
3165 	rt_secret_timer.function = rt_secret_rebuild;
3166 
3167 	/* All the timers started at system startup tend
3168 	   to synchronize. Perturb them a bit.
3169 	 */
3170 	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3171 					ip_rt_gc_interval;
3172 	add_timer(&rt_periodic_timer);
3173 
3174 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3175 		ip_rt_secret_interval;
3176 	add_timer(&rt_secret_timer);
3177 
3178 #ifdef CONFIG_PROC_FS
3179 	{
3180 	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3181 	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3182 	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3183 			    		     proc_net_stat))) {
3184 		return -ENOMEM;
3185 	}
3186 	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3187 	}
3188 #ifdef CONFIG_NET_CLS_ROUTE
3189 	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3190 #endif
3191 #endif
3192 #ifdef CONFIG_XFRM
3193 	xfrm_init();
3194 	xfrm4_init();
3195 #endif
3196 	return rc;
3197 }
3198 
3199 EXPORT_SYMBOL(__ip_select_ident);
3200 EXPORT_SYMBOL(ip_route_input);
3201 EXPORT_SYMBOL(ip_route_output_key);
3202