xref: /linux/net/ipv4/route.c (revision 13abf8130139c2ccd4962a7e5a8902be5e6cb5a7)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *		Alan Cox	:	Verify area fixes.
18  *		Alan Cox	:	cli() protects routing changes
19  *		Rui Oliveira	:	ICMP routing table updates
20  *		(rco@di.uminho.pt)	Routing table insertion and update
21  *		Linus Torvalds	:	Rewrote bits to be sensible
22  *		Alan Cox	:	Added BSD route gw semantics
23  *		Alan Cox	:	Super /proc >4K
24  *		Alan Cox	:	MTU in route table
25  *		Alan Cox	: 	MSS actually. Also added the window
26  *					clamper.
27  *		Sam Lantinga	:	Fixed route matching in rt_del()
28  *		Alan Cox	:	Routing cache support.
29  *		Alan Cox	:	Removed compatibility cruft.
30  *		Alan Cox	:	RTF_REJECT support.
31  *		Alan Cox	:	TCP irtt support.
32  *		Jonathan Naylor	:	Added Metric support.
33  *	Miquel van Smoorenburg	:	BSD API fixes.
34  *	Miquel van Smoorenburg	:	Metrics.
35  *		Alan Cox	:	Use __u32 properly
36  *		Alan Cox	:	Aligned routing errors more closely with BSD
37  *					though our system is still very different.
38  *		Alan Cox	:	Faster /proc handling
39  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40  *					routing caches and better behaviour.
41  *
42  *		Olaf Erb	:	irtt wasn't being copied right.
43  *		Bjorn Ekwall	:	Kerneld route support.
44  *		Alan Cox	:	Multicast fixed (I hope)
45  * 		Pavel Krauz	:	Limited broadcast fixed
46  *		Mike McLagan	:	Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
48  *					route.c and rewritten from scratch.
49  *		Andi Kleen	:	Load-limit warning messages.
50  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54  *		Marc Boucher	:	routing by fwmark
55  *	Robert Olsson		:	Added rt_cache statistics
56  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
57  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/config.h>
66 #include <linux/module.h>
67 #include <asm/uaccess.h>
68 #include <asm/system.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/sched.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/skbuff.h>
85 #include <linux/rtnetlink.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/ip_mp_alg.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
109 
110 #define RT_FL_TOS(oldflp) \
111     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
112 
113 #define IP_MAX_MTU	0xFFF0
114 
115 #define RT_GC_TIMEOUT (300*HZ)
116 
117 static int ip_rt_min_delay		= 2 * HZ;
118 static int ip_rt_max_delay		= 10 * HZ;
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval		= 60 * HZ;
122 static int ip_rt_gc_min_interval	= HZ / 2;
123 static int ip_rt_redirect_number	= 9;
124 static int ip_rt_redirect_load		= HZ / 50;
125 static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost		= HZ;
127 static int ip_rt_error_burst		= 5 * HZ;
128 static int ip_rt_gc_elasticity		= 8;
129 static int ip_rt_mtu_expires		= 10 * 60 * HZ;
130 static int ip_rt_min_pmtu		= 512 + 20 + 20;
131 static int ip_rt_min_advmss		= 256;
132 static int ip_rt_secret_interval	= 10 * 60 * HZ;
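/* Rough arithmetic on the defaults above (assuming they are left at
 * their compiled-in values): ip_rt_redirect_silence works out to
 * ip_rt_redirect_load << (ip_rt_redirect_number + 1), about 20 seconds
 * of quiet time regardless of HZ, and ip_rt_min_pmtu is 552 bytes,
 * i.e. 512 bytes of payload plus 20-byte IP and TCP headers.
 */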
133 static unsigned long rt_deadline;
134 
135 #define RTprint(a...)	printk(KERN_DEBUG a)
136 
137 static struct timer_list rt_flush_timer;
138 static struct timer_list rt_periodic_timer;
139 static struct timer_list rt_secret_timer;
140 
141 /*
142  *	Interface to generic destination cache.
143  */
144 
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void		 ipv4_dst_destroy(struct dst_entry *dst);
147 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
148 					 struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void		 ipv4_link_failure(struct sk_buff *skb);
151 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(void);
153 
154 
155 static struct dst_ops ipv4_dst_ops = {
156 	.family =		AF_INET,
157 	.protocol =		__constant_htons(ETH_P_IP),
158 	.gc =			rt_garbage_collect,
159 	.check =		ipv4_dst_check,
160 	.destroy =		ipv4_dst_destroy,
161 	.ifdown =		ipv4_dst_ifdown,
162 	.negative_advice =	ipv4_negative_advice,
163 	.link_failure =		ipv4_link_failure,
164 	.update_pmtu =		ip_rt_update_pmtu,
165 	.entry_size =		sizeof(struct rtable),
166 };
167 
168 #define ECN_OR_COST(class)	TC_PRIO_##class
169 
170 __u8 ip_tos2prio[16] = {
171 	TC_PRIO_BESTEFFORT,
172 	ECN_OR_COST(FILLER),
173 	TC_PRIO_BESTEFFORT,
174 	ECN_OR_COST(BESTEFFORT),
175 	TC_PRIO_BULK,
176 	ECN_OR_COST(BULK),
177 	TC_PRIO_BULK,
178 	ECN_OR_COST(BULK),
179 	TC_PRIO_INTERACTIVE,
180 	ECN_OR_COST(INTERACTIVE),
181 	TC_PRIO_INTERACTIVE,
182 	ECN_OR_COST(INTERACTIVE),
183 	TC_PRIO_INTERACTIVE_BULK,
184 	ECN_OR_COST(INTERACTIVE_BULK),
185 	TC_PRIO_INTERACTIVE_BULK,
186 	ECN_OR_COST(INTERACTIVE_BULK)
187 };
188 
189 
190 /*
191  * Route cache.
192  */
193 
194 /* The locking scheme is rather straightforward:
195  *
196  * 1) Read-Copy Update protects the buckets of the central route hash.
197  * 2) Only writers remove entries, and they hold the lock
198  *    as they look at rtable reference counts.
199  * 3) Only readers acquire references to rtable entries,
200  *    they do so with atomic increments and with the
201  *    lock held.
202  */
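/* A rough sketch of the resulting access pattern, in terms of the
 * helpers defined further below (rt_hash_lock_addr(), rt_free()):
 *
 *	rcu_read_lock();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.rt_next))
 *		...match keys, dst_hold() on a hit...
 *	rcu_read_unlock();
 *
 * while writers take spin_lock(rt_hash_lock_addr(hash)) before unlinking
 * an entry and let rt_free() defer the actual freeing via call_rcu_bh().
 */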
203 
204 struct rt_hash_bucket {
205 	struct rtable	*chain;
206 };
207 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
208 /*
209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210  * The size of this table is a power of two and depends on the number of CPUs.
211  */
212 #if NR_CPUS >= 32
213 #define RT_HASH_LOCK_SZ	4096
214 #elif NR_CPUS >= 16
215 #define RT_HASH_LOCK_SZ	2048
216 #elif NR_CPUS >= 8
217 #define RT_HASH_LOCK_SZ	1024
218 #elif NR_CPUS >= 4
219 #define RT_HASH_LOCK_SZ	512
220 #else
221 #define RT_HASH_LOCK_SZ	256
222 #endif
223 
224 static spinlock_t	*rt_hash_locks;
225 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226 # define rt_hash_lock_init()	{ \
227 		int i; \
228 		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 			spin_lock_init(&rt_hash_locks[i]); \
232 		}
233 #else
234 # define rt_hash_lock_addr(slot) NULL
235 # define rt_hash_lock_init()
236 #endif
237 
238 static struct rt_hash_bucket 	*rt_hash_table;
239 static unsigned			rt_hash_mask;
240 static int			rt_hash_log;
241 static unsigned int		rt_hash_rnd;
242 
243 static struct rt_cache_stat *rt_cache_stat;
244 #define RT_CACHE_STAT_INC(field)					  \
245 		(per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)
246 
247 static int rt_intern_hash(unsigned hash, struct rtable *rth,
248 				struct rtable **res);
249 
250 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
251 {
252 	return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
253 		& rt_hash_mask);
254 }
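/* Callers generally fold the interface index into the source key, e.g.
 * rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos), so that the
 * same flow seen through different interfaces lands in different buckets
 * (see the lookup and insertion sites below).
 */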
255 
256 #ifdef CONFIG_PROC_FS
257 struct rt_cache_iter_state {
258 	int bucket;
259 };
260 
261 static struct rtable *rt_cache_get_first(struct seq_file *seq)
262 {
263 	struct rtable *r = NULL;
264 	struct rt_cache_iter_state *st = seq->private;
265 
266 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
267 		rcu_read_lock_bh();
268 		r = rt_hash_table[st->bucket].chain;
269 		if (r)
270 			break;
271 		rcu_read_unlock_bh();
272 	}
273 	return r;
274 }
275 
276 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
277 {
278 	struct rt_cache_iter_state *st = rcu_dereference(seq->private);
279 
280 	r = r->u.rt_next;
281 	while (!r) {
282 		rcu_read_unlock_bh();
283 		if (--st->bucket < 0)
284 			break;
285 		rcu_read_lock_bh();
286 		r = rt_hash_table[st->bucket].chain;
287 	}
288 	return r;
289 }
290 
291 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
292 {
293 	struct rtable *r = rt_cache_get_first(seq);
294 
295 	if (r)
296 		while (pos && (r = rt_cache_get_next(seq, r)))
297 			--pos;
298 	return pos ? NULL : r;
299 }
300 
301 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
302 {
303 	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
304 }
305 
306 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
307 {
308 	struct rtable *r = NULL;
309 
310 	if (v == SEQ_START_TOKEN)
311 		r = rt_cache_get_first(seq);
312 	else
313 		r = rt_cache_get_next(seq, v);
314 	++*pos;
315 	return r;
316 }
317 
318 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
319 {
320 	if (v && v != SEQ_START_TOKEN)
321 		rcu_read_unlock_bh();
322 }
323 
324 static int rt_cache_seq_show(struct seq_file *seq, void *v)
325 {
326 	if (v == SEQ_START_TOKEN)
327 		seq_printf(seq, "%-127s\n",
328 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
329 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
330 			   "HHUptod\tSpecDst");
331 	else {
332 		struct rtable *r = v;
333 		char temp[256];
334 
335 		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
336 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
337 			r->u.dst.dev ? r->u.dst.dev->name : "*",
338 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
339 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
340 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
341 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
342 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
343 			dst_metric(&r->u.dst, RTAX_WINDOW),
344 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
345 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
346 			r->fl.fl4_tos,
347 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
348 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
349 				       dev_queue_xmit) : 0,
350 			r->rt_spec_dst);
351 		seq_printf(seq, "%-127s\n", temp);
352 	}
353 	return 0;
354 }
355 
356 static struct seq_operations rt_cache_seq_ops = {
357 	.start  = rt_cache_seq_start,
358 	.next   = rt_cache_seq_next,
359 	.stop   = rt_cache_seq_stop,
360 	.show   = rt_cache_seq_show,
361 };
362 
363 static int rt_cache_seq_open(struct inode *inode, struct file *file)
364 {
365 	struct seq_file *seq;
366 	int rc = -ENOMEM;
367 	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
368 
369 	if (!s)
370 		goto out;
371 	rc = seq_open(file, &rt_cache_seq_ops);
372 	if (rc)
373 		goto out_kfree;
374 	seq          = file->private_data;
375 	seq->private = s;
376 	memset(s, 0, sizeof(*s));
377 out:
378 	return rc;
379 out_kfree:
380 	kfree(s);
381 	goto out;
382 }
383 
384 static struct file_operations rt_cache_seq_fops = {
385 	.owner	 = THIS_MODULE,
386 	.open	 = rt_cache_seq_open,
387 	.read	 = seq_read,
388 	.llseek	 = seq_lseek,
389 	.release = seq_release_private,
390 };
391 
392 
393 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
394 {
395 	int cpu;
396 
397 	if (*pos == 0)
398 		return SEQ_START_TOKEN;
399 
400 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
401 		if (!cpu_possible(cpu))
402 			continue;
403 		*pos = cpu+1;
404 		return per_cpu_ptr(rt_cache_stat, cpu);
405 	}
406 	return NULL;
407 }
408 
409 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
410 {
411 	int cpu;
412 
413 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
414 		if (!cpu_possible(cpu))
415 			continue;
416 		*pos = cpu+1;
417 		return per_cpu_ptr(rt_cache_stat, cpu);
418 	}
419 	return NULL;
420 
421 }
422 
423 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
424 {
425 
426 }
427 
428 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
429 {
430 	struct rt_cache_stat *st = v;
431 
432 	if (v == SEQ_START_TOKEN) {
433 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
434 		return 0;
435 	}
436 
437 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
438 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
439 		   atomic_read(&ipv4_dst_ops.entries),
440 		   st->in_hit,
441 		   st->in_slow_tot,
442 		   st->in_slow_mc,
443 		   st->in_no_route,
444 		   st->in_brd,
445 		   st->in_martian_dst,
446 		   st->in_martian_src,
447 
448 		   st->out_hit,
449 		   st->out_slow_tot,
450 		   st->out_slow_mc,
451 
452 		   st->gc_total,
453 		   st->gc_ignored,
454 		   st->gc_goal_miss,
455 		   st->gc_dst_overflow,
456 		   st->in_hlist_search,
457 		   st->out_hlist_search
458 		);
459 	return 0;
460 }
461 
462 static struct seq_operations rt_cpu_seq_ops = {
463 	.start  = rt_cpu_seq_start,
464 	.next   = rt_cpu_seq_next,
465 	.stop   = rt_cpu_seq_stop,
466 	.show   = rt_cpu_seq_show,
467 };
468 
469 
470 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
471 {
472 	return seq_open(file, &rt_cpu_seq_ops);
473 }
474 
475 static struct file_operations rt_cpu_seq_fops = {
476 	.owner	 = THIS_MODULE,
477 	.open	 = rt_cpu_seq_open,
478 	.read	 = seq_read,
479 	.llseek	 = seq_lseek,
480 	.release = seq_release,
481 };
482 
483 #endif /* CONFIG_PROC_FS */
484 
485 static __inline__ void rt_free(struct rtable *rt)
486 {
487 	multipath_remove(rt);
488 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
489 }
490 
491 static __inline__ void rt_drop(struct rtable *rt)
492 {
493 	multipath_remove(rt);
494 	ip_rt_put(rt);
495 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
496 }
497 
498 static __inline__ int rt_fast_clean(struct rtable *rth)
499 {
500 	/* Kill broadcast/multicast entries very aggressively if they
501 	   collide in the hash table with more useful entries */
502 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
503 		rth->fl.iif && rth->u.rt_next;
504 }
505 
506 static __inline__ int rt_valuable(struct rtable *rth)
507 {
508 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
509 		rth->u.dst.expires;
510 }
511 
512 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
513 {
514 	unsigned long age;
515 	int ret = 0;
516 
517 	if (atomic_read(&rth->u.dst.__refcnt))
518 		goto out;
519 
520 	ret = 1;
521 	if (rth->u.dst.expires &&
522 	    time_after_eq(jiffies, rth->u.dst.expires))
523 		goto out;
524 
525 	age = jiffies - rth->u.dst.lastuse;
526 	ret = 0;
527 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
528 	    (age <= tmo2 && rt_valuable(rth)))
529 		goto out;
530 	ret = 1;
531 out:	return ret;
532 }
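/* Note that the callers below (rt_check_expire(), rt_garbage_collect())
 * halve tmo1 for every entry they keep while walking a chain, so entries
 * deeper in a long hash chain face a progressively smaller timeout and
 * are reclaimed more aggressively.
 */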
533 
534 /* Bits of score are:
535  * 31: very valuable
536  * 30: not quite useless
537  * 29..0: usage counter
538  */
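/* For example (roughly): an idle broadcast entry on the input path gets
 * neither of the two high bits and keeps only the age-derived counter,
 * while an output route with a pending expiry sets both bit 31 and
 * bit 30, so it is the last candidate picked for eviction in
 * rt_intern_hash(), which frees the lowest-scoring unused entry.
 */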
539 static inline u32 rt_score(struct rtable *rt)
540 {
541 	u32 score = jiffies - rt->u.dst.lastuse;
542 
543 	score = ~score & ~(3<<30);
544 
545 	if (rt_valuable(rt))
546 		score |= (1<<31);
547 
548 	if (!rt->fl.iif ||
549 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
550 		score |= (1<<30);
551 
552 	return score;
553 }
554 
555 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
556 {
557 	return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
558 	       fl1->oif     == fl2->oif &&
559 	       fl1->iif     == fl2->iif;
560 }
561 
562 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
563 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
564 						struct rtable *expentry,
565 						int *removed_count)
566 {
567 	int passedexpired = 0;
568 	struct rtable **nextstep = NULL;
569 	struct rtable **rthp = chain_head;
570 	struct rtable *rth;
571 
572 	if (removed_count)
573 		*removed_count = 0;
574 
575 	while ((rth = *rthp) != NULL) {
576 		if (rth == expentry)
577 			passedexpired = 1;
578 
579 		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
580 		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
581 			if (*rthp == expentry) {
582 				*rthp = rth->u.rt_next;
583 				continue;
584 			} else {
585 				*rthp = rth->u.rt_next;
586 				rt_free(rth);
587 				if (removed_count)
588 					++(*removed_count);
589 			}
590 		} else {
591 			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
592 			    passedexpired && !nextstep)
593 				nextstep = &rth->u.rt_next;
594 
595 			rthp = &rth->u.rt_next;
596 		}
597 	}
598 
599 	rt_free(expentry);
600 	if (removed_count)
601 		++(*removed_count);
602 
603 	return nextstep;
604 }
605 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
606 
607 
608 /* This runs via a timer and thus is always in BH context. */
609 static void rt_check_expire(unsigned long dummy)
610 {
611 	static unsigned int rover;
612 	unsigned int i = rover, goal;
613 	struct rtable *rth, **rthp;
614 	unsigned long now = jiffies;
615 	u64 mult;
616 
617 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
618 	if (ip_rt_gc_timeout > 1)
619 		do_div(mult, ip_rt_gc_timeout);
620 	goal = (unsigned int)mult;
621 	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
622 	for (; goal > 0; goal--) {
623 		unsigned long tmo = ip_rt_gc_timeout;
624 
625 		i = (i + 1) & rt_hash_mask;
626 		rthp = &rt_hash_table[i].chain;
627 
628 		if (*rthp == NULL)
629 			continue;
630 		spin_lock(rt_hash_lock_addr(i));
631 		while ((rth = *rthp) != NULL) {
632 			if (rth->u.dst.expires) {
633 				/* Entry is expired even if it is in use */
634 				if (time_before_eq(now, rth->u.dst.expires)) {
635 					tmo >>= 1;
636 					rthp = &rth->u.rt_next;
637 					continue;
638 				}
639 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
640 				tmo >>= 1;
641 				rthp = &rth->u.rt_next;
642 				continue;
643 			}
644 
645 			/* Clean up aged-off entries. */
646 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
647 			/* remove all related balanced entries if necessary */
648 			if (rth->u.dst.flags & DST_BALANCED) {
649 				rthp = rt_remove_balanced_route(
650 					&rt_hash_table[i].chain,
651 					rth, NULL);
652 				if (!rthp)
653 					break;
654 			} else {
655 				*rthp = rth->u.rt_next;
656 				rt_free(rth);
657 			}
658 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
659  			*rthp = rth->u.rt_next;
660  			rt_free(rth);
661 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
662 		}
663 		spin_unlock(rt_hash_lock_addr(i));
664 
665 		/* Fallback loop breaker. */
666 		if (time_after(jiffies, now))
667 			break;
668 	}
669 	rover = i;
670 	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
671 }
672 
673 /* This can run from both BH and non-BH contexts, the latter
674  * in the case of a forced flush event.
675  */
676 static void rt_run_flush(unsigned long dummy)
677 {
678 	int i;
679 	struct rtable *rth, *next;
680 
681 	rt_deadline = 0;
682 
683 	get_random_bytes(&rt_hash_rnd, 4);
684 
685 	for (i = rt_hash_mask; i >= 0; i--) {
686 		spin_lock_bh(rt_hash_lock_addr(i));
687 		rth = rt_hash_table[i].chain;
688 		if (rth)
689 			rt_hash_table[i].chain = NULL;
690 		spin_unlock_bh(rt_hash_lock_addr(i));
691 
692 		for (; rth; rth = next) {
693 			next = rth->u.rt_next;
694 			rt_free(rth);
695 		}
696 	}
697 }
698 
699 static DEFINE_SPINLOCK(rt_flush_lock);
700 
701 void rt_cache_flush(int delay)
702 {
703 	unsigned long now = jiffies;
704 	int user_mode = !in_softirq();
705 
706 	if (delay < 0)
707 		delay = ip_rt_min_delay;
708 
709 	/* flush existing multipath state */
710 	multipath_flush();
711 
712 	spin_lock_bh(&rt_flush_lock);
713 
714 	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
715 		long tmo = (long)(rt_deadline - now);
716 
717 		/* If the flush timer is already running
718 		   and the flush request is not immediate (delay > 0):
719 
720 		   if the deadline has not been reached, prolong the timer to "delay",
721 		   otherwise fire it at the deadline time.
722 		 */
723 
724 		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
725 			tmo = 0;
726 
727 		if (delay > tmo)
728 			delay = tmo;
729 	}
730 
731 	if (delay <= 0) {
732 		spin_unlock_bh(&rt_flush_lock);
733 		rt_run_flush(0);
734 		return;
735 	}
736 
737 	if (rt_deadline == 0)
738 		rt_deadline = now + ip_rt_max_delay;
739 
740 	mod_timer(&rt_flush_timer, now+delay);
741 	spin_unlock_bh(&rt_flush_lock);
742 }
743 
744 static void rt_secret_rebuild(unsigned long dummy)
745 {
746 	unsigned long now = jiffies;
747 
748 	rt_cache_flush(0);
749 	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
750 }
751 
752 /*
753    Short description of GC goals.
754 
755    We want to build an algorithm which keeps the routing cache
756    at an equilibrium point, where the number of aged-off entries
757    stays approximately equal to the number of newly generated ones.
758 
759    The current expiration strength is the variable "expire".
760    We try to adjust it dynamically, so that when the network is idle
761    "expire" is large enough to keep plenty of warm entries, and when
762    load increases it shrinks to limit the cache size.
763  */
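/* Concretely (a simplified reading of the code below): each pass that
 * misses its reclaim goal halves "expire", making entries easier to
 * evict, while a pass that reaches the goal (work_done) grows "expire"
 * by ip_rt_gc_min_interval again, up to ip_rt_gc_timeout.
 */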
764 
765 static int rt_garbage_collect(void)
766 {
767 	static unsigned long expire = RT_GC_TIMEOUT;
768 	static unsigned long last_gc;
769 	static int rover;
770 	static int equilibrium;
771 	struct rtable *rth, **rthp;
772 	unsigned long now = jiffies;
773 	int goal;
774 
775 	/*
776 	 * Garbage collection is pretty expensive,
777 	 * do not make it too frequently.
778 	 */
779 
780 	RT_CACHE_STAT_INC(gc_total);
781 
782 	if (now - last_gc < ip_rt_gc_min_interval &&
783 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
784 		RT_CACHE_STAT_INC(gc_ignored);
785 		goto out;
786 	}
787 
788 	/* Calculate the number of entries we want to expire now. */
789 	goal = atomic_read(&ipv4_dst_ops.entries) -
790 		(ip_rt_gc_elasticity << rt_hash_log);
791 	if (goal <= 0) {
792 		if (equilibrium < ipv4_dst_ops.gc_thresh)
793 			equilibrium = ipv4_dst_ops.gc_thresh;
794 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
795 		if (goal > 0) {
796 			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
797 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
798 		}
799 	} else {
800 		/* We are in a dangerous area. Try to reduce the cache really
801 		 * aggressively.
802 		 */
803 		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
804 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
805 	}
806 
807 	if (now - last_gc >= ip_rt_gc_min_interval)
808 		last_gc = now;
809 
810 	if (goal <= 0) {
811 		equilibrium += goal;
812 		goto work_done;
813 	}
814 
815 	do {
816 		int i, k;
817 
818 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
819 			unsigned long tmo = expire;
820 
821 			k = (k + 1) & rt_hash_mask;
822 			rthp = &rt_hash_table[k].chain;
823 			spin_lock_bh(rt_hash_lock_addr(k));
824 			while ((rth = *rthp) != NULL) {
825 				if (!rt_may_expire(rth, tmo, expire)) {
826 					tmo >>= 1;
827 					rthp = &rth->u.rt_next;
828 					continue;
829 				}
830 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
831 				/* remove all related balanced entries
832 				 * if necessary
833 				 */
834 				if (rth->u.dst.flags & DST_BALANCED) {
835 					int r;
836 
837 					rthp = rt_remove_balanced_route(
838 						&rt_hash_table[i].chain,
839 						rth,
840 						&r);
841 					goal -= r;
842 					if (!rthp)
843 						break;
844 				} else {
845 					*rthp = rth->u.rt_next;
846 					rt_free(rth);
847 					goal--;
848 				}
849 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
850 				*rthp = rth->u.rt_next;
851 				rt_free(rth);
852 				goal--;
853 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
854 			}
855 			spin_unlock_bh(rt_hash_lock_addr(k));
856 			if (goal <= 0)
857 				break;
858 		}
859 		rover = k;
860 
861 		if (goal <= 0)
862 			goto work_done;
863 
864 		/* The goal was not achieved. We stop the process if:
865 
866 		   - expire has been reduced to zero; otherwise expire is halved.
867 		   - the table is not full.
868 		   - we are called from interrupt context.
869 		   - the jiffies check is just a fallback/debug loop breaker;
870 		     we will not spin here for a long time in any case.
871 		 */
872 
873 		RT_CACHE_STAT_INC(gc_goal_miss);
874 
875 		if (expire == 0)
876 			break;
877 
878 		expire >>= 1;
879 #if RT_CACHE_DEBUG >= 2
880 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
881 				atomic_read(&ipv4_dst_ops.entries), goal, i);
882 #endif
883 
884 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
885 			goto out;
886 	} while (!in_softirq() && time_before_eq(jiffies, now));
887 
888 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
889 		goto out;
890 	if (net_ratelimit())
891 		printk(KERN_WARNING "dst cache overflow\n");
892 	RT_CACHE_STAT_INC(gc_dst_overflow);
893 	return 1;
894 
895 work_done:
896 	expire += ip_rt_gc_min_interval;
897 	if (expire > ip_rt_gc_timeout ||
898 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
899 		expire = ip_rt_gc_timeout;
900 #if RT_CACHE_DEBUG >= 2
901 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
902 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
903 #endif
904 out:	return 0;
905 }
906 
907 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
908 {
909 	struct rtable	*rth, **rthp;
910 	unsigned long	now;
911 	struct rtable *cand, **candp;
912 	u32 		min_score;
913 	int		chain_length;
914 	int attempts = !in_softirq();
915 
916 restart:
917 	chain_length = 0;
918 	min_score = ~(u32)0;
919 	cand = NULL;
920 	candp = NULL;
921 	now = jiffies;
922 
923 	rthp = &rt_hash_table[hash].chain;
924 
925 	spin_lock_bh(rt_hash_lock_addr(hash));
926 	while ((rth = *rthp) != NULL) {
927 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
928 		if (!(rth->u.dst.flags & DST_BALANCED) &&
929 		    compare_keys(&rth->fl, &rt->fl)) {
930 #else
931 		if (compare_keys(&rth->fl, &rt->fl)) {
932 #endif
933 			/* Put it first */
934 			*rthp = rth->u.rt_next;
935 			/*
936 			 * Since lookup is lockfree, the deletion
937 			 * must be visible to another weakly ordered CPU before
938 			 * the insertion at the start of the hash chain.
939 			 */
940 			rcu_assign_pointer(rth->u.rt_next,
941 					   rt_hash_table[hash].chain);
942 			/*
943 			 * Since lookup is lockfree, the update writes
944 			 * must be ordered for consistency on SMP.
945 			 */
946 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
947 
948 			rth->u.dst.__use++;
949 			dst_hold(&rth->u.dst);
950 			rth->u.dst.lastuse = now;
951 			spin_unlock_bh(rt_hash_lock_addr(hash));
952 
953 			rt_drop(rt);
954 			*rp = rth;
955 			return 0;
956 		}
957 
958 		if (!atomic_read(&rth->u.dst.__refcnt)) {
959 			u32 score = rt_score(rth);
960 
961 			if (score <= min_score) {
962 				cand = rth;
963 				candp = rthp;
964 				min_score = score;
965 			}
966 		}
967 
968 		chain_length++;
969 
970 		rthp = &rth->u.rt_next;
971 	}
972 
973 	if (cand) {
974 		/* ip_rt_gc_elasticity used to be the average chain length;
975 		 * when it is exceeded, GC becomes really aggressive.
976 		 *
977 		 * The second limit is less certain. At the moment it allows
978 		 * only 2 entries per bucket. We will see.
979 		 */
980 		if (chain_length > ip_rt_gc_elasticity) {
981 			*candp = cand->u.rt_next;
982 			rt_free(cand);
983 		}
984 	}
985 
986 	/* Try to bind the route to an ARP neighbour only if it is an
987 	   output route or a unicast forwarding path.
988 	 */
989 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
990 		int err = arp_bind_neighbour(&rt->u.dst);
991 		if (err) {
992 			spin_unlock_bh(rt_hash_lock_addr(hash));
993 
994 			if (err != -ENOBUFS) {
995 				rt_drop(rt);
996 				return err;
997 			}
998 
999 			/* The neighbour tables are full and nothing
1000 			   can be released. Try to shrink the route cache;
1001 			   most likely it holds some neighbour records.
1002 			 */
1003 			if (attempts-- > 0) {
1004 				int saved_elasticity = ip_rt_gc_elasticity;
1005 				int saved_int = ip_rt_gc_min_interval;
1006 				ip_rt_gc_elasticity	= 1;
1007 				ip_rt_gc_min_interval	= 0;
1008 				rt_garbage_collect();
1009 				ip_rt_gc_min_interval	= saved_int;
1010 				ip_rt_gc_elasticity	= saved_elasticity;
1011 				goto restart;
1012 			}
1013 
1014 			if (net_ratelimit())
1015 				printk(KERN_WARNING "Neighbour table overflow.\n");
1016 			rt_drop(rt);
1017 			return -ENOBUFS;
1018 		}
1019 	}
1020 
1021 	rt->u.rt_next = rt_hash_table[hash].chain;
1022 #if RT_CACHE_DEBUG >= 2
1023 	if (rt->u.rt_next) {
1024 		struct rtable *trt;
1025 		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1026 		       NIPQUAD(rt->rt_dst));
1027 		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1028 			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1029 		printk("\n");
1030 	}
1031 #endif
1032 	rt_hash_table[hash].chain = rt;
1033 	spin_unlock_bh(rt_hash_lock_addr(hash));
1034 	*rp = rt;
1035 	return 0;
1036 }
1037 
1038 void rt_bind_peer(struct rtable *rt, int create)
1039 {
1040 	static DEFINE_SPINLOCK(rt_peer_lock);
1041 	struct inet_peer *peer;
1042 
1043 	peer = inet_getpeer(rt->rt_dst, create);
1044 
1045 	spin_lock_bh(&rt_peer_lock);
1046 	if (rt->peer == NULL) {
1047 		rt->peer = peer;
1048 		peer = NULL;
1049 	}
1050 	spin_unlock_bh(&rt_peer_lock);
1051 	if (peer)
1052 		inet_putpeer(peer);
1053 }
1054 
1055 /*
1056  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1057  * we can still generate some output.
1058  * Random ID selection looks a bit dangerous because we have no chance of
1059  * selecting an ID that is unique over a reasonable period of time.
1060  * But a broken packet identifier may be better than no packet at all.
1061  */
1062 static void ip_select_fb_ident(struct iphdr *iph)
1063 {
1064 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1065 	static u32 ip_fallback_id;
1066 	u32 salt;
1067 
1068 	spin_lock_bh(&ip_fb_id_lock);
1069 	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1070 	iph->id = htons(salt & 0xFFFF);
1071 	ip_fallback_id = salt;
1072 	spin_unlock_bh(&ip_fb_id_lock);
1073 }
1074 
1075 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1076 {
1077 	struct rtable *rt = (struct rtable *) dst;
1078 
1079 	if (rt) {
1080 		if (rt->peer == NULL)
1081 			rt_bind_peer(rt, 1);
1082 
1083 		/* If a peer is attached to a destination, it is never detached,
1084 		   so we need not grab a lock to dereference it.
1085 		 */
1086 		if (rt->peer) {
1087 			iph->id = htons(inet_getid(rt->peer, more));
1088 			return;
1089 		}
1090 	} else
1091 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1092 		       __builtin_return_address(0));
1093 
1094 	ip_select_fb_ident(iph);
1095 }
1096 
1097 static void rt_del(unsigned hash, struct rtable *rt)
1098 {
1099 	struct rtable **rthp;
1100 
1101 	spin_lock_bh(rt_hash_lock_addr(hash));
1102 	ip_rt_put(rt);
1103 	for (rthp = &rt_hash_table[hash].chain; *rthp;
1104 	     rthp = &(*rthp)->u.rt_next)
1105 		if (*rthp == rt) {
1106 			*rthp = rt->u.rt_next;
1107 			rt_free(rt);
1108 			break;
1109 		}
1110 	spin_unlock_bh(rt_hash_lock_addr(hash));
1111 }
1112 
1113 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1114 		    u32 saddr, u8 tos, struct net_device *dev)
1115 {
1116 	int i, k;
1117 	struct in_device *in_dev = in_dev_get(dev);
1118 	struct rtable *rth, **rthp;
1119 	u32  skeys[2] = { saddr, 0 };
1120 	int  ikeys[2] = { dev->ifindex, 0 };
1121 
1122 	tos &= IPTOS_RT_MASK;
1123 
1124 	if (!in_dev)
1125 		return;
1126 
1127 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1128 	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1129 		goto reject_redirect;
1130 
1131 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1132 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1133 			goto reject_redirect;
1134 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1135 			goto reject_redirect;
1136 	} else {
1137 		if (inet_addr_type(new_gw) != RTN_UNICAST)
1138 			goto reject_redirect;
1139 	}
1140 
1141 	for (i = 0; i < 2; i++) {
1142 		for (k = 0; k < 2; k++) {
1143 			unsigned hash = rt_hash_code(daddr,
1144 						     skeys[i] ^ (ikeys[k] << 5),
1145 						     tos);
1146 
1147 			rthp=&rt_hash_table[hash].chain;
1148 
1149 			rcu_read_lock();
1150 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1151 				struct rtable *rt;
1152 
1153 				if (rth->fl.fl4_dst != daddr ||
1154 				    rth->fl.fl4_src != skeys[i] ||
1155 				    rth->fl.fl4_tos != tos ||
1156 				    rth->fl.oif != ikeys[k] ||
1157 				    rth->fl.iif != 0) {
1158 					rthp = &rth->u.rt_next;
1159 					continue;
1160 				}
1161 
1162 				if (rth->rt_dst != daddr ||
1163 				    rth->rt_src != saddr ||
1164 				    rth->u.dst.error ||
1165 				    rth->rt_gateway != old_gw ||
1166 				    rth->u.dst.dev != dev)
1167 					break;
1168 
1169 				dst_hold(&rth->u.dst);
1170 				rcu_read_unlock();
1171 
1172 				rt = dst_alloc(&ipv4_dst_ops);
1173 				if (rt == NULL) {
1174 					ip_rt_put(rth);
1175 					in_dev_put(in_dev);
1176 					return;
1177 				}
1178 
1179 				/* Copy all the information. */
1180 				*rt = *rth;
1181  				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1182 				rt->u.dst.__use		= 1;
1183 				atomic_set(&rt->u.dst.__refcnt, 1);
1184 				rt->u.dst.child		= NULL;
1185 				if (rt->u.dst.dev)
1186 					dev_hold(rt->u.dst.dev);
1187 				if (rt->idev)
1188 					in_dev_hold(rt->idev);
1189 				rt->u.dst.obsolete	= 0;
1190 				rt->u.dst.lastuse	= jiffies;
1191 				rt->u.dst.path		= &rt->u.dst;
1192 				rt->u.dst.neighbour	= NULL;
1193 				rt->u.dst.hh		= NULL;
1194 				rt->u.dst.xfrm		= NULL;
1195 
1196 				rt->rt_flags		|= RTCF_REDIRECTED;
1197 
1198 				/* Gateway is different ... */
1199 				rt->rt_gateway		= new_gw;
1200 
1201 				/* Redirect received -> path was valid */
1202 				dst_confirm(&rth->u.dst);
1203 
1204 				if (rt->peer)
1205 					atomic_inc(&rt->peer->refcnt);
1206 
1207 				if (arp_bind_neighbour(&rt->u.dst) ||
1208 				    !(rt->u.dst.neighbour->nud_state &
1209 					    NUD_VALID)) {
1210 					if (rt->u.dst.neighbour)
1211 						neigh_event_send(rt->u.dst.neighbour, NULL);
1212 					ip_rt_put(rth);
1213 					rt_drop(rt);
1214 					goto do_next;
1215 				}
1216 
1217 				rt_del(hash, rth);
1218 				if (!rt_intern_hash(hash, rt, &rt))
1219 					ip_rt_put(rt);
1220 				goto do_next;
1221 			}
1222 			rcu_read_unlock();
1223 		do_next:
1224 			;
1225 		}
1226 	}
1227 	in_dev_put(in_dev);
1228 	return;
1229 
1230 reject_redirect:
1231 #ifdef CONFIG_IP_ROUTE_VERBOSE
1232 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1233 		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1234 			"%u.%u.%u.%u ignored.\n"
1235 			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1236 			"tos %02x\n",
1237 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1238 		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
1239 #endif
1240 	in_dev_put(in_dev);
1241 }
1242 
1243 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1244 {
1245 	struct rtable *rt = (struct rtable*)dst;
1246 	struct dst_entry *ret = dst;
1247 
1248 	if (rt) {
1249 		if (dst->obsolete) {
1250 			ip_rt_put(rt);
1251 			ret = NULL;
1252 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1253 			   rt->u.dst.expires) {
1254 			unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1255 						     rt->fl.fl4_src ^
1256 							(rt->fl.oif << 5),
1257 						     rt->fl.fl4_tos);
1258 #if RT_CACHE_DEBUG >= 1
1259 			printk(KERN_DEBUG "ip_rt_advice: redirect to "
1260 					  "%u.%u.%u.%u/%02x dropped\n",
1261 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1262 #endif
1263 			rt_del(hash, rt);
1264 			ret = NULL;
1265 		}
1266 	}
1267 	return ret;
1268 }
1269 
1270 /*
1271  * Algorithm:
1272  *	1. The first ip_rt_redirect_number redirects are sent
1273  *	   with exponential backoff, then we stop sending them at all,
1274  *	   assuming that the host ignores our redirects.
1275  *	2. If we did not see packets requiring redirects
1276  *	   during ip_rt_redirect_silence, we assume that the host
1277  *	   forgot the redirected route and start sending redirects again.
1278  *
1279  * This algorithm is much cheaper and more intelligent than dumb load limiting
1280  * in icmp.c.
1281  *
1282  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1283  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1284  */
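/* In other words (a rough sketch with the default tunables): the n-th
 * redirect toward a host is sent only once
 * jiffies > rate_last + (ip_rt_redirect_load << rate_tokens), an
 * exponential backoff starting at HZ/50; after ip_rt_redirect_number
 * redirects nothing more is sent until ip_rt_redirect_silence (about
 * 20 seconds) passes without further packets requiring redirects.
 */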
1285 
1286 void ip_rt_send_redirect(struct sk_buff *skb)
1287 {
1288 	struct rtable *rt = (struct rtable*)skb->dst;
1289 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1290 
1291 	if (!in_dev)
1292 		return;
1293 
1294 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1295 		goto out;
1296 
1297 	/* No redirected packets during ip_rt_redirect_silence;
1298 	 * reset the algorithm.
1299 	 */
1300 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1301 		rt->u.dst.rate_tokens = 0;
1302 
1303 	/* Too many ignored redirects; do not send anything and
1304 	 * set u.dst.rate_last to the last seen redirected packet.
1305 	 */
1306 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1307 		rt->u.dst.rate_last = jiffies;
1308 		goto out;
1309 	}
1310 
1311 	/* Check for load limit; set rate_last to the latest sent
1312 	 * redirect.
1313 	 */
1314 	if (time_after(jiffies,
1315 		       (rt->u.dst.rate_last +
1316 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1317 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1318 		rt->u.dst.rate_last = jiffies;
1319 		++rt->u.dst.rate_tokens;
1320 #ifdef CONFIG_IP_ROUTE_VERBOSE
1321 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1322 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1323 		    net_ratelimit())
1324 			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1325 				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1326 				NIPQUAD(rt->rt_src), rt->rt_iif,
1327 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1328 #endif
1329 	}
1330 out:
1331 	in_dev_put(in_dev);
1332 }
1333 
1334 static int ip_error(struct sk_buff *skb)
1335 {
1336 	struct rtable *rt = (struct rtable*)skb->dst;
1337 	unsigned long now;
1338 	int code;
1339 
1340 	switch (rt->u.dst.error) {
1341 		case EINVAL:
1342 		default:
1343 			goto out;
1344 		case EHOSTUNREACH:
1345 			code = ICMP_HOST_UNREACH;
1346 			break;
1347 		case ENETUNREACH:
1348 			code = ICMP_NET_UNREACH;
1349 			break;
1350 		case EACCES:
1351 			code = ICMP_PKT_FILTERED;
1352 			break;
1353 	}
1354 
1355 	now = jiffies;
1356 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1357 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1358 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1359 	rt->u.dst.rate_last = now;
1360 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1361 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1362 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1363 	}
1364 
1365 out:	kfree_skb(skb);
1366 	return 0;
1367 }
1368 
1369 /*
1370  *	The last two values are not from the RFC but
1371  *	are needed for AMPRnet AX.25 paths.
1372  */
1373 
1374 static unsigned short mtu_plateau[] =
1375 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1376 
1377 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1378 {
1379 	int i;
1380 
1381 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1382 		if (old_mtu > mtu_plateau[i])
1383 			return mtu_plateau[i];
1384 	return 68;
1385 }
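/* Example (illustrative): an ICMP "fragmentation needed" that carries no
 * next-hop MTU but reports an original length of 1500 makes guess_mtu()
 * step down to the next lower plateau, 1492; the floor is 68, the
 * minimum IPv4 MTU.
 */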
1386 
1387 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1388 {
1389 	int i;
1390 	unsigned short old_mtu = ntohs(iph->tot_len);
1391 	struct rtable *rth;
1392 	u32  skeys[2] = { iph->saddr, 0, };
1393 	u32  daddr = iph->daddr;
1394 	u8   tos = iph->tos & IPTOS_RT_MASK;
1395 	unsigned short est_mtu = 0;
1396 
1397 	if (ipv4_config.no_pmtu_disc)
1398 		return 0;
1399 
1400 	for (i = 0; i < 2; i++) {
1401 		unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1402 
1403 		rcu_read_lock();
1404 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1405 		     rth = rcu_dereference(rth->u.rt_next)) {
1406 			if (rth->fl.fl4_dst == daddr &&
1407 			    rth->fl.fl4_src == skeys[i] &&
1408 			    rth->rt_dst  == daddr &&
1409 			    rth->rt_src  == iph->saddr &&
1410 			    rth->fl.fl4_tos == tos &&
1411 			    rth->fl.iif == 0 &&
1412 			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1413 				unsigned short mtu = new_mtu;
1414 
1415 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1416 
1417 					/* BSD 4.2 compatibility hack :-( */
1418 					if (mtu == 0 &&
1419 					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1420 					    old_mtu >= 68 + (iph->ihl << 2))
1421 						old_mtu -= iph->ihl << 2;
1422 
1423 					mtu = guess_mtu(old_mtu);
1424 				}
1425 				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1426 					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1427 						dst_confirm(&rth->u.dst);
1428 						if (mtu < ip_rt_min_pmtu) {
1429 							mtu = ip_rt_min_pmtu;
1430 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1431 								(1 << RTAX_MTU);
1432 						}
1433 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1434 						dst_set_expires(&rth->u.dst,
1435 							ip_rt_mtu_expires);
1436 					}
1437 					est_mtu = mtu;
1438 				}
1439 			}
1440 		}
1441 		rcu_read_unlock();
1442 	}
1443 	return est_mtu ? : new_mtu;
1444 }
1445 
1446 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1447 {
1448 	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1449 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1450 		if (mtu < ip_rt_min_pmtu) {
1451 			mtu = ip_rt_min_pmtu;
1452 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1453 		}
1454 		dst->metrics[RTAX_MTU-1] = mtu;
1455 		dst_set_expires(dst, ip_rt_mtu_expires);
1456 	}
1457 }
1458 
1459 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1460 {
1461 	return NULL;
1462 }
1463 
1464 static void ipv4_dst_destroy(struct dst_entry *dst)
1465 {
1466 	struct rtable *rt = (struct rtable *) dst;
1467 	struct inet_peer *peer = rt->peer;
1468 	struct in_device *idev = rt->idev;
1469 
1470 	if (peer) {
1471 		rt->peer = NULL;
1472 		inet_putpeer(peer);
1473 	}
1474 
1475 	if (idev) {
1476 		rt->idev = NULL;
1477 		in_dev_put(idev);
1478 	}
1479 }
1480 
1481 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1482 			    int how)
1483 {
1484 	struct rtable *rt = (struct rtable *) dst;
1485 	struct in_device *idev = rt->idev;
1486 	if (dev != &loopback_dev && idev && idev->dev == dev) {
1487 		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1488 		if (loopback_idev) {
1489 			rt->idev = loopback_idev;
1490 			in_dev_put(idev);
1491 		}
1492 	}
1493 }
1494 
1495 static void ipv4_link_failure(struct sk_buff *skb)
1496 {
1497 	struct rtable *rt;
1498 
1499 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1500 
1501 	rt = (struct rtable *) skb->dst;
1502 	if (rt)
1503 		dst_set_expires(&rt->u.dst, 0);
1504 }
1505 
1506 static int ip_rt_bug(struct sk_buff *skb)
1507 {
1508 	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1509 		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1510 		skb->dev ? skb->dev->name : "?");
1511 	kfree_skb(skb);
1512 	return 0;
1513 }
1514 
1515 /*
1516    We do not cache the source address of the outgoing interface,
1517    because it is used only by the IP RR, TS and SRR options,
1518    so it is out of the fast path.
1519 
1520    BTW remember: "addr" is allowed to be unaligned
1521    in IP options!
1522  */
1523 
1524 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1525 {
1526 	u32 src;
1527 	struct fib_result res;
1528 
1529 	if (rt->fl.iif == 0)
1530 		src = rt->rt_src;
1531 	else if (fib_lookup(&rt->fl, &res) == 0) {
1532 		src = FIB_RES_PREFSRC(res);
1533 		fib_res_put(&res);
1534 	} else
1535 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1536 					RT_SCOPE_UNIVERSE);
1537 	memcpy(addr, &src, 4);
1538 }
1539 
1540 #ifdef CONFIG_NET_CLS_ROUTE
1541 static void set_class_tag(struct rtable *rt, u32 tag)
1542 {
1543 	if (!(rt->u.dst.tclassid & 0xFFFF))
1544 		rt->u.dst.tclassid |= tag & 0xFFFF;
1545 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1546 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1547 }
1548 #endif
1549 
1550 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1551 {
1552 	struct fib_info *fi = res->fi;
1553 
1554 	if (fi) {
1555 		if (FIB_RES_GW(*res) &&
1556 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1557 			rt->rt_gateway = FIB_RES_GW(*res);
1558 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1559 		       sizeof(rt->u.dst.metrics));
1560 		if (fi->fib_mtu == 0) {
1561 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1562 			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1563 			    rt->rt_gateway != rt->rt_dst &&
1564 			    rt->u.dst.dev->mtu > 576)
1565 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1566 		}
1567 #ifdef CONFIG_NET_CLS_ROUTE
1568 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1569 #endif
1570 	} else
1571 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1572 
1573 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1574 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1575 	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1576 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1577 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1578 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1579 				       ip_rt_min_advmss);
1580 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1581 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1582 
1583 #ifdef CONFIG_NET_CLS_ROUTE
1584 #ifdef CONFIG_IP_MULTIPLE_TABLES
1585 	set_class_tag(rt, fib_rules_tclass(res));
1586 #endif
1587 	set_class_tag(rt, itag);
1588 #endif
1589 	rt->rt_type = res->type;
1590 }
1591 
1592 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1593 				u8 tos, struct net_device *dev, int our)
1594 {
1595 	unsigned hash;
1596 	struct rtable *rth;
1597 	u32 spec_dst;
1598 	struct in_device *in_dev = in_dev_get(dev);
1599 	u32 itag = 0;
1600 
1601 	/* Primary sanity checks. */
1602 
1603 	if (in_dev == NULL)
1604 		return -EINVAL;
1605 
1606 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1607 	    skb->protocol != htons(ETH_P_IP))
1608 		goto e_inval;
1609 
1610 	if (ZERONET(saddr)) {
1611 		if (!LOCAL_MCAST(daddr))
1612 			goto e_inval;
1613 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1614 	} else if (fib_validate_source(saddr, 0, tos, 0,
1615 					dev, &spec_dst, &itag) < 0)
1616 		goto e_inval;
1617 
1618 	rth = dst_alloc(&ipv4_dst_ops);
1619 	if (!rth)
1620 		goto e_nobufs;
1621 
1622 	rth->u.dst.output= ip_rt_bug;
1623 
1624 	atomic_set(&rth->u.dst.__refcnt, 1);
1625 	rth->u.dst.flags= DST_HOST;
1626 	if (in_dev->cnf.no_policy)
1627 		rth->u.dst.flags |= DST_NOPOLICY;
1628 	rth->fl.fl4_dst	= daddr;
1629 	rth->rt_dst	= daddr;
1630 	rth->fl.fl4_tos	= tos;
1631 #ifdef CONFIG_IP_ROUTE_FWMARK
1632 	rth->fl.fl4_fwmark= skb->nfmark;
1633 #endif
1634 	rth->fl.fl4_src	= saddr;
1635 	rth->rt_src	= saddr;
1636 #ifdef CONFIG_NET_CLS_ROUTE
1637 	rth->u.dst.tclassid = itag;
1638 #endif
1639 	rth->rt_iif	=
1640 	rth->fl.iif	= dev->ifindex;
1641 	rth->u.dst.dev	= &loopback_dev;
1642 	dev_hold(rth->u.dst.dev);
1643 	rth->idev	= in_dev_get(rth->u.dst.dev);
1644 	rth->fl.oif	= 0;
1645 	rth->rt_gateway	= daddr;
1646 	rth->rt_spec_dst= spec_dst;
1647 	rth->rt_type	= RTN_MULTICAST;
1648 	rth->rt_flags	= RTCF_MULTICAST;
1649 	if (our) {
1650 		rth->u.dst.input= ip_local_deliver;
1651 		rth->rt_flags |= RTCF_LOCAL;
1652 	}
1653 
1654 #ifdef CONFIG_IP_MROUTE
1655 	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1656 		rth->u.dst.input = ip_mr_input;
1657 #endif
1658 	RT_CACHE_STAT_INC(in_slow_mc);
1659 
1660 	in_dev_put(in_dev);
1661 	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1662 	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1663 
1664 e_nobufs:
1665 	in_dev_put(in_dev);
1666 	return -ENOBUFS;
1667 
1668 e_inval:
1669 	in_dev_put(in_dev);
1670 	return -EINVAL;
1671 }
1672 
1673 
1674 static void ip_handle_martian_source(struct net_device *dev,
1675 				     struct in_device *in_dev,
1676 				     struct sk_buff *skb,
1677 				     u32 daddr,
1678 				     u32 saddr)
1679 {
1680 	RT_CACHE_STAT_INC(in_martian_src);
1681 #ifdef CONFIG_IP_ROUTE_VERBOSE
1682 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1683 		/*
1684 		 *	RFC1812 recommendation: if the source is martian,
1685 		 *	the only hint is the MAC header.
1686 		 */
1687 		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1688 			"%u.%u.%u.%u, on dev %s\n",
1689 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1690 		if (dev->hard_header_len && skb->mac.raw) {
1691 			int i;
1692 			unsigned char *p = skb->mac.raw;
1693 			printk(KERN_WARNING "ll header: ");
1694 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1695 				printk("%02x", *p);
1696 				if (i < (dev->hard_header_len - 1))
1697 					printk(":");
1698 			}
1699 			printk("\n");
1700 		}
1701 	}
1702 #endif
1703 }
1704 
1705 static inline int __mkroute_input(struct sk_buff *skb,
1706 				  struct fib_result* res,
1707 				  struct in_device *in_dev,
1708 				  u32 daddr, u32 saddr, u32 tos,
1709 				  struct rtable **result)
1710 {
1711 
1712 	struct rtable *rth;
1713 	int err;
1714 	struct in_device *out_dev;
1715 	unsigned flags = 0;
1716 	u32 spec_dst, itag;
1717 
1718 	/* get a working reference to the output device */
1719 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1720 	if (out_dev == NULL) {
1721 		if (net_ratelimit())
1722 			printk(KERN_CRIT "Bug in ip_route_input" \
1723 			       "_slow(). Please, report\n");
1724 		return -EINVAL;
1725 	}
1726 
1727 
1728 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1729 				  in_dev->dev, &spec_dst, &itag);
1730 	if (err < 0) {
1731 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1732 					 saddr);
1733 
1734 		err = -EINVAL;
1735 		goto cleanup;
1736 	}
1737 
1738 	if (err)
1739 		flags |= RTCF_DIRECTSRC;
1740 
1741 	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1742 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1743 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1744 		flags |= RTCF_DOREDIRECT;
1745 
1746 	if (skb->protocol != htons(ETH_P_IP)) {
1747 		/* Not IP (i.e. ARP). Do not create a route if it is
1748 		 * invalid for proxy ARP. DNAT routes are always valid.
1749 		 */
1750 		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1751 			err = -EINVAL;
1752 			goto cleanup;
1753 		}
1754 	}
1755 
1756 
1757 	rth = dst_alloc(&ipv4_dst_ops);
1758 	if (!rth) {
1759 		err = -ENOBUFS;
1760 		goto cleanup;
1761 	}
1762 
1763 	rth->u.dst.flags= DST_HOST;
1764 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1765 	if (res->fi->fib_nhs > 1)
1766 		rth->u.dst.flags |= DST_BALANCED;
1767 #endif
1768 	if (in_dev->cnf.no_policy)
1769 		rth->u.dst.flags |= DST_NOPOLICY;
1770 	if (in_dev->cnf.no_xfrm)
1771 		rth->u.dst.flags |= DST_NOXFRM;
1772 	rth->fl.fl4_dst	= daddr;
1773 	rth->rt_dst	= daddr;
1774 	rth->fl.fl4_tos	= tos;
1775 #ifdef CONFIG_IP_ROUTE_FWMARK
1776 	rth->fl.fl4_fwmark= skb->nfmark;
1777 #endif
1778 	rth->fl.fl4_src	= saddr;
1779 	rth->rt_src	= saddr;
1780 	rth->rt_gateway	= daddr;
1781 	rth->rt_iif 	=
1782 		rth->fl.iif	= in_dev->dev->ifindex;
1783 	rth->u.dst.dev	= (out_dev)->dev;
1784 	dev_hold(rth->u.dst.dev);
1785 	rth->idev	= in_dev_get(rth->u.dst.dev);
1786 	rth->fl.oif 	= 0;
1787 	rth->rt_spec_dst= spec_dst;
1788 
1789 	rth->u.dst.input = ip_forward;
1790 	rth->u.dst.output = ip_output;
1791 
1792 	rt_set_nexthop(rth, res, itag);
1793 
1794 	rth->rt_flags = flags;
1795 
1796 	*result = rth;
1797 	err = 0;
1798  cleanup:
1799 	/* release the working reference to the output device */
1800 	in_dev_put(out_dev);
1801 	return err;
1802 }
1803 
1804 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1805 				       struct fib_result* res,
1806 				       const struct flowi *fl,
1807 				       struct in_device *in_dev,
1808 				       u32 daddr, u32 saddr, u32 tos)
1809 {
1810 	struct rtable* rth = NULL;
1811 	int err;
1812 	unsigned hash;
1813 
1814 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1815 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1816 		fib_select_multipath(fl, res);
1817 #endif
1818 
1819 	/* create a routing cache entry */
1820 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1821 	if (err)
1822 		return err;
1823 	atomic_set(&rth->u.dst.__refcnt, 1);
1824 
1825 	/* put it into the cache */
1826 	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1827 	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1828 }
1829 
1830 static inline int ip_mkroute_input(struct sk_buff *skb,
1831 				   struct fib_result* res,
1832 				   const struct flowi *fl,
1833 				   struct in_device *in_dev,
1834 				   u32 daddr, u32 saddr, u32 tos)
1835 {
1836 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1837 	struct rtable* rth = NULL;
1838 	unsigned char hop, hopcount, lasthop;
1839 	int err = -EINVAL;
1840 	unsigned int hash;
1841 
1842 	if (res->fi)
1843 		hopcount = res->fi->fib_nhs;
1844 	else
1845 		hopcount = 1;
1846 
1847 	lasthop = hopcount - 1;
1848 
1849 	/* distinguish between multipath and singlepath */
1850 	if (hopcount < 2)
1851 		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1852 					    saddr, tos);
1853 
1854 	/* add all alternatives to the routing cache */
1855 	for (hop = 0; hop < hopcount; hop++) {
1856 		res->nh_sel = hop;
1857 
1858 		/* create a routing cache entry */
1859 		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1860 				      &rth);
1861 		if (err)
1862 			return err;
1863 
1864 		/* put it into the cache */
1865 		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1866 		err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1867 		if (err)
1868 			return err;
1869 
1870 		/* forward hop information to multipath impl. */
1871 		multipath_set_nhinfo(rth,
1872 				     FIB_RES_NETWORK(*res),
1873 				     FIB_RES_NETMASK(*res),
1874 				     res->prefixlen,
1875 				     &FIB_RES_NH(*res));
1876 
1877 		/* only for the last hop is the reference count handled
1878 		 * outside
1879 		 */
1880 		if (hop == lasthop)
1881 			atomic_set(&(skb->dst->__refcnt), 1);
1882 	}
1883 	return err;
1884 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1885 	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1886 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1887 }
1888 
1889 
1890 /*
1891  *	NOTE. We drop all packets that have local source
1892  *	addresses, because every properly looped-back packet
1893  *	must already have the correct destination attached by the output routine.
1894  *
1895  *	This approach solves two big problems:
1896  *	1. Non-simplex devices are handled properly.
1897  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1898  */
1899 
1900 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1901 			       u8 tos, struct net_device *dev)
1902 {
1903 	struct fib_result res;
1904 	struct in_device *in_dev = in_dev_get(dev);
1905 	struct flowi fl = { .nl_u = { .ip4_u =
1906 				      { .daddr = daddr,
1907 					.saddr = saddr,
1908 					.tos = tos,
1909 					.scope = RT_SCOPE_UNIVERSE,
1910 #ifdef CONFIG_IP_ROUTE_FWMARK
1911 					.fwmark = skb->nfmark
1912 #endif
1913 				      } },
1914 			    .iif = dev->ifindex };
1915 	unsigned	flags = 0;
1916 	u32		itag = 0;
1917 	struct rtable * rth;
1918 	unsigned	hash;
1919 	u32		spec_dst;
1920 	int		err = -EINVAL;
1921 	int		free_res = 0;
1922 
1923 	/* IP on this device is disabled. */
1924 
1925 	if (!in_dev)
1926 		goto out;
1927 
1928 	/* Check for the weirdest martians, which cannot be detected
1929 	   by fib_lookup.
1930 	 */
1931 
1932 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1933 		goto martian_source;
1934 
1935 	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1936 		goto brd_input;
1937 
1938 	/* Accept a zero source address only for limited broadcast;
1939 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1940 	 */
1941 	if (ZERONET(saddr))
1942 		goto martian_source;
1943 
1944 	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1945 		goto martian_destination;
1946 
1947 	/*
1948 	 *	Now we are ready to route the packet.
1949 	 */
1950 	if ((err = fib_lookup(&fl, &res)) != 0) {
1951 		if (!IN_DEV_FORWARD(in_dev))
1952 			goto e_hostunreach;
1953 		goto no_route;
1954 	}
1955 	free_res = 1;
1956 
1957 	RT_CACHE_STAT_INC(in_slow_tot);
1958 
1959 	if (res.type == RTN_BROADCAST)
1960 		goto brd_input;
1961 
1962 	if (res.type == RTN_LOCAL) {
1963 		int result;
1964 		result = fib_validate_source(saddr, daddr, tos,
1965 					     loopback_dev.ifindex,
1966 					     dev, &spec_dst, &itag);
1967 		if (result < 0)
1968 			goto martian_source;
1969 		if (result)
1970 			flags |= RTCF_DIRECTSRC;
1971 		spec_dst = daddr;
1972 		goto local_input;
1973 	}
1974 
1975 	if (!IN_DEV_FORWARD(in_dev))
1976 		goto e_hostunreach;
1977 	if (res.type != RTN_UNICAST)
1978 		goto martian_destination;
1979 
1980 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1981 	if (err == -ENOBUFS)
1982 		goto e_nobufs;
1983 	if (err == -EINVAL)
1984 		goto e_inval;
1985 
1986 done:
1987 	in_dev_put(in_dev);
1988 	if (free_res)
1989 		fib_res_put(&res);
1990 out:	return err;
1991 
1992 brd_input:
1993 	if (skb->protocol != htons(ETH_P_IP))
1994 		goto e_inval;
1995 
1996 	if (ZERONET(saddr))
1997 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1998 	else {
1999 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2000 					  &itag);
2001 		if (err < 0)
2002 			goto martian_source;
2003 		if (err)
2004 			flags |= RTCF_DIRECTSRC;
2005 	}
2006 	flags |= RTCF_BROADCAST;
2007 	res.type = RTN_BROADCAST;
2008 	RT_CACHE_STAT_INC(in_brd);
2009 
2010 local_input:
2011 	rth = dst_alloc(&ipv4_dst_ops);
2012 	if (!rth)
2013 		goto e_nobufs;
2014 
2015 	rth->u.dst.output= ip_rt_bug;
2016 
2017 	atomic_set(&rth->u.dst.__refcnt, 1);
2018 	rth->u.dst.flags= DST_HOST;
2019 	if (in_dev->cnf.no_policy)
2020 		rth->u.dst.flags |= DST_NOPOLICY;
2021 	rth->fl.fl4_dst	= daddr;
2022 	rth->rt_dst	= daddr;
2023 	rth->fl.fl4_tos	= tos;
2024 #ifdef CONFIG_IP_ROUTE_FWMARK
2025 	rth->fl.fl4_fwmark= skb->nfmark;
2026 #endif
2027 	rth->fl.fl4_src	= saddr;
2028 	rth->rt_src	= saddr;
2029 #ifdef CONFIG_NET_CLS_ROUTE
2030 	rth->u.dst.tclassid = itag;
2031 #endif
2032 	rth->rt_iif	=
2033 	rth->fl.iif	= dev->ifindex;
2034 	rth->u.dst.dev	= &loopback_dev;
2035 	dev_hold(rth->u.dst.dev);
2036 	rth->idev	= in_dev_get(rth->u.dst.dev);
2037 	rth->rt_gateway	= daddr;
2038 	rth->rt_spec_dst= spec_dst;
2039 	rth->u.dst.input= ip_local_deliver;
2040 	rth->rt_flags 	= flags|RTCF_LOCAL;
2041 	if (res.type == RTN_UNREACHABLE) {
2042 		rth->u.dst.input= ip_error;
2043 		rth->u.dst.error= -err;
2044 		rth->rt_flags 	&= ~RTCF_LOCAL;
2045 	}
2046 	rth->rt_type	= res.type;
2047 	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2048 	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2049 	goto done;
2050 
2051 no_route:
2052 	RT_CACHE_STAT_INC(in_no_route);
2053 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2054 	res.type = RTN_UNREACHABLE;
2055 	goto local_input;
2056 
2057 	/*
2058 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2059 	 */
2060 martian_destination:
2061 	RT_CACHE_STAT_INC(in_martian_dst);
2062 #ifdef CONFIG_IP_ROUTE_VERBOSE
2063 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2064 		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2065 			"%u.%u.%u.%u, dev %s\n",
2066 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2067 #endif
2068 
2069 e_hostunreach:
2070 	err = -EHOSTUNREACH;
2071 	goto done;
2072 
2073 e_inval:
2074 	err = -EINVAL;
2075 	goto done;
2076 
2077 e_nobufs:
2078 	err = -ENOBUFS;
2079 	goto done;
2080 
2081 martian_source:
2082 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2083 	goto e_inval;
2084 }
2085 
2086 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2087 		   u8 tos, struct net_device *dev)
2088 {
2089 	struct rtable * rth;
2090 	unsigned	hash;
2091 	int iif = dev->ifindex;
2092 
2093 	tos &= IPTOS_RT_MASK;
2094 	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2095 
2096 	rcu_read_lock();
2097 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2098 	     rth = rcu_dereference(rth->u.rt_next)) {
2099 		if (rth->fl.fl4_dst == daddr &&
2100 		    rth->fl.fl4_src == saddr &&
2101 		    rth->fl.iif == iif &&
2102 		    rth->fl.oif == 0 &&
2103 #ifdef CONFIG_IP_ROUTE_FWMARK
2104 		    rth->fl.fl4_fwmark == skb->nfmark &&
2105 #endif
2106 		    rth->fl.fl4_tos == tos) {
2107 			rth->u.dst.lastuse = jiffies;
2108 			dst_hold(&rth->u.dst);
2109 			rth->u.dst.__use++;
2110 			RT_CACHE_STAT_INC(in_hit);
2111 			rcu_read_unlock();
2112 			skb->dst = (struct dst_entry*)rth;
2113 			return 0;
2114 		}
2115 		RT_CACHE_STAT_INC(in_hlist_search);
2116 	}
2117 	rcu_read_unlock();
2118 
2119 	/* Multicast recognition logic was moved from the route cache to here.
2120 	   The problem was that too many Ethernet cards have broken/missing
2121 	   hardware multicast filters :-( As a result, a host on a multicast
2122 	   network acquires a lot of useless route cache entries, e.g. for
2123 	   SDR messages from all over the world. Now we try to get rid of them.
2124 	   Provided the software IP multicast filter is organized reasonably
2125 	   (at least, hashed), this does not result in a slowdown compared
2126 	   with route cache reject entries.
2127 	   Note that multicast routers are not affected, because a route
2128 	   cache entry is created for them eventually.
2129 	 */
2130 	if (MULTICAST(daddr)) {
2131 		struct in_device *in_dev;
2132 
2133 		rcu_read_lock();
2134 		if ((in_dev = __in_dev_get(dev)) != NULL) {
2135 			int our = ip_check_mc(in_dev, daddr, saddr,
2136 				skb->nh.iph->protocol);
2137 			if (our
2138 #ifdef CONFIG_IP_MROUTE
2139 			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2140 #endif
2141 			    ) {
2142 				rcu_read_unlock();
2143 				return ip_route_input_mc(skb, daddr, saddr,
2144 							 tos, dev, our);
2145 			}
2146 		}
2147 		rcu_read_unlock();
2148 		return -EINVAL;
2149 	}
2150 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2151 }
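
/*
 * Illustrative sketch, not part of the original file: a caller on the
 * receive path typically resolves the input route for a freshly
 * received packet much as the hypothetical helper below does (this is
 * roughly what ip_rcv_finish() in ip_input.c does).  On success
 * skb->dst points at the cached or newly built rtable; otherwise a
 * negative errno such as -EINVAL or -EHOSTUNREACH is returned.
 */
static inline int example_route_incoming_skb(struct sk_buff *skb)
{
	struct iphdr *iph = skb->nh.iph;

	/* daddr, saddr and tos come straight from the IP header; the
	 * receiving device identifies the input interface.
	 */
	return ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
			      skb->dev);
}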
2152 
2153 static inline int __mkroute_output(struct rtable **result,
2154 				   struct fib_result* res,
2155 				   const struct flowi *fl,
2156 				   const struct flowi *oldflp,
2157 				   struct net_device *dev_out,
2158 				   unsigned flags)
2159 {
2160 	struct rtable *rth;
2161 	struct in_device *in_dev;
2162 	u32 tos = RT_FL_TOS(oldflp);
2163 	int err = 0;
2164 
2165 	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2166 		return -EINVAL;
2167 
2168 	if (fl->fl4_dst == 0xFFFFFFFF)
2169 		res->type = RTN_BROADCAST;
2170 	else if (MULTICAST(fl->fl4_dst))
2171 		res->type = RTN_MULTICAST;
2172 	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2173 		return -EINVAL;
2174 
2175 	if (dev_out->flags & IFF_LOOPBACK)
2176 		flags |= RTCF_LOCAL;
2177 
2178 	/* get work reference to inet device */
2179 	in_dev = in_dev_get(dev_out);
2180 	if (!in_dev)
2181 		return -EINVAL;
2182 
2183 	if (res->type == RTN_BROADCAST) {
2184 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2185 		if (res->fi) {
2186 			fib_info_put(res->fi);
2187 			res->fi = NULL;
2188 		}
2189 	} else if (res->type == RTN_MULTICAST) {
2190 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2191 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2192 				 oldflp->proto))
2193 			flags &= ~RTCF_LOCAL;
2194 		/* If the multicast route does not exist, use the
2195 		   default one, but do not gateway in this case.
2196 		   Yes, it is a hack.
2197 		 */
2198 		if (res->fi && res->prefixlen < 4) {
2199 			fib_info_put(res->fi);
2200 			res->fi = NULL;
2201 		}
2202 	}
2203 
2204 
2205 	rth = dst_alloc(&ipv4_dst_ops);
2206 	if (!rth) {
2207 		err = -ENOBUFS;
2208 		goto cleanup;
2209 	}
2210 
2211 	rth->u.dst.flags= DST_HOST;
2212 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2213 	if (res->fi) {
2214 		rth->rt_multipath_alg = res->fi->fib_mp_alg;
2215 		if (res->fi->fib_nhs > 1)
2216 			rth->u.dst.flags |= DST_BALANCED;
2217 	}
2218 #endif
2219 	if (in_dev->cnf.no_xfrm)
2220 		rth->u.dst.flags |= DST_NOXFRM;
2221 	if (in_dev->cnf.no_policy)
2222 		rth->u.dst.flags |= DST_NOPOLICY;
2223 
2224 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2225 	rth->fl.fl4_tos	= tos;
2226 	rth->fl.fl4_src	= oldflp->fl4_src;
2227 	rth->fl.oif	= oldflp->oif;
2228 #ifdef CONFIG_IP_ROUTE_FWMARK
2229 	rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2230 #endif
2231 	rth->rt_dst	= fl->fl4_dst;
2232 	rth->rt_src	= fl->fl4_src;
2233 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2234 	/* get references to the devices that are to be held by the routing
2235 	   cache entry */
2236 	rth->u.dst.dev	= dev_out;
2237 	dev_hold(dev_out);
2238 	rth->idev	= in_dev_get(dev_out);
2239 	rth->rt_gateway = fl->fl4_dst;
2240 	rth->rt_spec_dst= fl->fl4_src;
2241 
2242 	rth->u.dst.output=ip_output;
2243 
2244 	RT_CACHE_STAT_INC(out_slow_tot);
2245 
2246 	if (flags & RTCF_LOCAL) {
2247 		rth->u.dst.input = ip_local_deliver;
2248 		rth->rt_spec_dst = fl->fl4_dst;
2249 	}
2250 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2251 		rth->rt_spec_dst = fl->fl4_src;
2252 		if (flags & RTCF_LOCAL &&
2253 		    !(dev_out->flags & IFF_LOOPBACK)) {
2254 			rth->u.dst.output = ip_mc_output;
2255 			RT_CACHE_STAT_INC(out_slow_mc);
2256 		}
2257 #ifdef CONFIG_IP_MROUTE
2258 		if (res->type == RTN_MULTICAST) {
2259 			if (IN_DEV_MFORWARD(in_dev) &&
2260 			    !LOCAL_MCAST(oldflp->fl4_dst)) {
2261 				rth->u.dst.input = ip_mr_input;
2262 				rth->u.dst.output = ip_mc_output;
2263 			}
2264 		}
2265 #endif
2266 	}
2267 
2268 	rt_set_nexthop(rth, res, 0);
2269 
2270 	rth->rt_flags = flags;
2271 
2272 	*result = rth;
2273  cleanup:
2274 	/* release work reference to inet device */
2275 	in_dev_put(in_dev);
2276 
2277 	return err;
2278 }
2279 
2280 static inline int ip_mkroute_output_def(struct rtable **rp,
2281 					struct fib_result* res,
2282 					const struct flowi *fl,
2283 					const struct flowi *oldflp,
2284 					struct net_device *dev_out,
2285 					unsigned flags)
2286 {
2287 	struct rtable *rth = NULL;
2288 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2289 	unsigned hash;
2290 	if (err == 0) {
2291 		u32 tos = RT_FL_TOS(oldflp);
2292 
2293 		atomic_set(&rth->u.dst.__refcnt, 1);
2294 
2295 		hash = rt_hash_code(oldflp->fl4_dst,
2296 				    oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2297 		err = rt_intern_hash(hash, rth, rp);
2298 	}
2299 
2300 	return err;
2301 }
2302 
2303 static inline int ip_mkroute_output(struct rtable** rp,
2304 				    struct fib_result* res,
2305 				    const struct flowi *fl,
2306 				    const struct flowi *oldflp,
2307 				    struct net_device *dev_out,
2308 				    unsigned flags)
2309 {
2310 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2311 	u32 tos = RT_FL_TOS(oldflp);
2312 	unsigned char hop;
2313 	unsigned hash;
2314 	int err = -EINVAL;
2315 	struct rtable *rth = NULL;
2316 
2317 	if (res->fi && res->fi->fib_nhs > 1) {
2318 		unsigned char hopcount = res->fi->fib_nhs;
2319 
2320 		for (hop = 0; hop < hopcount; hop++) {
2321 			struct net_device *dev2nexthop;
2322 
2323 			res->nh_sel = hop;
2324 
2325 			/* hold a work reference to the output device */
2326 			dev2nexthop = FIB_RES_DEV(*res);
2327 			dev_hold(dev2nexthop);
2328 
2329 			err = __mkroute_output(&rth, res, fl, oldflp,
2330 					       dev2nexthop, flags);
2331 
2332 			if (err != 0)
2333 				goto cleanup;
2334 
2335 			hash = rt_hash_code(oldflp->fl4_dst,
2336 					    oldflp->fl4_src ^
2337 					    (oldflp->oif << 5), tos);
2338 			err = rt_intern_hash(hash, rth, rp);
2339 
2340 			/* forward hop information to multipath impl. */
2341 			multipath_set_nhinfo(rth,
2342 					     FIB_RES_NETWORK(*res),
2343 					     FIB_RES_NETMASK(*res),
2344 					     res->prefixlen,
2345 					     &FIB_RES_NH(*res));
2346 		cleanup:
2347 			/* release work reference to output device */
2348 			dev_put(dev2nexthop);
2349 
2350 			if (err != 0)
2351 				return err;
2352 		}
2353 		atomic_set(&(*rp)->u.dst.__refcnt, 1);
2354 		return err;
2355 	} else {
2356 		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2357 					     flags);
2358 	}
2359 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2360 	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2361 #endif
2362 }
2363 
2364 /*
2365  * Major route resolver routine.
2366  */
2367 
2368 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2369 {
2370 	u32 tos	= RT_FL_TOS(oldflp);
2371 	struct flowi fl = { .nl_u = { .ip4_u =
2372 				      { .daddr = oldflp->fl4_dst,
2373 					.saddr = oldflp->fl4_src,
2374 					.tos = tos & IPTOS_RT_MASK,
2375 					.scope = ((tos & RTO_ONLINK) ?
2376 						  RT_SCOPE_LINK :
2377 						  RT_SCOPE_UNIVERSE),
2378 #ifdef CONFIG_IP_ROUTE_FWMARK
2379 					.fwmark = oldflp->fl4_fwmark
2380 #endif
2381 				      } },
2382 			    .iif = loopback_dev.ifindex,
2383 			    .oif = oldflp->oif };
2384 	struct fib_result res;
2385 	unsigned flags = 0;
2386 	struct net_device *dev_out = NULL;
2387 	int free_res = 0;
2388 	int err;
2389 
2390 
2391 	res.fi		= NULL;
2392 #ifdef CONFIG_IP_MULTIPLE_TABLES
2393 	res.r		= NULL;
2394 #endif
2395 
2396 	if (oldflp->fl4_src) {
2397 		err = -EINVAL;
2398 		if (MULTICAST(oldflp->fl4_src) ||
2399 		    BADCLASS(oldflp->fl4_src) ||
2400 		    ZERONET(oldflp->fl4_src))
2401 			goto out;
2402 
2403 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2404 		dev_out = ip_dev_find(oldflp->fl4_src);
2405 		if (dev_out == NULL)
2406 			goto out;
2407 
2408 		/* I removed the check for oif == dev_out->oif here.
2409 		   It was wrong for two reasons:
2410 		   1. ip_dev_find(saddr) can return the wrong iface if saddr
2411 		      is assigned to multiple interfaces.
2412 		   2. Moreover, we are allowed to send packets with the saddr
2413 		      of another iface. --ANK
2414 		 */
2415 
2416 		if (oldflp->oif == 0
2417 		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2418 			/* Special hack: the user can direct multicasts
2419 			   and limited broadcasts via the desired interface
2420 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2421 			   This hack is not just for fun, it allows
2422 			   vic, vat and friends to work.
2423 			   They bind the socket to loopback, set the ttl to zero
2424 			   and expect that it will work.
2425 			   From the viewpoint of the routing cache they are broken,
2426 			   because we are not allowed to build a multicast path
2427 			   with a loopback source addr (the routing cache
2428 			   cannot know that the ttl is zero, so the packet
2429 			   will not leave this host and the route is valid).
2430 			   Luckily, this hack is a good workaround.
2431 			 */
2432 
2433 			fl.oif = dev_out->ifindex;
2434 			goto make_route;
2435 		}
2436 		if (dev_out)
2437 			dev_put(dev_out);
2438 		dev_out = NULL;
2439 	}
2440 
2441 
2442 	if (oldflp->oif) {
2443 		dev_out = dev_get_by_index(oldflp->oif);
2444 		err = -ENODEV;
2445 		if (dev_out == NULL)
2446 			goto out;
2447 		if (__in_dev_get(dev_out) == NULL) {
2448 			dev_put(dev_out);
2449 			goto out;	/* Wrong error code */
2450 		}
2451 
2452 		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2453 			if (!fl.fl4_src)
2454 				fl.fl4_src = inet_select_addr(dev_out, 0,
2455 							      RT_SCOPE_LINK);
2456 			goto make_route;
2457 		}
2458 		if (!fl.fl4_src) {
2459 			if (MULTICAST(oldflp->fl4_dst))
2460 				fl.fl4_src = inet_select_addr(dev_out, 0,
2461 							      fl.fl4_scope);
2462 			else if (!oldflp->fl4_dst)
2463 				fl.fl4_src = inet_select_addr(dev_out, 0,
2464 							      RT_SCOPE_HOST);
2465 		}
2466 	}
2467 
2468 	if (!fl.fl4_dst) {
2469 		fl.fl4_dst = fl.fl4_src;
2470 		if (!fl.fl4_dst)
2471 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2472 		if (dev_out)
2473 			dev_put(dev_out);
2474 		dev_out = &loopback_dev;
2475 		dev_hold(dev_out);
2476 		fl.oif = loopback_dev.ifindex;
2477 		res.type = RTN_LOCAL;
2478 		flags |= RTCF_LOCAL;
2479 		goto make_route;
2480 	}
2481 
2482 	if (fib_lookup(&fl, &res)) {
2483 		res.fi = NULL;
2484 		if (oldflp->oif) {
2485 			/* Apparently, the routing tables are wrong. Assume
2486 			   that the destination is on-link.
2487 
2488 			   WHY? DW.
2489 			   Because we are allowed to send to an iface
2490 			   even if it has NO routes and NO assigned
2491 			   addresses. When oif is specified, the routing
2492 			   tables are looked up with only one purpose:
2493 			   to check whether the destination is gatewayed,
2494 			   rather than direct. Moreover, if MSG_DONTROUTE is
2495 			   set, we send the packet, ignoring both the routing
2496 			   tables and the ifaddr state. --ANK
2497 
2498 
2499 			   We could do this even when oif is unknown
2500 			   (as IPv6 likely does), but we do not.
2501 			 */
2502 
2503 			if (fl.fl4_src == 0)
2504 				fl.fl4_src = inet_select_addr(dev_out, 0,
2505 							      RT_SCOPE_LINK);
2506 			res.type = RTN_UNICAST;
2507 			goto make_route;
2508 		}
2509 		if (dev_out)
2510 			dev_put(dev_out);
2511 		err = -ENETUNREACH;
2512 		goto out;
2513 	}
2514 	free_res = 1;
2515 
2516 	if (res.type == RTN_LOCAL) {
2517 		if (!fl.fl4_src)
2518 			fl.fl4_src = fl.fl4_dst;
2519 		if (dev_out)
2520 			dev_put(dev_out);
2521 		dev_out = &loopback_dev;
2522 		dev_hold(dev_out);
2523 		fl.oif = dev_out->ifindex;
2524 		if (res.fi)
2525 			fib_info_put(res.fi);
2526 		res.fi = NULL;
2527 		flags |= RTCF_LOCAL;
2528 		goto make_route;
2529 	}
2530 
2531 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2532 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2533 		fib_select_multipath(&fl, &res);
2534 	else
2535 #endif
2536 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2537 		fib_select_default(&fl, &res);
2538 
2539 	if (!fl.fl4_src)
2540 		fl.fl4_src = FIB_RES_PREFSRC(res);
2541 
2542 	if (dev_out)
2543 		dev_put(dev_out);
2544 	dev_out = FIB_RES_DEV(res);
2545 	dev_hold(dev_out);
2546 	fl.oif = dev_out->ifindex;
2547 
2548 
2549 make_route:
2550 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2551 
2552 
2553 	if (free_res)
2554 		fib_res_put(&res);
2555 	if (dev_out)
2556 		dev_put(dev_out);
2557 out:	return err;
2558 }
2559 
2560 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2561 {
2562 	unsigned hash;
2563 	struct rtable *rth;
2564 
2565 	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2566 
2567 	rcu_read_lock_bh();
2568 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2569 		rth = rcu_dereference(rth->u.rt_next)) {
2570 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2571 		    rth->fl.fl4_src == flp->fl4_src &&
2572 		    rth->fl.iif == 0 &&
2573 		    rth->fl.oif == flp->oif &&
2574 #ifdef CONFIG_IP_ROUTE_FWMARK
2575 		    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2576 #endif
2577 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2578 			    (IPTOS_RT_MASK | RTO_ONLINK))) {
2579 
2580 			/* check for multipath routes and choose one if
2581 			 * necessary
2582 			 */
2583 			if (multipath_select_route(flp, rth, rp)) {
2584 				dst_hold(&(*rp)->u.dst);
2585 				RT_CACHE_STAT_INC(out_hit);
2586 				rcu_read_unlock_bh();
2587 				return 0;
2588 			}
2589 
2590 			rth->u.dst.lastuse = jiffies;
2591 			dst_hold(&rth->u.dst);
2592 			rth->u.dst.__use++;
2593 			RT_CACHE_STAT_INC(out_hit);
2594 			rcu_read_unlock_bh();
2595 			*rp = rth;
2596 			return 0;
2597 		}
2598 		RT_CACHE_STAT_INC(out_hlist_search);
2599 	}
2600 	rcu_read_unlock_bh();
2601 
2602 	return ip_route_output_slow(rp, flp);
2603 }
2604 
2605 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2606 
2607 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2608 {
2609 	int err;
2610 
2611 	if ((err = __ip_route_output_key(rp, flp)) != 0)
2612 		return err;
2613 
2614 	if (flp->proto) {
2615 		if (!flp->fl4_src)
2616 			flp->fl4_src = (*rp)->rt_src;
2617 		if (!flp->fl4_dst)
2618 			flp->fl4_dst = (*rp)->rt_dst;
2619 		return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2620 	}
2621 
2622 	return 0;
2623 }
2624 
2625 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2626 
2627 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2628 {
2629 	return ip_route_output_flow(rp, flp, NULL, 0);
2630 }
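
/*
 * Illustrative sketch, not part of the original file: a caller of the
 * output routing API fills in a struct flowi key and lets
 * ip_route_output_key() either satisfy it from the hash table above or
 * fall back to ip_route_output_slow().  The helper name below is
 * hypothetical.  When flp->proto is set, ip_route_output_flow()
 * additionally fills in a missing source/destination from the result
 * and hands the route to xfrm_lookup().
 */
static inline int example_route_to(u32 daddr, struct rtable **rp)
{
	/* Only the destination is specified; oif, saddr and tos are
	 * left at zero, so the resolver chooses the output device and
	 * the preferred source address itself.  On success the caller
	 * owns a reference on *rp and must drop it with ip_rt_put()
	 * when done.
	 */
	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr } } };

	return ip_route_output_key(rp, &fl);
}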
2631 
2632 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2633 			int nowait, unsigned int flags)
2634 {
2635 	struct rtable *rt = (struct rtable*)skb->dst;
2636 	struct rtmsg *r;
2637 	struct nlmsghdr  *nlh;
2638 	unsigned char	 *b = skb->tail;
2639 	struct rta_cacheinfo ci;
2640 #ifdef CONFIG_IP_MROUTE
2641 	struct rtattr *eptr;
2642 #endif
2643 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2644 	r = NLMSG_DATA(nlh);
2645 	r->rtm_family	 = AF_INET;
2646 	r->rtm_dst_len	= 32;
2647 	r->rtm_src_len	= 0;
2648 	r->rtm_tos	= rt->fl.fl4_tos;
2649 	r->rtm_table	= RT_TABLE_MAIN;
2650 	r->rtm_type	= rt->rt_type;
2651 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2652 	r->rtm_protocol = RTPROT_UNSPEC;
2653 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2654 	if (rt->rt_flags & RTCF_NOTIFY)
2655 		r->rtm_flags |= RTM_F_NOTIFY;
2656 	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2657 	if (rt->fl.fl4_src) {
2658 		r->rtm_src_len = 32;
2659 		RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2660 	}
2661 	if (rt->u.dst.dev)
2662 		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2663 #ifdef CONFIG_NET_CLS_ROUTE
2664 	if (rt->u.dst.tclassid)
2665 		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2666 #endif
2667 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2668 	if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2669 		__u32 alg = rt->rt_multipath_alg;
2670 
2671 		RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2672 	}
2673 #endif
2674 	if (rt->fl.iif)
2675 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2676 	else if (rt->rt_src != rt->fl.fl4_src)
2677 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2678 	if (rt->rt_dst != rt->rt_gateway)
2679 		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2680 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2681 		goto rtattr_failure;
2682 	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2683 	ci.rta_used	= rt->u.dst.__use;
2684 	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
2685 	if (rt->u.dst.expires)
2686 		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2687 	else
2688 		ci.rta_expires = 0;
2689 	ci.rta_error	= rt->u.dst.error;
2690 	ci.rta_id	= ci.rta_ts = ci.rta_tsage = 0;
2691 	if (rt->peer) {
2692 		ci.rta_id = rt->peer->ip_id_count;
2693 		if (rt->peer->tcp_ts_stamp) {
2694 			ci.rta_ts = rt->peer->tcp_ts;
2695 			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2696 		}
2697 	}
2698 #ifdef CONFIG_IP_MROUTE
2699 	eptr = (struct rtattr*)skb->tail;
2700 #endif
2701 	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2702 	if (rt->fl.iif) {
2703 #ifdef CONFIG_IP_MROUTE
2704 		u32 dst = rt->rt_dst;
2705 
2706 		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2707 		    ipv4_devconf.mc_forwarding) {
2708 			int err = ipmr_get_route(skb, r, nowait);
2709 			if (err <= 0) {
2710 				if (!nowait) {
2711 					if (err == 0)
2712 						return 0;
2713 					goto nlmsg_failure;
2714 				} else {
2715 					if (err == -EMSGSIZE)
2716 						goto nlmsg_failure;
2717 					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2718 				}
2719 			}
2720 		} else
2721 #endif
2722 			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2723 	}
2724 
2725 	nlh->nlmsg_len = skb->tail - b;
2726 	return skb->len;
2727 
2728 nlmsg_failure:
2729 rtattr_failure:
2730 	skb_trim(skb, b - skb->data);
2731 	return -1;
2732 }
2733 
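/*
 * inet_rtm_getroute() serves RTM_GETROUTE netlink requests (as sent,
 * for example, by "ip route get"): the route is resolved with
 * ip_route_input() when an input interface (RTA_IIF) is supplied, or
 * with ip_route_output_key() otherwise, and the result is reported
 * back to the requester via rt_fill_info() above.
 */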
2734 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2735 {
2736 	struct rtattr **rta = arg;
2737 	struct rtmsg *rtm = NLMSG_DATA(nlh);
2738 	struct rtable *rt = NULL;
2739 	u32 dst = 0;
2740 	u32 src = 0;
2741 	int iif = 0;
2742 	int err = -ENOBUFS;
2743 	struct sk_buff *skb;
2744 
2745 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2746 	if (!skb)
2747 		goto out;
2748 
2749 	/* Reserve room for dummy headers; this skb can pass
2750 	   through a good chunk of the routing engine.
2751 	 */
2752 	skb->mac.raw = skb->data;
2753 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2754 
2755 	if (rta[RTA_SRC - 1])
2756 		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2757 	if (rta[RTA_DST - 1])
2758 		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2759 	if (rta[RTA_IIF - 1])
2760 		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2761 
2762 	if (iif) {
2763 		struct net_device *dev = __dev_get_by_index(iif);
2764 		err = -ENODEV;
2765 		if (!dev)
2766 			goto out_free;
2767 		skb->protocol	= htons(ETH_P_IP);
2768 		skb->dev	= dev;
2769 		local_bh_disable();
2770 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2771 		local_bh_enable();
2772 		rt = (struct rtable*)skb->dst;
2773 		if (!err && rt->u.dst.error)
2774 			err = -rt->u.dst.error;
2775 	} else {
2776 		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2777 							 .saddr = src,
2778 							 .tos = rtm->rtm_tos } } };
2779 		int oif = 0;
2780 		if (rta[RTA_OIF - 1])
2781 			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2782 		fl.oif = oif;
2783 		err = ip_route_output_key(&rt, &fl);
2784 	}
2785 	if (err)
2786 		goto out_free;
2787 
2788 	skb->dst = &rt->u.dst;
2789 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2790 		rt->rt_flags |= RTCF_NOTIFY;
2791 
2792 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2793 
2794 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2795 				RTM_NEWROUTE, 0, 0);
2796 	if (!err)
2797 		goto out_free;
2798 	if (err < 0) {
2799 		err = -EMSGSIZE;
2800 		goto out_free;
2801 	}
2802 
2803 	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2804 	if (err > 0)
2805 		err = 0;
2806 out:	return err;
2807 
2808 out_free:
2809 	kfree_skb(skb);
2810 	goto out;
2811 }
2812 
2813 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2814 {
2815 	struct rtable *rt;
2816 	int h, s_h;
2817 	int idx, s_idx;
2818 
2819 	s_h = cb->args[0];
2820 	s_idx = idx = cb->args[1];
2821 	for (h = 0; h <= rt_hash_mask; h++) {
2822 		if (h < s_h) continue;
2823 		if (h > s_h)
2824 			s_idx = 0;
2825 		rcu_read_lock_bh();
2826 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2827 		     rt = rcu_dereference(rt->u.rt_next), idx++) {
2828 			if (idx < s_idx)
2829 				continue;
2830 			skb->dst = dst_clone(&rt->u.dst);
2831 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2832 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2833 					 1, NLM_F_MULTI) <= 0) {
2834 				dst_release(xchg(&skb->dst, NULL));
2835 				rcu_read_unlock_bh();
2836 				goto done;
2837 			}
2838 			dst_release(xchg(&skb->dst, NULL));
2839 		}
2840 		rcu_read_unlock_bh();
2841 	}
2842 
2843 done:
2844 	cb->args[0] = h;
2845 	cb->args[1] = idx;
2846 	return skb->len;
2847 }
2848 
2849 void ip_rt_multicast_event(struct in_device *in_dev)
2850 {
2851 	rt_cache_flush(0);
2852 }
2853 
2854 #ifdef CONFIG_SYSCTL
2855 static int flush_delay;
2856 
2857 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2858 					struct file *filp, void __user *buffer,
2859 					size_t *lenp, loff_t *ppos)
2860 {
2861 	if (write) {
2862 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2863 		rt_cache_flush(flush_delay);
2864 		return 0;
2865 	}
2866 
2867 	return -EINVAL;
2868 }
2869 
2870 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2871 						int __user *name,
2872 						int nlen,
2873 						void __user *oldval,
2874 						size_t __user *oldlenp,
2875 						void __user *newval,
2876 						size_t newlen,
2877 						void **context)
2878 {
2879 	int delay;
2880 	if (newlen != sizeof(int))
2881 		return -EINVAL;
2882 	if (get_user(delay, (int __user *)newval))
2883 		return -EFAULT;
2884 	rt_cache_flush(delay);
2885 	return 0;
2886 }
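
/*
 * Usage note: the "flush" entry below is write-only (mode 0200);
 * writing a delay value to /proc/sys/net/ipv4/route/flush invokes
 * ipv4_sysctl_rtcache_flush() above, which stores the value in
 * flush_delay and then calls rt_cache_flush() with it.
 */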
2887 
2888 ctl_table ipv4_route_table[] = {
2889 	{
2890 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2891 		.procname	= "flush",
2892 		.data		= &flush_delay,
2893 		.maxlen		= sizeof(int),
2894 		.mode		= 0200,
2895 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2896 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2897 	},
2898 	{
2899 		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
2900 		.procname	= "min_delay",
2901 		.data		= &ip_rt_min_delay,
2902 		.maxlen		= sizeof(int),
2903 		.mode		= 0644,
2904 		.proc_handler	= &proc_dointvec_jiffies,
2905 		.strategy	= &sysctl_jiffies,
2906 	},
2907 	{
2908 		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
2909 		.procname	= "max_delay",
2910 		.data		= &ip_rt_max_delay,
2911 		.maxlen		= sizeof(int),
2912 		.mode		= 0644,
2913 		.proc_handler	= &proc_dointvec_jiffies,
2914 		.strategy	= &sysctl_jiffies,
2915 	},
2916 	{
2917 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2918 		.procname	= "gc_thresh",
2919 		.data		= &ipv4_dst_ops.gc_thresh,
2920 		.maxlen		= sizeof(int),
2921 		.mode		= 0644,
2922 		.proc_handler	= &proc_dointvec,
2923 	},
2924 	{
2925 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2926 		.procname	= "max_size",
2927 		.data		= &ip_rt_max_size,
2928 		.maxlen		= sizeof(int),
2929 		.mode		= 0644,
2930 		.proc_handler	= &proc_dointvec,
2931 	},
2932 	{
2933 		/*  Deprecated. Use gc_min_interval_ms */
2934 
2935 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2936 		.procname	= "gc_min_interval",
2937 		.data		= &ip_rt_gc_min_interval,
2938 		.maxlen		= sizeof(int),
2939 		.mode		= 0644,
2940 		.proc_handler	= &proc_dointvec_jiffies,
2941 		.strategy	= &sysctl_jiffies,
2942 	},
2943 	{
2944 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2945 		.procname	= "gc_min_interval_ms",
2946 		.data		= &ip_rt_gc_min_interval,
2947 		.maxlen		= sizeof(int),
2948 		.mode		= 0644,
2949 		.proc_handler	= &proc_dointvec_ms_jiffies,
2950 		.strategy	= &sysctl_ms_jiffies,
2951 	},
2952 	{
2953 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2954 		.procname	= "gc_timeout",
2955 		.data		= &ip_rt_gc_timeout,
2956 		.maxlen		= sizeof(int),
2957 		.mode		= 0644,
2958 		.proc_handler	= &proc_dointvec_jiffies,
2959 		.strategy	= &sysctl_jiffies,
2960 	},
2961 	{
2962 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2963 		.procname	= "gc_interval",
2964 		.data		= &ip_rt_gc_interval,
2965 		.maxlen		= sizeof(int),
2966 		.mode		= 0644,
2967 		.proc_handler	= &proc_dointvec_jiffies,
2968 		.strategy	= &sysctl_jiffies,
2969 	},
2970 	{
2971 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2972 		.procname	= "redirect_load",
2973 		.data		= &ip_rt_redirect_load,
2974 		.maxlen		= sizeof(int),
2975 		.mode		= 0644,
2976 		.proc_handler	= &proc_dointvec,
2977 	},
2978 	{
2979 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2980 		.procname	= "redirect_number",
2981 		.data		= &ip_rt_redirect_number,
2982 		.maxlen		= sizeof(int),
2983 		.mode		= 0644,
2984 		.proc_handler	= &proc_dointvec,
2985 	},
2986 	{
2987 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2988 		.procname	= "redirect_silence",
2989 		.data		= &ip_rt_redirect_silence,
2990 		.maxlen		= sizeof(int),
2991 		.mode		= 0644,
2992 		.proc_handler	= &proc_dointvec,
2993 	},
2994 	{
2995 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
2996 		.procname	= "error_cost",
2997 		.data		= &ip_rt_error_cost,
2998 		.maxlen		= sizeof(int),
2999 		.mode		= 0644,
3000 		.proc_handler	= &proc_dointvec,
3001 	},
3002 	{
3003 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
3004 		.procname	= "error_burst",
3005 		.data		= &ip_rt_error_burst,
3006 		.maxlen		= sizeof(int),
3007 		.mode		= 0644,
3008 		.proc_handler	= &proc_dointvec,
3009 	},
3010 	{
3011 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
3012 		.procname	= "gc_elasticity",
3013 		.data		= &ip_rt_gc_elasticity,
3014 		.maxlen		= sizeof(int),
3015 		.mode		= 0644,
3016 		.proc_handler	= &proc_dointvec,
3017 	},
3018 	{
3019 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
3020 		.procname	= "mtu_expires",
3021 		.data		= &ip_rt_mtu_expires,
3022 		.maxlen		= sizeof(int),
3023 		.mode		= 0644,
3024 		.proc_handler	= &proc_dointvec_jiffies,
3025 		.strategy	= &sysctl_jiffies,
3026 	},
3027 	{
3028 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
3029 		.procname	= "min_pmtu",
3030 		.data		= &ip_rt_min_pmtu,
3031 		.maxlen		= sizeof(int),
3032 		.mode		= 0644,
3033 		.proc_handler	= &proc_dointvec,
3034 	},
3035 	{
3036 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
3037 		.procname	= "min_adv_mss",
3038 		.data		= &ip_rt_min_advmss,
3039 		.maxlen		= sizeof(int),
3040 		.mode		= 0644,
3041 		.proc_handler	= &proc_dointvec,
3042 	},
3043 	{
3044 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3045 		.procname	= "secret_interval",
3046 		.data		= &ip_rt_secret_interval,
3047 		.maxlen		= sizeof(int),
3048 		.mode		= 0644,
3049 		.proc_handler	= &proc_dointvec_jiffies,
3050 		.strategy	= &sysctl_jiffies,
3051 	},
3052 	{ .ctl_name = 0 }
3053 };
3054 #endif
3055 
3056 #ifdef CONFIG_NET_CLS_ROUTE
3057 struct ip_rt_acct *ip_rt_acct;
3058 
3059 /* This code sucks.  But you should have seen it before! --RR */
3060 
3061 /* IP route accounting ptr for this logical cpu number. */
3062 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3063 
3064 #ifdef CONFIG_PROC_FS
3065 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3066 			   int length, int *eof, void *data)
3067 {
3068 	unsigned int i;
3069 
3070 	if ((offset & 3) || (length & 3))
3071 		return -EIO;
3072 
3073 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
3074 		*eof = 1;
3075 		return 0;
3076 	}
3077 
3078 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3079 		length = sizeof(struct ip_rt_acct) * 256 - offset;
3080 		*eof = 1;
3081 	}
3082 
3083 	offset /= sizeof(u32);
3084 
3085 	if (length > 0) {
3086 		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3087 		u32 *dst = (u32 *) buffer;
3088 
3089 		/* Copy first cpu. */
3090 		*start = buffer;
3091 		memcpy(dst, src, length);
3092 
3093 		/* Add the other cpus in, one int at a time */
3094 		for_each_cpu(i) {
3095 			unsigned int j;
3096 
3097 			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3098 
3099 			for (j = 0; j < length/4; j++)
3100 				dst[j] += src[j];
3101 		}
3102 	}
3103 	return length;
3104 }
3105 #endif /* CONFIG_PROC_FS */
3106 #endif /* CONFIG_NET_CLS_ROUTE */
3107 
3108 static __initdata unsigned long rhash_entries;
3109 static int __init set_rhash_entries(char *str)
3110 {
3111 	if (!str)
3112 		return 0;
3113 	rhash_entries = simple_strtoul(str, &str, 0);
3114 	return 1;
3115 }
3116 __setup("rhash_entries=", set_rhash_entries);
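
/*
 * Usage note: "rhash_entries=N" on the kernel command line overrides
 * the automatic sizing of the IP route cache hash table; the value
 * parsed above is passed to alloc_large_system_hash() in ip_rt_init()
 * below.
 */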
3117 
3118 int __init ip_rt_init(void)
3119 {
3120 	int rc = 0;
3121 
3122 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3123 			     (jiffies ^ (jiffies >> 7)));
3124 
3125 #ifdef CONFIG_NET_CLS_ROUTE
3126 	{
3127 	int order;
3128 	for (order = 0;
3129 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3130 		/* NOTHING */;
3131 	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3132 	if (!ip_rt_acct)
3133 		panic("IP: failed to allocate ip_rt_acct\n");
3134 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
3135 	}
3136 #endif
3137 
3138 	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3139 						     sizeof(struct rtable),
3140 						     0, SLAB_HWCACHE_ALIGN,
3141 						     NULL, NULL);
3142 
3143 	if (!ipv4_dst_ops.kmem_cachep)
3144 		panic("IP: failed to allocate ip_dst_cache\n");
3145 
3146 	rt_hash_table = (struct rt_hash_bucket *)
3147 		alloc_large_system_hash("IP route cache",
3148 					sizeof(struct rt_hash_bucket),
3149 					rhash_entries,
3150 					(num_physpages >= 128 * 1024) ?
3151 						(27 - PAGE_SHIFT) :
3152 						(29 - PAGE_SHIFT),
3153 					HASH_HIGHMEM,
3154 					&rt_hash_log,
3155 					&rt_hash_mask,
3156 					0);
3157 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3158 	rt_hash_lock_init();
3159 
3160 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3161 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3162 
3163 	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3164 	if (!rt_cache_stat)
3165 		return -ENOMEM;
3166 
3167 	devinet_init();
3168 	ip_fib_init();
3169 
3170 	init_timer(&rt_flush_timer);
3171 	rt_flush_timer.function = rt_run_flush;
3172 	init_timer(&rt_periodic_timer);
3173 	rt_periodic_timer.function = rt_check_expire;
3174 	init_timer(&rt_secret_timer);
3175 	rt_secret_timer.function = rt_secret_rebuild;
3176 
3177 	/* All the timers started at system startup tend
3178 	   to synchronize. Perturb them a bit.
3179 	 */
3180 	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3181 					ip_rt_gc_interval;
3182 	add_timer(&rt_periodic_timer);
3183 
3184 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3185 		ip_rt_secret_interval;
3186 	add_timer(&rt_secret_timer);
3187 
3188 #ifdef CONFIG_PROC_FS
3189 	{
3190 	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3191 	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3192 	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3193 			    		     proc_net_stat))) {
3194 		free_percpu(rt_cache_stat);
3195 		return -ENOMEM;
3196 	}
3197 	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3198 	}
3199 #ifdef CONFIG_NET_CLS_ROUTE
3200 	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3201 #endif
3202 #endif
3203 #ifdef CONFIG_XFRM
3204 	xfrm_init();
3205 	xfrm4_init();
3206 #endif
3207 	return rc;
3208 }
3209 
3210 EXPORT_SYMBOL(__ip_select_ident);
3211 EXPORT_SYMBOL(ip_route_input);
3212 EXPORT_SYMBOL(ip_route_output_key);
3213