xref: /linux/net/ipv4/route.c (revision 20d0021394c1b070bf04b22c5bc8fdb437edd4c5)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *		Alan Cox	:	Verify area fixes.
18  *		Alan Cox	:	cli() protects routing changes
19  *		Rui Oliveira	:	ICMP routing table updates
20  *		(rco@di.uminho.pt)	Routing table insertion and update
21  *		Linus Torvalds	:	Rewrote bits to be sensible
22  *		Alan Cox	:	Added BSD route gw semantics
23  *		Alan Cox	:	Super /proc >4K
24  *		Alan Cox	:	MTU in route table
25  *		Alan Cox	: 	MSS actually. Also added the window
26  *					clamper.
27  *		Sam Lantinga	:	Fixed route matching in rt_del()
28  *		Alan Cox	:	Routing cache support.
29  *		Alan Cox	:	Removed compatibility cruft.
30  *		Alan Cox	:	RTF_REJECT support.
31  *		Alan Cox	:	TCP irtt support.
32  *		Jonathan Naylor	:	Added Metric support.
33  *	Miquel van Smoorenburg	:	BSD API fixes.
34  *	Miquel van Smoorenburg	:	Metrics.
35  *		Alan Cox	:	Use __u32 properly
36  *		Alan Cox	:	Aligned routing errors more closely with BSD
37  *					our system is still very different.
38  *		Alan Cox	:	Faster /proc handling
39  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40  *					routing caches and better behaviour.
41  *
42  *		Olaf Erb	:	irtt wasn't being copied right.
43  *		Bjorn Ekwall	:	Kerneld route support.
44  *		Alan Cox	:	Multicast fixed (I hope)
45  * 		Pavel Krauz	:	Limited broadcast fixed
46  *		Mike McLagan	:	Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
48  *					route.c and rewritten from scratch.
49  *		Andi Kleen	:	Load-limit warning messages.
50  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54  *		Marc Boucher	:	routing by fwmark
55  *	Robert Olsson		:	Added rt_cache statistics
56  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
57  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/config.h>
66 #include <linux/module.h>
67 #include <asm/uaccess.h>
68 #include <asm/system.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/sched.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/skbuff.h>
85 #include <linux/rtnetlink.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/ip_mp_alg.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
109 
110 #define RT_FL_TOS(oldflp) \
111     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
112 
113 #define IP_MAX_MTU	0xFFF0
114 
115 #define RT_GC_TIMEOUT (300*HZ)
116 
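/* Route cache tunables; most of these defaults are adjustable at run time
 * through the net.ipv4.route sysctls. */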
117 static int ip_rt_min_delay		= 2 * HZ;
118 static int ip_rt_max_delay		= 10 * HZ;
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval		= 60 * HZ;
122 static int ip_rt_gc_min_interval	= HZ / 2;
123 static int ip_rt_redirect_number	= 9;
124 static int ip_rt_redirect_load		= HZ / 50;
125 static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost		= HZ;
127 static int ip_rt_error_burst		= 5 * HZ;
128 static int ip_rt_gc_elasticity		= 8;
129 static int ip_rt_mtu_expires		= 10 * 60 * HZ;
130 static int ip_rt_min_pmtu		= 512 + 20 + 20;
131 static int ip_rt_min_advmss		= 256;
132 static int ip_rt_secret_interval	= 10 * 60 * HZ;
133 static unsigned long rt_deadline;
134 
135 #define RTprint(a...)	printk(KERN_DEBUG a)
136 
137 static struct timer_list rt_flush_timer;
138 static struct timer_list rt_periodic_timer;
139 static struct timer_list rt_secret_timer;
140 
141 /*
142  *	Interface to generic destination cache.
143  */
144 
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void		 ipv4_dst_destroy(struct dst_entry *dst);
147 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
148 					 struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void		 ipv4_link_failure(struct sk_buff *skb);
151 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(void);
153 
154 
155 static struct dst_ops ipv4_dst_ops = {
156 	.family =		AF_INET,
157 	.protocol =		__constant_htons(ETH_P_IP),
158 	.gc =			rt_garbage_collect,
159 	.check =		ipv4_dst_check,
160 	.destroy =		ipv4_dst_destroy,
161 	.ifdown =		ipv4_dst_ifdown,
162 	.negative_advice =	ipv4_negative_advice,
163 	.link_failure =		ipv4_link_failure,
164 	.update_pmtu =		ip_rt_update_pmtu,
165 	.entry_size =		sizeof(struct rtable),
166 };
167 
168 #define ECN_OR_COST(class)	TC_PRIO_##class
169 
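/*
 * Default mapping from the IPv4 TOS bits to a packet scheduler priority
 * band (TC_PRIO_*).  With this ECN_OR_COST() definition each "cost"
 * variant simply reuses the priority of its base class.
 */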
170 __u8 ip_tos2prio[16] = {
171 	TC_PRIO_BESTEFFORT,
172 	ECN_OR_COST(FILLER),
173 	TC_PRIO_BESTEFFORT,
174 	ECN_OR_COST(BESTEFFORT),
175 	TC_PRIO_BULK,
176 	ECN_OR_COST(BULK),
177 	TC_PRIO_BULK,
178 	ECN_OR_COST(BULK),
179 	TC_PRIO_INTERACTIVE,
180 	ECN_OR_COST(INTERACTIVE),
181 	TC_PRIO_INTERACTIVE,
182 	ECN_OR_COST(INTERACTIVE),
183 	TC_PRIO_INTERACTIVE_BULK,
184 	ECN_OR_COST(INTERACTIVE_BULK),
185 	TC_PRIO_INTERACTIVE_BULK,
186 	ECN_OR_COST(INTERACTIVE_BULK)
187 };
188 
189 
190 /*
191  * Route cache.
192  */
193 
194 /* The locking scheme is rather straightforward:
195  *
196  * 1) Read-Copy Update protects the buckets of the central route hash.
197  * 2) Only writers remove entries, and they hold the lock
198  *    as they look at rtable reference counts.
199  * 3) Only readers acquire references to rtable entries,
200  *    they do so with atomic increments and with the
201  *    lock held.
202  */
203 
204 struct rt_hash_bucket {
205 	struct rtable	*chain;
206 };
207 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
208 /*
209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210  * The size of this table is a power of two and depends on the number of CPUs.
211  */
212 #if NR_CPUS >= 32
213 #define RT_HASH_LOCK_SZ	4096
214 #elif NR_CPUS >= 16
215 #define RT_HASH_LOCK_SZ	2048
216 #elif NR_CPUS >= 8
217 #define RT_HASH_LOCK_SZ	1024
218 #elif NR_CPUS >= 4
219 #define RT_HASH_LOCK_SZ	512
220 #else
221 #define RT_HASH_LOCK_SZ	256
222 #endif
223 
224 static spinlock_t	*rt_hash_locks;
225 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226 # define rt_hash_lock_init()	{ \
227 		int i; \
228 		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 			spin_lock_init(&rt_hash_locks[i]); \
232 		}
233 #else
234 # define rt_hash_lock_addr(slot) NULL
235 # define rt_hash_lock_init()
236 #endif
237 
238 static struct rt_hash_bucket 	*rt_hash_table;
239 static unsigned			rt_hash_mask;
240 static int			rt_hash_log;
241 static unsigned int		rt_hash_rnd;
242 
243 struct rt_cache_stat *rt_cache_stat;
244 
245 static int rt_intern_hash(unsigned hash, struct rtable *rth,
246 				struct rtable **res);
247 
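/*
 * Hash a (daddr, saddr, tos) triple into a bucket index.  rt_hash_rnd is a
 * random secret that is regenerated on every cache flush (see
 * rt_run_flush()), which keeps the chain distribution unpredictable.
 */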
248 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
249 {
250 	return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
251 		& rt_hash_mask);
252 }
253 
254 #ifdef CONFIG_PROC_FS
255 struct rt_cache_iter_state {
256 	int bucket;
257 };
258 
259 static struct rtable *rt_cache_get_first(struct seq_file *seq)
260 {
261 	struct rtable *r = NULL;
262 	struct rt_cache_iter_state *st = seq->private;
263 
264 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
265 		rcu_read_lock_bh();
266 		r = rt_hash_table[st->bucket].chain;
267 		if (r)
268 			break;
269 		rcu_read_unlock_bh();
270 	}
271 	return r;
272 }
273 
274 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
275 {
276 	struct rt_cache_iter_state *st = rcu_dereference(seq->private);
277 
278 	r = r->u.rt_next;
279 	while (!r) {
280 		rcu_read_unlock_bh();
281 		if (--st->bucket < 0)
282 			break;
283 		rcu_read_lock_bh();
284 		r = rt_hash_table[st->bucket].chain;
285 	}
286 	return r;
287 }
288 
289 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
290 {
291 	struct rtable *r = rt_cache_get_first(seq);
292 
293 	if (r)
294 		while (pos && (r = rt_cache_get_next(seq, r)))
295 			--pos;
296 	return pos ? NULL : r;
297 }
298 
299 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
300 {
301 	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
302 }
303 
304 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
305 {
306 	struct rtable *r = NULL;
307 
308 	if (v == SEQ_START_TOKEN)
309 		r = rt_cache_get_first(seq);
310 	else
311 		r = rt_cache_get_next(seq, v);
312 	++*pos;
313 	return r;
314 }
315 
316 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
317 {
318 	if (v && v != SEQ_START_TOKEN)
319 		rcu_read_unlock_bh();
320 }
321 
322 static int rt_cache_seq_show(struct seq_file *seq, void *v)
323 {
324 	if (v == SEQ_START_TOKEN)
325 		seq_printf(seq, "%-127s\n",
326 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
327 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
328 			   "HHUptod\tSpecDst");
329 	else {
330 		struct rtable *r = v;
331 		char temp[256];
332 
333 		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
334 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
335 			r->u.dst.dev ? r->u.dst.dev->name : "*",
336 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
337 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
338 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
339 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
340 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
341 			dst_metric(&r->u.dst, RTAX_WINDOW),
342 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
343 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
344 			r->fl.fl4_tos,
345 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
346 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
347 				       dev_queue_xmit) : 0,
348 			r->rt_spec_dst);
349 		seq_printf(seq, "%-127s\n", temp);
350         }
351   	return 0;
352 }
353 
354 static struct seq_operations rt_cache_seq_ops = {
355 	.start  = rt_cache_seq_start,
356 	.next   = rt_cache_seq_next,
357 	.stop   = rt_cache_seq_stop,
358 	.show   = rt_cache_seq_show,
359 };
360 
361 static int rt_cache_seq_open(struct inode *inode, struct file *file)
362 {
363 	struct seq_file *seq;
364 	int rc = -ENOMEM;
365 	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
366 
367 	if (!s)
368 		goto out;
369 	rc = seq_open(file, &rt_cache_seq_ops);
370 	if (rc)
371 		goto out_kfree;
372 	seq          = file->private_data;
373 	seq->private = s;
374 	memset(s, 0, sizeof(*s));
375 out:
376 	return rc;
377 out_kfree:
378 	kfree(s);
379 	goto out;
380 }
381 
382 static struct file_operations rt_cache_seq_fops = {
383 	.owner	 = THIS_MODULE,
384 	.open	 = rt_cache_seq_open,
385 	.read	 = seq_read,
386 	.llseek	 = seq_lseek,
387 	.release = seq_release_private,
388 };
389 
390 
391 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
392 {
393 	int cpu;
394 
395 	if (*pos == 0)
396 		return SEQ_START_TOKEN;
397 
398 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
399 		if (!cpu_possible(cpu))
400 			continue;
401 		*pos = cpu+1;
402 		return per_cpu_ptr(rt_cache_stat, cpu);
403 	}
404 	return NULL;
405 }
406 
407 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
408 {
409 	int cpu;
410 
411 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
412 		if (!cpu_possible(cpu))
413 			continue;
414 		*pos = cpu+1;
415 		return per_cpu_ptr(rt_cache_stat, cpu);
416 	}
417 	return NULL;
418 
419 }
420 
421 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
422 {
423 
424 }
425 
426 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
427 {
428 	struct rt_cache_stat *st = v;
429 
430 	if (v == SEQ_START_TOKEN) {
431 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
432 		return 0;
433 	}
434 
435 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
436 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
437 		   atomic_read(&ipv4_dst_ops.entries),
438 		   st->in_hit,
439 		   st->in_slow_tot,
440 		   st->in_slow_mc,
441 		   st->in_no_route,
442 		   st->in_brd,
443 		   st->in_martian_dst,
444 		   st->in_martian_src,
445 
446 		   st->out_hit,
447 		   st->out_slow_tot,
448 		   st->out_slow_mc,
449 
450 		   st->gc_total,
451 		   st->gc_ignored,
452 		   st->gc_goal_miss,
453 		   st->gc_dst_overflow,
454 		   st->in_hlist_search,
455 		   st->out_hlist_search
456 		);
457 	return 0;
458 }
459 
460 static struct seq_operations rt_cpu_seq_ops = {
461 	.start  = rt_cpu_seq_start,
462 	.next   = rt_cpu_seq_next,
463 	.stop   = rt_cpu_seq_stop,
464 	.show   = rt_cpu_seq_show,
465 };
466 
467 
468 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
469 {
470 	return seq_open(file, &rt_cpu_seq_ops);
471 }
472 
473 static struct file_operations rt_cpu_seq_fops = {
474 	.owner	 = THIS_MODULE,
475 	.open	 = rt_cpu_seq_open,
476 	.read	 = seq_read,
477 	.llseek	 = seq_lseek,
478 	.release = seq_release,
479 };
480 
481 #endif /* CONFIG_PROC_FS */
482 
483 static __inline__ void rt_free(struct rtable *rt)
484 {
485 	multipath_remove(rt);
486 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
487 }
488 
489 static __inline__ void rt_drop(struct rtable *rt)
490 {
491 	multipath_remove(rt);
492 	ip_rt_put(rt);
493 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
494 }
495 
496 static __inline__ int rt_fast_clean(struct rtable *rth)
497 {
498 	/* Kill broadcast/multicast entries very aggressively if they
499 	   collide in the hash table with more useful entries. */
500 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
501 		rth->fl.iif && rth->u.rt_next;
502 }
503 
504 static __inline__ int rt_valuable(struct rtable *rth)
505 {
506 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
507 		rth->u.dst.expires;
508 }
509 
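/*
 * Decide whether a cached route may be expired.  Entries that are still
 * referenced are never removed, and entries whose hard expiry time has
 * passed always are.  Otherwise an entry survives only while it is young
 * enough: plain entries get the tmo1 grace period, "valuable" entries
 * (rt_valuable()) get tmo2, and broadcast/multicast clutter that collides
 * in the hash chain (rt_fast_clean()) gets no tmo1 grace at all.
 */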
510 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
511 {
512 	unsigned long age;
513 	int ret = 0;
514 
515 	if (atomic_read(&rth->u.dst.__refcnt))
516 		goto out;
517 
518 	ret = 1;
519 	if (rth->u.dst.expires &&
520 	    time_after_eq(jiffies, rth->u.dst.expires))
521 		goto out;
522 
523 	age = jiffies - rth->u.dst.lastuse;
524 	ret = 0;
525 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
526 	    (age <= tmo2 && rt_valuable(rth)))
527 		goto out;
528 	ret = 1;
529 out:	return ret;
530 }
531 
532 /* Bits of score are:
533  * 31: very valuable
534  * 30: not quite useless
535  * 29..0: usage counter
536  */
537 static inline u32 rt_score(struct rtable *rt)
538 {
539 	u32 score = jiffies - rt->u.dst.lastuse;
540 
541 	score = ~score & ~(3<<30);
542 
543 	if (rt_valuable(rt))
544 		score |= (1<<31);
545 
546 	if (!rt->fl.iif ||
547 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
548 		score |= (1<<30);
549 
550 	return score;
551 }
552 
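/*
 * Two cache keys match when the whole IPv4 part of the flow (compared as a
 * single memcmp over nl_u.ip4_u) and the input/output interface indices are
 * all equal.
 */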
553 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
554 {
555 	return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
556 	       fl1->oif     == fl2->oif &&
557 	       fl1->iif     == fl2->iif;
558 }
559 
560 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
561 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
562 						struct rtable *expentry,
563 						int *removed_count)
564 {
565 	int passedexpired = 0;
566 	struct rtable **nextstep = NULL;
567 	struct rtable **rthp = chain_head;
568 	struct rtable *rth;
569 
570 	if (removed_count)
571 		*removed_count = 0;
572 
573 	while ((rth = *rthp) != NULL) {
574 		if (rth == expentry)
575 			passedexpired = 1;
576 
577 		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
578 		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
579 			if (*rthp == expentry) {
580 				*rthp = rth->u.rt_next;
581 				continue;
582 			} else {
583 				*rthp = rth->u.rt_next;
584 				rt_free(rth);
585 				if (removed_count)
586 					++(*removed_count);
587 			}
588 		} else {
589 			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
590 			    passedexpired && !nextstep)
591 				nextstep = &rth->u.rt_next;
592 
593 			rthp = &rth->u.rt_next;
594 		}
595 	}
596 
597 	rt_free(expentry);
598 	if (removed_count)
599 		++(*removed_count);
600 
601 	return nextstep;
602 }
603 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
604 
605 
606 /* This runs via a timer and thus is always in BH context. */
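/*
 * Each run scans roughly (ip_rt_gc_interval / ip_rt_gc_timeout) of the hash
 * buckets, so a complete sweep of the table takes about ip_rt_gc_timeout
 * jiffies; "rover" remembers where the previous run stopped.
 */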
607 static void rt_check_expire(unsigned long dummy)
608 {
609 	static unsigned int rover;
610 	unsigned int i = rover, goal;
611 	struct rtable *rth, **rthp;
612 	unsigned long now = jiffies;
613 	u64 mult;
614 
615 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
616 	if (ip_rt_gc_timeout > 1)
617 		do_div(mult, ip_rt_gc_timeout);
618 	goal = (unsigned int)mult;
619 	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
620 	for (; goal > 0; goal--) {
621 		unsigned long tmo = ip_rt_gc_timeout;
622 
623 		i = (i + 1) & rt_hash_mask;
624 		rthp = &rt_hash_table[i].chain;
625 
626 		if (*rthp == 0)
627 			continue;
628 		spin_lock(rt_hash_lock_addr(i));
629 		while ((rth = *rthp) != NULL) {
630 			if (rth->u.dst.expires) {
631 				/* Entry is expired even if it is in use */
632 				if (time_before_eq(now, rth->u.dst.expires)) {
633 					tmo >>= 1;
634 					rthp = &rth->u.rt_next;
635 					continue;
636 				}
637 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
638 				tmo >>= 1;
639 				rthp = &rth->u.rt_next;
640 				continue;
641 			}
642 
643 			/* Clean up aged-off entries. */
644 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
645 			/* remove all related balanced entries if necessary */
646 			if (rth->u.dst.flags & DST_BALANCED) {
647 				rthp = rt_remove_balanced_route(
648 					&rt_hash_table[i].chain,
649 					rth, NULL);
650 				if (!rthp)
651 					break;
652 			} else {
653 				*rthp = rth->u.rt_next;
654 				rt_free(rth);
655 			}
656 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
657  			*rthp = rth->u.rt_next;
658  			rt_free(rth);
659 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
660 		}
661 		spin_unlock(rt_hash_lock_addr(i));
662 
663 		/* Fallback loop breaker. */
664 		if (time_after(jiffies, now))
665 			break;
666 	}
667 	rover = i;
668 	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
669 }
670 
671 /* This can run from both BH and non-BH contexts, the latter
672  * in the case of a forced flush event.
673  */
674 static void rt_run_flush(unsigned long dummy)
675 {
676 	int i;
677 	struct rtable *rth, *next;
678 
679 	rt_deadline = 0;
680 
681 	get_random_bytes(&rt_hash_rnd, 4);
682 
683 	for (i = rt_hash_mask; i >= 0; i--) {
684 		spin_lock_bh(rt_hash_lock_addr(i));
685 		rth = rt_hash_table[i].chain;
686 		if (rth)
687 			rt_hash_table[i].chain = NULL;
688 		spin_unlock_bh(rt_hash_lock_addr(i));
689 
690 		for (; rth; rth = next) {
691 			next = rth->u.rt_next;
692 			rt_free(rth);
693 		}
694 	}
695 }
696 
697 static DEFINE_SPINLOCK(rt_flush_lock);
698 
699 void rt_cache_flush(int delay)
700 {
701 	unsigned long now = jiffies;
702 	int user_mode = !in_softirq();
703 
704 	if (delay < 0)
705 		delay = ip_rt_min_delay;
706 
707 	/* flush existing multipath state */
708 	multipath_flush();
709 
710 	spin_lock_bh(&rt_flush_lock);
711 
712 	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
713 		long tmo = (long)(rt_deadline - now);
714 
715 		/* If the flush timer is already running
716 		   and the flush request is not immediate (delay > 0):
717 
718 		   if the deadline has not been reached, prolong the timer to "delay",
719 		   otherwise fire it at the deadline time.
720 		 */
721 
722 		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
723 			tmo = 0;
724 
725 		if (delay > tmo)
726 			delay = tmo;
727 	}
728 
729 	if (delay <= 0) {
730 		spin_unlock_bh(&rt_flush_lock);
731 		rt_run_flush(0);
732 		return;
733 	}
734 
735 	if (rt_deadline == 0)
736 		rt_deadline = now + ip_rt_max_delay;
737 
738 	mod_timer(&rt_flush_timer, now+delay);
739 	spin_unlock_bh(&rt_flush_lock);
740 }
741 
742 static void rt_secret_rebuild(unsigned long dummy)
743 {
744 	unsigned long now = jiffies;
745 
746 	rt_cache_flush(0);
747 	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
748 }
749 
750 /*
751    Short description of GC goals.
752 
753    We want to build an algorithm which keeps the routing cache
754    at some equilibrium point, where the number of aged-off entries
755    is approximately equal to the number of newly generated ones.
756 
757    The current expiration strength is the variable "expire".
758    We try to adjust it dynamically, so that when networking is idle
759    "expire" is large enough to keep plenty of warm entries, and when
760    the load increases it is reduced to limit the cache size.
761  */
762 
763 static int rt_garbage_collect(void)
764 {
765 	static unsigned long expire = RT_GC_TIMEOUT;
766 	static unsigned long last_gc;
767 	static int rover;
768 	static int equilibrium;
769 	struct rtable *rth, **rthp;
770 	unsigned long now = jiffies;
771 	int goal;
772 
773 	/*
774 	 * Garbage collection is pretty expensive,
775 	 * do not make it too frequently.
776 	 */
777 
778 	RT_CACHE_STAT_INC(gc_total);
779 
780 	if (now - last_gc < ip_rt_gc_min_interval &&
781 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
782 		RT_CACHE_STAT_INC(gc_ignored);
783 		goto out;
784 	}
785 
786 	/* Calculate the number of entries we want to expire now. */
787 	goal = atomic_read(&ipv4_dst_ops.entries) -
788 		(ip_rt_gc_elasticity << rt_hash_log);
789 	if (goal <= 0) {
790 		if (equilibrium < ipv4_dst_ops.gc_thresh)
791 			equilibrium = ipv4_dst_ops.gc_thresh;
792 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
793 		if (goal > 0) {
794 			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
795 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
796 		}
797 	} else {
798 		/* We are in a dangerous area. Try to reduce the cache really
799 		 * aggressively.
800 		 */
801 		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
802 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
803 	}
804 
805 	if (now - last_gc >= ip_rt_gc_min_interval)
806 		last_gc = now;
807 
808 	if (goal <= 0) {
809 		equilibrium += goal;
810 		goto work_done;
811 	}
812 
813 	do {
814 		int i, k;
815 
816 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
817 			unsigned long tmo = expire;
818 
819 			k = (k + 1) & rt_hash_mask;
820 			rthp = &rt_hash_table[k].chain;
821 			spin_lock_bh(rt_hash_lock_addr(k));
822 			while ((rth = *rthp) != NULL) {
823 				if (!rt_may_expire(rth, tmo, expire)) {
824 					tmo >>= 1;
825 					rthp = &rth->u.rt_next;
826 					continue;
827 				}
828 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
829 				/* remove all related balanced entries
830 				 * if necessary
831 				 */
832 				if (rth->u.dst.flags & DST_BALANCED) {
833 					int r;
834 
835 					rthp = rt_remove_balanced_route(
836 						&rt_hash_table[i].chain,
837 						rth,
838 						&r);
839 					goal -= r;
840 					if (!rthp)
841 						break;
842 				} else {
843 					*rthp = rth->u.rt_next;
844 					rt_free(rth);
845 					goal--;
846 				}
847 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
848 				*rthp = rth->u.rt_next;
849 				rt_free(rth);
850 				goal--;
851 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
852 			}
853 			spin_unlock_bh(rt_hash_lock_addr(k));
854 			if (goal <= 0)
855 				break;
856 		}
857 		rover = k;
858 
859 		if (goal <= 0)
860 			goto work_done;
861 
862 		/* The goal was not achieved. We stop the process if:
863 
864 		   - expire was reduced to zero (otherwise, expire is halved).
865 		   - the table is not full.
866 		   - we are called from interrupt context.
867 		   - the jiffies check is just a fallback/debug loop breaker;
868 		     we will not spin here for a long time in any case.
869 		 */
870 
871 		RT_CACHE_STAT_INC(gc_goal_miss);
872 
873 		if (expire == 0)
874 			break;
875 
876 		expire >>= 1;
877 #if RT_CACHE_DEBUG >= 2
878 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
879 				atomic_read(&ipv4_dst_ops.entries), goal, i);
880 #endif
881 
882 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
883 			goto out;
884 	} while (!in_softirq() && time_before_eq(jiffies, now));
885 
886 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
887 		goto out;
888 	if (net_ratelimit())
889 		printk(KERN_WARNING "dst cache overflow\n");
890 	RT_CACHE_STAT_INC(gc_dst_overflow);
891 	return 1;
892 
893 work_done:
894 	expire += ip_rt_gc_min_interval;
895 	if (expire > ip_rt_gc_timeout ||
896 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
897 		expire = ip_rt_gc_timeout;
898 #if RT_CACHE_DEBUG >= 2
899 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
900 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
901 #endif
902 out:	return 0;
903 }
904 
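/*
 * Insert a route into the hash table.  If an entry with identical keys
 * already exists it is moved to the head of its chain and returned instead.
 * Otherwise the lowest-scoring unreferenced entry is tracked and freed when
 * the chain exceeds ip_rt_gc_elasticity, ARP binding is attempted for output
 * and unicast-forward routes (retrying after an aggressive garbage collection
 * if the neighbour table is full), and the new entry is linked at the head.
 */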
905 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
906 {
907 	struct rtable	*rth, **rthp;
908 	unsigned long	now;
909 	struct rtable *cand, **candp;
910 	u32 		min_score;
911 	int		chain_length;
912 	int attempts = !in_softirq();
913 
914 restart:
915 	chain_length = 0;
916 	min_score = ~(u32)0;
917 	cand = NULL;
918 	candp = NULL;
919 	now = jiffies;
920 
921 	rthp = &rt_hash_table[hash].chain;
922 
923 	spin_lock_bh(rt_hash_lock_addr(hash));
924 	while ((rth = *rthp) != NULL) {
925 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
926 		if (!(rth->u.dst.flags & DST_BALANCED) &&
927 		    compare_keys(&rth->fl, &rt->fl)) {
928 #else
929 		if (compare_keys(&rth->fl, &rt->fl)) {
930 #endif
931 			/* Put it first */
932 			*rthp = rth->u.rt_next;
933 			/*
934 			 * Since lookup is lockfree, the deletion
935 			 * must be visible to another weakly ordered CPU before
936 			 * the insertion at the start of the hash chain.
937 			 */
938 			rcu_assign_pointer(rth->u.rt_next,
939 					   rt_hash_table[hash].chain);
940 			/*
941 			 * Since lookup is lockfree, the update writes
942 			 * must be ordered for consistency on SMP.
943 			 */
944 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
945 
946 			rth->u.dst.__use++;
947 			dst_hold(&rth->u.dst);
948 			rth->u.dst.lastuse = now;
949 			spin_unlock_bh(rt_hash_lock_addr(hash));
950 
951 			rt_drop(rt);
952 			*rp = rth;
953 			return 0;
954 		}
955 
956 		if (!atomic_read(&rth->u.dst.__refcnt)) {
957 			u32 score = rt_score(rth);
958 
959 			if (score <= min_score) {
960 				cand = rth;
961 				candp = rthp;
962 				min_score = score;
963 			}
964 		}
965 
966 		chain_length++;
967 
968 		rthp = &rth->u.rt_next;
969 	}
970 
971 	if (cand) {
972 		/* ip_rt_gc_elasticity used to be the average chain length;
973 		 * when it is exceeded, GC becomes really aggressive.
974 		 *
975 		 * The second limit is less certain. At the moment it allows
976 		 * only 2 entries per bucket. We will see.
977 		 */
978 		if (chain_length > ip_rt_gc_elasticity) {
979 			*candp = cand->u.rt_next;
980 			rt_free(cand);
981 		}
982 	}
983 
984 	/* Try to bind the route to ARP only if it is an output
985 	   route or a unicast forwarding path.
986 	 */
987 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
988 		int err = arp_bind_neighbour(&rt->u.dst);
989 		if (err) {
990 			spin_unlock_bh(rt_hash_lock_addr(hash));
991 
992 			if (err != -ENOBUFS) {
993 				rt_drop(rt);
994 				return err;
995 			}
996 
997 			   can be released. Try to shrink the route cache;
998 			   it most likely holds some neighbour records.
999 			   it is most likely it holds some neighbour records.
1000 			 */
1001 			if (attempts-- > 0) {
1002 				int saved_elasticity = ip_rt_gc_elasticity;
1003 				int saved_int = ip_rt_gc_min_interval;
1004 				ip_rt_gc_elasticity	= 1;
1005 				ip_rt_gc_min_interval	= 0;
1006 				rt_garbage_collect();
1007 				ip_rt_gc_min_interval	= saved_int;
1008 				ip_rt_gc_elasticity	= saved_elasticity;
1009 				goto restart;
1010 			}
1011 
1012 			if (net_ratelimit())
1013 				printk(KERN_WARNING "Neighbour table overflow.\n");
1014 			rt_drop(rt);
1015 			return -ENOBUFS;
1016 		}
1017 	}
1018 
1019 	rt->u.rt_next = rt_hash_table[hash].chain;
1020 #if RT_CACHE_DEBUG >= 2
1021 	if (rt->u.rt_next) {
1022 		struct rtable *trt;
1023 		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1024 		       NIPQUAD(rt->rt_dst));
1025 		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1026 			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1027 		printk("\n");
1028 	}
1029 #endif
1030 	rt_hash_table[hash].chain = rt;
1031 	spin_unlock_bh(rt_hash_lock_addr(hash));
1032 	*rp = rt;
1033 	return 0;
1034 }
1035 
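/*
 * Attach the long-lived inet_peer entry for rt_dst to the route.  The
 * spinlock only serialises two CPUs trying to install a peer at the same
 * time; the loser's duplicate is dropped again.
 */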
1036 void rt_bind_peer(struct rtable *rt, int create)
1037 {
1038 	static DEFINE_SPINLOCK(rt_peer_lock);
1039 	struct inet_peer *peer;
1040 
1041 	peer = inet_getpeer(rt->rt_dst, create);
1042 
1043 	spin_lock_bh(&rt_peer_lock);
1044 	if (rt->peer == NULL) {
1045 		rt->peer = peer;
1046 		peer = NULL;
1047 	}
1048 	spin_unlock_bh(&rt_peer_lock);
1049 	if (peer)
1050 		inet_putpeer(peer);
1051 }
1052 
1053 /*
1054  * Peer allocation may fail only in serious out-of-memory conditions.  However
1055  * we can still generate some output.
1056  * Random ID selection looks a bit dangerous because we have no chance of
1057  * selecting an ID that is unique over a reasonable period of time.
1058  * But a broken packet identifier may be better than no packet at all.
1059  */
1060 static void ip_select_fb_ident(struct iphdr *iph)
1061 {
1062 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1063 	static u32 ip_fallback_id;
1064 	u32 salt;
1065 
1066 	spin_lock_bh(&ip_fb_id_lock);
1067 	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1068 	iph->id = htons(salt & 0xFFFF);
1069 	ip_fallback_id = salt;
1070 	spin_unlock_bh(&ip_fb_id_lock);
1071 }
1072 
1073 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1074 {
1075 	struct rtable *rt = (struct rtable *) dst;
1076 
1077 	if (rt) {
1078 		if (rt->peer == NULL)
1079 			rt_bind_peer(rt, 1);
1080 
1081 		/* If a peer is attached to the destination, it is never detached,
1082 		   so we need not grab a lock to dereference it.
1083 		 */
1084 		if (rt->peer) {
1085 			iph->id = htons(inet_getid(rt->peer, more));
1086 			return;
1087 		}
1088 	} else
1089 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1090 		       __builtin_return_address(0));
1091 
1092 	ip_select_fb_ident(iph);
1093 }
1094 
1095 static void rt_del(unsigned hash, struct rtable *rt)
1096 {
1097 	struct rtable **rthp;
1098 
1099 	spin_lock_bh(rt_hash_lock_addr(hash));
1100 	ip_rt_put(rt);
1101 	for (rthp = &rt_hash_table[hash].chain; *rthp;
1102 	     rthp = &(*rthp)->u.rt_next)
1103 		if (*rthp == rt) {
1104 			*rthp = rt->u.rt_next;
1105 			rt_free(rt);
1106 			break;
1107 		}
1108 	spin_unlock_bh(rt_hash_lock_addr(hash));
1109 }
1110 
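/*
 * Process an ICMP redirect: after sanity-checking the advertised gateway,
 * each matching cache entry is replaced by a copy marked RTCF_REDIRECTED
 * that points at the new gateway, provided a valid neighbour entry can be
 * bound to it.
 */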
1111 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1112 		    u32 saddr, u8 tos, struct net_device *dev)
1113 {
1114 	int i, k;
1115 	struct in_device *in_dev = in_dev_get(dev);
1116 	struct rtable *rth, **rthp;
1117 	u32  skeys[2] = { saddr, 0 };
1118 	int  ikeys[2] = { dev->ifindex, 0 };
1119 
1120 	tos &= IPTOS_RT_MASK;
1121 
1122 	if (!in_dev)
1123 		return;
1124 
1125 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1126 	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1127 		goto reject_redirect;
1128 
1129 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1130 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1131 			goto reject_redirect;
1132 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1133 			goto reject_redirect;
1134 	} else {
1135 		if (inet_addr_type(new_gw) != RTN_UNICAST)
1136 			goto reject_redirect;
1137 	}
1138 
1139 	for (i = 0; i < 2; i++) {
1140 		for (k = 0; k < 2; k++) {
1141 			unsigned hash = rt_hash_code(daddr,
1142 						     skeys[i] ^ (ikeys[k] << 5),
1143 						     tos);
1144 
1145 			rthp=&rt_hash_table[hash].chain;
1146 
1147 			rcu_read_lock();
1148 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1149 				struct rtable *rt;
1150 
1151 				if (rth->fl.fl4_dst != daddr ||
1152 				    rth->fl.fl4_src != skeys[i] ||
1153 				    rth->fl.fl4_tos != tos ||
1154 				    rth->fl.oif != ikeys[k] ||
1155 				    rth->fl.iif != 0) {
1156 					rthp = &rth->u.rt_next;
1157 					continue;
1158 				}
1159 
1160 				if (rth->rt_dst != daddr ||
1161 				    rth->rt_src != saddr ||
1162 				    rth->u.dst.error ||
1163 				    rth->rt_gateway != old_gw ||
1164 				    rth->u.dst.dev != dev)
1165 					break;
1166 
1167 				dst_hold(&rth->u.dst);
1168 				rcu_read_unlock();
1169 
1170 				rt = dst_alloc(&ipv4_dst_ops);
1171 				if (rt == NULL) {
1172 					ip_rt_put(rth);
1173 					in_dev_put(in_dev);
1174 					return;
1175 				}
1176 
1177 				/* Copy all the information. */
1178 				*rt = *rth;
1179  				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1180 				rt->u.dst.__use		= 1;
1181 				atomic_set(&rt->u.dst.__refcnt, 1);
1182 				rt->u.dst.child		= NULL;
1183 				if (rt->u.dst.dev)
1184 					dev_hold(rt->u.dst.dev);
1185 				if (rt->idev)
1186 					in_dev_hold(rt->idev);
1187 				rt->u.dst.obsolete	= 0;
1188 				rt->u.dst.lastuse	= jiffies;
1189 				rt->u.dst.path		= &rt->u.dst;
1190 				rt->u.dst.neighbour	= NULL;
1191 				rt->u.dst.hh		= NULL;
1192 				rt->u.dst.xfrm		= NULL;
1193 
1194 				rt->rt_flags		|= RTCF_REDIRECTED;
1195 
1196 				/* Gateway is different ... */
1197 				rt->rt_gateway		= new_gw;
1198 
1199 				/* Redirect received -> path was valid */
1200 				dst_confirm(&rth->u.dst);
1201 
1202 				if (rt->peer)
1203 					atomic_inc(&rt->peer->refcnt);
1204 
1205 				if (arp_bind_neighbour(&rt->u.dst) ||
1206 				    !(rt->u.dst.neighbour->nud_state &
1207 					    NUD_VALID)) {
1208 					if (rt->u.dst.neighbour)
1209 						neigh_event_send(rt->u.dst.neighbour, NULL);
1210 					ip_rt_put(rth);
1211 					rt_drop(rt);
1212 					goto do_next;
1213 				}
1214 
1215 				rt_del(hash, rth);
1216 				if (!rt_intern_hash(hash, rt, &rt))
1217 					ip_rt_put(rt);
1218 				goto do_next;
1219 			}
1220 			rcu_read_unlock();
1221 		do_next:
1222 			;
1223 		}
1224 	}
1225 	in_dev_put(in_dev);
1226 	return;
1227 
1228 reject_redirect:
1229 #ifdef CONFIG_IP_ROUTE_VERBOSE
1230 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1231 		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1232 			"%u.%u.%u.%u ignored.\n"
1233 			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1234 			"tos %02x\n",
1235 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1236 		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
1237 #endif
1238 	in_dev_put(in_dev);
1239 }
1240 
1241 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1242 {
1243 	struct rtable *rt = (struct rtable*)dst;
1244 	struct dst_entry *ret = dst;
1245 
1246 	if (rt) {
1247 		if (dst->obsolete) {
1248 			ip_rt_put(rt);
1249 			ret = NULL;
1250 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1251 			   rt->u.dst.expires) {
1252 			unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1253 						     rt->fl.fl4_src ^
1254 							(rt->fl.oif << 5),
1255 						     rt->fl.fl4_tos);
1256 #if RT_CACHE_DEBUG >= 1
1257 			printk(KERN_DEBUG "ip_rt_advice: redirect to "
1258 					  "%u.%u.%u.%u/%02x dropped\n",
1259 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1260 #endif
1261 			rt_del(hash, rt);
1262 			ret = NULL;
1263 		}
1264 	}
1265 	return ret;
1266 }
1267 
1268 /*
1269  * Algorithm:
1270  *	1. The first ip_rt_redirect_number redirects are sent
1271  *	   with exponential backoff, then we stop sending them at all,
1272  *	   assuming that the host ignores our redirects.
1273  *	2. If we did not see packets requiring redirects
1274  *	   during ip_rt_redirect_silence, we assume that the host
1275  *	   forgot redirected route and start to send redirects again.
1276  *
1277  * This algorithm is much cheaper and more intelligent than dumb load limiting
1278  * in icmp.c.
1279  *
1280  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1281  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1282  */
1283 
1284 void ip_rt_send_redirect(struct sk_buff *skb)
1285 {
1286 	struct rtable *rt = (struct rtable*)skb->dst;
1287 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1288 
1289 	if (!in_dev)
1290 		return;
1291 
1292 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1293 		goto out;
1294 
1295 	/* No redirected packets during ip_rt_redirect_silence;
1296 	 * reset the algorithm.
1297 	 */
1298 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1299 		rt->u.dst.rate_tokens = 0;
1300 
1301 	/* Too many ignored redirects; do not send anything, just
1302 	 * set u.dst.rate_last to the last seen redirected packet.
1303 	 */
1304 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1305 		rt->u.dst.rate_last = jiffies;
1306 		goto out;
1307 	}
1308 
1309 	/* Check for load limit; set rate_last to the latest sent
1310 	 * redirect.
1311 	 */
1312 	if (time_after(jiffies,
1313 		       (rt->u.dst.rate_last +
1314 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1315 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1316 		rt->u.dst.rate_last = jiffies;
1317 		++rt->u.dst.rate_tokens;
1318 #ifdef CONFIG_IP_ROUTE_VERBOSE
1319 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1320 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1321 		    net_ratelimit())
1322 			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1323 				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1324 				NIPQUAD(rt->rt_src), rt->rt_iif,
1325 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1326 #endif
1327 	}
1328 out:
1329         in_dev_put(in_dev);
1330 }
1331 
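/*
 * dst input handler for routes carrying an error: map the error to an ICMP
 * unreachable code and send it, rate-limited by a small token bucket
 * (ip_rt_error_cost per message, ip_rt_error_burst maximum), then free the
 * skb.
 */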
1332 static int ip_error(struct sk_buff *skb)
1333 {
1334 	struct rtable *rt = (struct rtable*)skb->dst;
1335 	unsigned long now;
1336 	int code;
1337 
1338 	switch (rt->u.dst.error) {
1339 		case EINVAL:
1340 		default:
1341 			goto out;
1342 		case EHOSTUNREACH:
1343 			code = ICMP_HOST_UNREACH;
1344 			break;
1345 		case ENETUNREACH:
1346 			code = ICMP_NET_UNREACH;
1347 			break;
1348 		case EACCES:
1349 			code = ICMP_PKT_FILTERED;
1350 			break;
1351 	}
1352 
1353 	now = jiffies;
1354 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1355 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1356 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1357 	rt->u.dst.rate_last = now;
1358 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1359 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1360 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1361 	}
1362 
1363 out:	kfree_skb(skb);
1364 	return 0;
1365 }
1366 
1367 /*
1368  *	The last two values are not from the RFC but
1369  *	are needed for AMPRnet AX.25 paths.
1370  */
1371 
1372 static unsigned short mtu_plateau[] =
1373 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1374 
1375 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1376 {
1377 	int i;
1378 
1379 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1380 		if (old_mtu > mtu_plateau[i])
1381 			return mtu_plateau[i];
1382 	return 68;
1383 }
1384 
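/*
 * Handle an ICMP "fragmentation needed" report: for every matching output
 * route, lower the cached path MTU (guessing from the plateau table when the
 * advertised value is implausible), never below ip_rt_min_pmtu, and let the
 * reduction time out after ip_rt_mtu_expires.
 */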
1385 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1386 {
1387 	int i;
1388 	unsigned short old_mtu = ntohs(iph->tot_len);
1389 	struct rtable *rth;
1390 	u32  skeys[2] = { iph->saddr, 0, };
1391 	u32  daddr = iph->daddr;
1392 	u8   tos = iph->tos & IPTOS_RT_MASK;
1393 	unsigned short est_mtu = 0;
1394 
1395 	if (ipv4_config.no_pmtu_disc)
1396 		return 0;
1397 
1398 	for (i = 0; i < 2; i++) {
1399 		unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1400 
1401 		rcu_read_lock();
1402 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1403 		     rth = rcu_dereference(rth->u.rt_next)) {
1404 			if (rth->fl.fl4_dst == daddr &&
1405 			    rth->fl.fl4_src == skeys[i] &&
1406 			    rth->rt_dst  == daddr &&
1407 			    rth->rt_src  == iph->saddr &&
1408 			    rth->fl.fl4_tos == tos &&
1409 			    rth->fl.iif == 0 &&
1410 			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1411 				unsigned short mtu = new_mtu;
1412 
1413 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1414 
1415 					/* BSD 4.2 compatibility hack :-( */
1416 					if (mtu == 0 &&
1417 					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1418 					    old_mtu >= 68 + (iph->ihl << 2))
1419 						old_mtu -= iph->ihl << 2;
1420 
1421 					mtu = guess_mtu(old_mtu);
1422 				}
1423 				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1424 					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1425 						dst_confirm(&rth->u.dst);
1426 						if (mtu < ip_rt_min_pmtu) {
1427 							mtu = ip_rt_min_pmtu;
1428 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1429 								(1 << RTAX_MTU);
1430 						}
1431 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1432 						dst_set_expires(&rth->u.dst,
1433 							ip_rt_mtu_expires);
1434 					}
1435 					est_mtu = mtu;
1436 				}
1437 			}
1438 		}
1439 		rcu_read_unlock();
1440 	}
1441 	return est_mtu ? : new_mtu;
1442 }
1443 
1444 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1445 {
1446 	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1447 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1448 		if (mtu < ip_rt_min_pmtu) {
1449 			mtu = ip_rt_min_pmtu;
1450 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1451 		}
1452 		dst->metrics[RTAX_MTU-1] = mtu;
1453 		dst_set_expires(dst, ip_rt_mtu_expires);
1454 	}
1455 }
1456 
1457 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1458 {
1459 	return NULL;
1460 }
1461 
1462 static void ipv4_dst_destroy(struct dst_entry *dst)
1463 {
1464 	struct rtable *rt = (struct rtable *) dst;
1465 	struct inet_peer *peer = rt->peer;
1466 	struct in_device *idev = rt->idev;
1467 
1468 	if (peer) {
1469 		rt->peer = NULL;
1470 		inet_putpeer(peer);
1471 	}
1472 
1473 	if (idev) {
1474 		rt->idev = NULL;
1475 		in_dev_put(idev);
1476 	}
1477 }
1478 
1479 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1480 			    int how)
1481 {
1482 	struct rtable *rt = (struct rtable *) dst;
1483 	struct in_device *idev = rt->idev;
1484 	if (dev != &loopback_dev && idev && idev->dev == dev) {
1485 		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1486 		if (loopback_idev) {
1487 			rt->idev = loopback_idev;
1488 			in_dev_put(idev);
1489 		}
1490 	}
1491 }
1492 
1493 static void ipv4_link_failure(struct sk_buff *skb)
1494 {
1495 	struct rtable *rt;
1496 
1497 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1498 
1499 	rt = (struct rtable *) skb->dst;
1500 	if (rt)
1501 		dst_set_expires(&rt->u.dst, 0);
1502 }
1503 
1504 static int ip_rt_bug(struct sk_buff *skb)
1505 {
1506 	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1507 		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1508 		skb->dev ? skb->dev->name : "?");
1509 	kfree_skb(skb);
1510 	return 0;
1511 }
1512 
1513 /*
1514    We do not cache the source address of the outgoing interface,
1515    because it is used only by the IP RR, TS and SRR options,
1516    so it is out of the fast path.
1517 
1518    BTW remember: "addr" is allowed to be unaligned
1519    in IP options!
1520  */
1521 
1522 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1523 {
1524 	u32 src;
1525 	struct fib_result res;
1526 
1527 	if (rt->fl.iif == 0)
1528 		src = rt->rt_src;
1529 	else if (fib_lookup(&rt->fl, &res) == 0) {
1530 		src = FIB_RES_PREFSRC(res);
1531 		fib_res_put(&res);
1532 	} else
1533 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1534 					RT_SCOPE_UNIVERSE);
1535 	memcpy(addr, &src, 4);
1536 }
1537 
1538 #ifdef CONFIG_NET_CLS_ROUTE
1539 static void set_class_tag(struct rtable *rt, u32 tag)
1540 {
1541 	if (!(rt->u.dst.tclassid & 0xFFFF))
1542 		rt->u.dst.tclassid |= tag & 0xFFFF;
1543 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1544 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1545 }
1546 #endif
1547 
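/*
 * Fill in the gateway, metrics and type of a new cache entry from the FIB
 * result: copy the fib metrics, fall back to the output device MTU where no
 * MTU metric is set, apply default hoplimit/advmss values and clamps, and
 * set the routing class tags.
 */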
1548 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1549 {
1550 	struct fib_info *fi = res->fi;
1551 
1552 	if (fi) {
1553 		if (FIB_RES_GW(*res) &&
1554 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1555 			rt->rt_gateway = FIB_RES_GW(*res);
1556 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1557 		       sizeof(rt->u.dst.metrics));
1558 		if (fi->fib_mtu == 0) {
1559 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1560 			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1561 			    rt->rt_gateway != rt->rt_dst &&
1562 			    rt->u.dst.dev->mtu > 576)
1563 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1564 		}
1565 #ifdef CONFIG_NET_CLS_ROUTE
1566 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1567 #endif
1568 	} else
1569 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1570 
1571 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1572 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1573 	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1574 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1575 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1576 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1577 				       ip_rt_min_advmss);
1578 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1579 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1580 
1581 #ifdef CONFIG_NET_CLS_ROUTE
1582 #ifdef CONFIG_IP_MULTIPLE_TABLES
1583 	set_class_tag(rt, fib_rules_tclass(res));
1584 #endif
1585 	set_class_tag(rt, itag);
1586 #endif
1587         rt->rt_type = res->type;
1588 }
1589 
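/*
 * Build and hash an input route for a multicast destination: after the
 * source passes validation, the new entry delivers the packet locally when
 * "our" is set and, with multicast routing compiled in, feeds non-local
 * groups to ip_mr_input().
 */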
1590 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1591 				u8 tos, struct net_device *dev, int our)
1592 {
1593 	unsigned hash;
1594 	struct rtable *rth;
1595 	u32 spec_dst;
1596 	struct in_device *in_dev = in_dev_get(dev);
1597 	u32 itag = 0;
1598 
1599 	/* Primary sanity checks. */
1600 
1601 	if (in_dev == NULL)
1602 		return -EINVAL;
1603 
1604 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1605 	    skb->protocol != htons(ETH_P_IP))
1606 		goto e_inval;
1607 
1608 	if (ZERONET(saddr)) {
1609 		if (!LOCAL_MCAST(daddr))
1610 			goto e_inval;
1611 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1612 	} else if (fib_validate_source(saddr, 0, tos, 0,
1613 					dev, &spec_dst, &itag) < 0)
1614 		goto e_inval;
1615 
1616 	rth = dst_alloc(&ipv4_dst_ops);
1617 	if (!rth)
1618 		goto e_nobufs;
1619 
1620 	rth->u.dst.output= ip_rt_bug;
1621 
1622 	atomic_set(&rth->u.dst.__refcnt, 1);
1623 	rth->u.dst.flags= DST_HOST;
1624 	if (in_dev->cnf.no_policy)
1625 		rth->u.dst.flags |= DST_NOPOLICY;
1626 	rth->fl.fl4_dst	= daddr;
1627 	rth->rt_dst	= daddr;
1628 	rth->fl.fl4_tos	= tos;
1629 #ifdef CONFIG_IP_ROUTE_FWMARK
1630 	rth->fl.fl4_fwmark= skb->nfmark;
1631 #endif
1632 	rth->fl.fl4_src	= saddr;
1633 	rth->rt_src	= saddr;
1634 #ifdef CONFIG_NET_CLS_ROUTE
1635 	rth->u.dst.tclassid = itag;
1636 #endif
1637 	rth->rt_iif	=
1638 	rth->fl.iif	= dev->ifindex;
1639 	rth->u.dst.dev	= &loopback_dev;
1640 	dev_hold(rth->u.dst.dev);
1641 	rth->idev	= in_dev_get(rth->u.dst.dev);
1642 	rth->fl.oif	= 0;
1643 	rth->rt_gateway	= daddr;
1644 	rth->rt_spec_dst= spec_dst;
1645 	rth->rt_type	= RTN_MULTICAST;
1646 	rth->rt_flags	= RTCF_MULTICAST;
1647 	if (our) {
1648 		rth->u.dst.input= ip_local_deliver;
1649 		rth->rt_flags |= RTCF_LOCAL;
1650 	}
1651 
1652 #ifdef CONFIG_IP_MROUTE
1653 	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1654 		rth->u.dst.input = ip_mr_input;
1655 #endif
1656 	RT_CACHE_STAT_INC(in_slow_mc);
1657 
1658 	in_dev_put(in_dev);
1659 	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1660 	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1661 
1662 e_nobufs:
1663 	in_dev_put(in_dev);
1664 	return -ENOBUFS;
1665 
1666 e_inval:
1667 	in_dev_put(in_dev);
1668 	return -EINVAL;
1669 }
1670 
1671 
1672 static void ip_handle_martian_source(struct net_device *dev,
1673 				     struct in_device *in_dev,
1674 				     struct sk_buff *skb,
1675 				     u32 daddr,
1676 				     u32 saddr)
1677 {
1678 	RT_CACHE_STAT_INC(in_martian_src);
1679 #ifdef CONFIG_IP_ROUTE_VERBOSE
1680 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1681 		/*
1682 		 *	RFC 1812 recommendation: if the source is martian,
1683 		 *	the only hint is the MAC header.
1684 		 */
1685 		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1686 			"%u.%u.%u.%u, on dev %s\n",
1687 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1688 		if (dev->hard_header_len && skb->mac.raw) {
1689 			int i;
1690 			unsigned char *p = skb->mac.raw;
1691 			printk(KERN_WARNING "ll header: ");
1692 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1693 				printk("%02x", *p);
1694 				if (i < (dev->hard_header_len - 1))
1695 					printk(":");
1696 			}
1697 			printk("\n");
1698 		}
1699 	}
1700 #endif
1701 }
1702 
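/*
 * Create (but do not hash) a forwarding cache entry: validate the source
 * address against the FIB, decide whether an ICMP redirect should later be
 * suggested (RTCF_DOREDIRECT), and set up a dst whose input/output handlers
 * are ip_forward() and ip_output().
 */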
1703 static inline int __mkroute_input(struct sk_buff *skb,
1704 				  struct fib_result* res,
1705 				  struct in_device *in_dev,
1706 				  u32 daddr, u32 saddr, u32 tos,
1707 				  struct rtable **result)
1708 {
1709 
1710 	struct rtable *rth;
1711 	int err;
1712 	struct in_device *out_dev;
1713 	unsigned flags = 0;
1714 	u32 spec_dst, itag;
1715 
1716 	/* get a working reference to the output device */
1717 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1718 	if (out_dev == NULL) {
1719 		if (net_ratelimit())
1720 			printk(KERN_CRIT "Bug in ip_route_input" \
1721 			       "_slow(). Please, report\n");
1722 		return -EINVAL;
1723 	}
1724 
1725 
1726 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1727 				  in_dev->dev, &spec_dst, &itag);
1728 	if (err < 0) {
1729 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1730 					 saddr);
1731 
1732 		err = -EINVAL;
1733 		goto cleanup;
1734 	}
1735 
1736 	if (err)
1737 		flags |= RTCF_DIRECTSRC;
1738 
1739 	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1740 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1741 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1742 		flags |= RTCF_DOREDIRECT;
1743 
1744 	if (skb->protocol != htons(ETH_P_IP)) {
1745 		/* Not IP (i.e. ARP). Do not create a route if it is
1746 		 * invalid for proxy ARP. DNAT routes are always valid.
1747 		 */
1748 		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1749 			err = -EINVAL;
1750 			goto cleanup;
1751 		}
1752 	}
1753 
1754 
1755 	rth = dst_alloc(&ipv4_dst_ops);
1756 	if (!rth) {
1757 		err = -ENOBUFS;
1758 		goto cleanup;
1759 	}
1760 
1761 	rth->u.dst.flags= DST_HOST;
1762 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1763 	if (res->fi->fib_nhs > 1)
1764 		rth->u.dst.flags |= DST_BALANCED;
1765 #endif
1766 	if (in_dev->cnf.no_policy)
1767 		rth->u.dst.flags |= DST_NOPOLICY;
1768 	if (in_dev->cnf.no_xfrm)
1769 		rth->u.dst.flags |= DST_NOXFRM;
1770 	rth->fl.fl4_dst	= daddr;
1771 	rth->rt_dst	= daddr;
1772 	rth->fl.fl4_tos	= tos;
1773 #ifdef CONFIG_IP_ROUTE_FWMARK
1774 	rth->fl.fl4_fwmark= skb->nfmark;
1775 #endif
1776 	rth->fl.fl4_src	= saddr;
1777 	rth->rt_src	= saddr;
1778 	rth->rt_gateway	= daddr;
1779 	rth->rt_iif 	=
1780 		rth->fl.iif	= in_dev->dev->ifindex;
1781 	rth->u.dst.dev	= (out_dev)->dev;
1782 	dev_hold(rth->u.dst.dev);
1783 	rth->idev	= in_dev_get(rth->u.dst.dev);
1784 	rth->fl.oif 	= 0;
1785 	rth->rt_spec_dst= spec_dst;
1786 
1787 	rth->u.dst.input = ip_forward;
1788 	rth->u.dst.output = ip_output;
1789 
1790 	rt_set_nexthop(rth, res, itag);
1791 
1792 	rth->rt_flags = flags;
1793 
1794 	*result = rth;
1795 	err = 0;
1796  cleanup:
1797 	/* release the working reference to the output device */
1798 	in_dev_put(out_dev);
1799 	return err;
1800 }
1801 
1802 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1803 				       struct fib_result* res,
1804 				       const struct flowi *fl,
1805 				       struct in_device *in_dev,
1806 				       u32 daddr, u32 saddr, u32 tos)
1807 {
1808 	struct rtable* rth = NULL;
1809 	int err;
1810 	unsigned hash;
1811 
1812 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1813 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1814 		fib_select_multipath(fl, res);
1815 #endif
1816 
1817 	/* create a routing cache entry */
1818 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1819 	if (err)
1820 		return err;
1821 	atomic_set(&rth->u.dst.__refcnt, 1);
1822 
1823 	/* put it into the cache */
1824 	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1825 	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1826 }
1827 
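/*
 * Hash the cache entry built by __mkroute_input() and attach it to the skb;
 * with multipath caching enabled, one entry is created and hashed per
 * available next hop.
 */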
1828 static inline int ip_mkroute_input(struct sk_buff *skb,
1829 				   struct fib_result* res,
1830 				   const struct flowi *fl,
1831 				   struct in_device *in_dev,
1832 				   u32 daddr, u32 saddr, u32 tos)
1833 {
1834 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1835 	struct rtable* rth = NULL;
1836 	unsigned char hop, hopcount, lasthop;
1837 	int err = -EINVAL;
1838 	unsigned int hash;
1839 
1840 	if (res->fi)
1841 		hopcount = res->fi->fib_nhs;
1842 	else
1843 		hopcount = 1;
1844 
1845 	lasthop = hopcount - 1;
1846 
1847 	/* distinguish between multipath and singlepath */
1848 	if (hopcount < 2)
1849 		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1850 					    saddr, tos);
1851 
1852 	/* add all alternatives to the routing cache */
1853 	for (hop = 0; hop < hopcount; hop++) {
1854 		res->nh_sel = hop;
1855 
1856 		/* create a routing cache entry */
1857 		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1858 				      &rth);
1859 		if (err)
1860 			return err;
1861 
1862 		/* put it into the cache */
1863 		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1864 		err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1865 		if (err)
1866 			return err;
1867 
1868 		/* forward hop information to multipath impl. */
1869 		multipath_set_nhinfo(rth,
1870 				     FIB_RES_NETWORK(*res),
1871 				     FIB_RES_NETMASK(*res),
1872 				     res->prefixlen,
1873 				     &FIB_RES_NH(*res));
1874 
1875 		/* only for the last hop is the reference count handled
1876 		 * outside
1877 		 */
1878 		if (hop == lasthop)
1879 			atomic_set(&(skb->dst->__refcnt), 1);
1880 	}
1881 	return err;
1882 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1883 	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1884 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1885 }
1886 
1887 
1888 /*
1889  *	NOTE. We drop all packets that have local source
1890  *	addresses, because every properly looped-back packet
1891  *	must already have the correct destination attached by the output routine.
1892  *
1893  *	Such an approach solves two big problems:
1894  *	1. Non-simplex devices are handled properly.
1895  *	2. IP spoofing attempts are filtered out with a 100% guarantee.
1896  */
1897 
1898 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1899 			       u8 tos, struct net_device *dev)
1900 {
1901 	struct fib_result res;
1902 	struct in_device *in_dev = in_dev_get(dev);
1903 	struct flowi fl = { .nl_u = { .ip4_u =
1904 				      { .daddr = daddr,
1905 					.saddr = saddr,
1906 					.tos = tos,
1907 					.scope = RT_SCOPE_UNIVERSE,
1908 #ifdef CONFIG_IP_ROUTE_FWMARK
1909 					.fwmark = skb->nfmark
1910 #endif
1911 				      } },
1912 			    .iif = dev->ifindex };
1913 	unsigned	flags = 0;
1914 	u32		itag = 0;
1915 	struct rtable * rth;
1916 	unsigned	hash;
1917 	u32		spec_dst;
1918 	int		err = -EINVAL;
1919 	int		free_res = 0;
1920 
1921 	/* IP on this device is disabled. */
1922 
1923 	if (!in_dev)
1924 		goto out;
1925 
1926 	/* Check for the weirdest martians, which cannot be detected
1927 	   by fib_lookup.
1928 	 */
1929 
1930 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1931 		goto martian_source;
1932 
1933 	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1934 		goto brd_input;
1935 
1936 	/* Accept a zero source address only for limited broadcast;
1937 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1938 	 */
1939 	if (ZERONET(saddr))
1940 		goto martian_source;
1941 
1942 	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1943 		goto martian_destination;
1944 
1945 	/*
1946 	 *	Now we are ready to route packet.
1947 	 */
1948 	if ((err = fib_lookup(&fl, &res)) != 0) {
1949 		if (!IN_DEV_FORWARD(in_dev))
1950 			goto e_hostunreach;
1951 		goto no_route;
1952 	}
1953 	free_res = 1;
1954 
1955 	RT_CACHE_STAT_INC(in_slow_tot);
1956 
1957 	if (res.type == RTN_BROADCAST)
1958 		goto brd_input;
1959 
1960 	if (res.type == RTN_LOCAL) {
1961 		int result;
1962 		result = fib_validate_source(saddr, daddr, tos,
1963 					     loopback_dev.ifindex,
1964 					     dev, &spec_dst, &itag);
1965 		if (result < 0)
1966 			goto martian_source;
1967 		if (result)
1968 			flags |= RTCF_DIRECTSRC;
1969 		spec_dst = daddr;
1970 		goto local_input;
1971 	}
1972 
1973 	if (!IN_DEV_FORWARD(in_dev))
1974 		goto e_hostunreach;
1975 	if (res.type != RTN_UNICAST)
1976 		goto martian_destination;
1977 
1978 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1979 	if (err == -ENOBUFS)
1980 		goto e_nobufs;
1981 	if (err == -EINVAL)
1982 		goto e_inval;
1983 
1984 done:
1985 	in_dev_put(in_dev);
1986 	if (free_res)
1987 		fib_res_put(&res);
1988 out:	return err;
1989 
1990 brd_input:
1991 	if (skb->protocol != htons(ETH_P_IP))
1992 		goto e_inval;
1993 
1994 	if (ZERONET(saddr))
1995 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1996 	else {
1997 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1998 					  &itag);
1999 		if (err < 0)
2000 			goto martian_source;
2001 		if (err)
2002 			flags |= RTCF_DIRECTSRC;
2003 	}
2004 	flags |= RTCF_BROADCAST;
2005 	res.type = RTN_BROADCAST;
2006 	RT_CACHE_STAT_INC(in_brd);
2007 
2008 local_input:
2009 	rth = dst_alloc(&ipv4_dst_ops);
2010 	if (!rth)
2011 		goto e_nobufs;
2012 
2013 	rth->u.dst.output= ip_rt_bug;
2014 
2015 	atomic_set(&rth->u.dst.__refcnt, 1);
2016 	rth->u.dst.flags= DST_HOST;
2017 	if (in_dev->cnf.no_policy)
2018 		rth->u.dst.flags |= DST_NOPOLICY;
2019 	rth->fl.fl4_dst	= daddr;
2020 	rth->rt_dst	= daddr;
2021 	rth->fl.fl4_tos	= tos;
2022 #ifdef CONFIG_IP_ROUTE_FWMARK
2023 	rth->fl.fl4_fwmark= skb->nfmark;
2024 #endif
2025 	rth->fl.fl4_src	= saddr;
2026 	rth->rt_src	= saddr;
2027 #ifdef CONFIG_NET_CLS_ROUTE
2028 	rth->u.dst.tclassid = itag;
2029 #endif
2030 	rth->rt_iif	=
2031 	rth->fl.iif	= dev->ifindex;
2032 	rth->u.dst.dev	= &loopback_dev;
2033 	dev_hold(rth->u.dst.dev);
2034 	rth->idev	= in_dev_get(rth->u.dst.dev);
2035 	rth->rt_gateway	= daddr;
2036 	rth->rt_spec_dst= spec_dst;
2037 	rth->u.dst.input= ip_local_deliver;
2038 	rth->rt_flags 	= flags|RTCF_LOCAL;
2039 	if (res.type == RTN_UNREACHABLE) {
2040 		rth->u.dst.input= ip_error;
2041 		rth->u.dst.error= -err;
2042 		rth->rt_flags 	&= ~RTCF_LOCAL;
2043 	}
2044 	rth->rt_type	= res.type;
2045 	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2046 	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2047 	goto done;
2048 
2049 no_route:
2050 	RT_CACHE_STAT_INC(in_no_route);
2051 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2052 	res.type = RTN_UNREACHABLE;
2053 	goto local_input;
2054 
2055 	/*
2056 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2057 	 */
2058 martian_destination:
2059 	RT_CACHE_STAT_INC(in_martian_dst);
2060 #ifdef CONFIG_IP_ROUTE_VERBOSE
2061 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2062 		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2063 			"%u.%u.%u.%u, dev %s\n",
2064 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2065 #endif
2066 
2067 e_hostunreach:
2068 	err = -EHOSTUNREACH;
2069 	goto done;
2070 
2071 e_inval:
2072 	err = -EINVAL;
2073 	goto done;
2074 
2075 e_nobufs:
2076 	err = -ENOBUFS;
2077 	goto done;
2078 
2079 martian_source:
2080 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2081 	goto e_inval;
2082 }
2083 
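/*
 *	Fast path for input route resolution: look the packet's key
 *	(daddr, saddr, iif, tos and, if configured, fwmark) up in the route
 *	cache and attach the hit to skb->dst; on a miss, multicast
 *	destinations are screened here and everything else is handed to
 *	ip_route_input_slow().  A minimal sketch of a typical caller (the
 *	exact call site is an assumption, not something defined here):
 *
 *		if (ip_route_input(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev))
 *			goto drop;	(the packet is unroutable)
 *
 *	Returns 0 with skb->dst set on success, a negative errno otherwise.
 */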
2084 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2085 		   u8 tos, struct net_device *dev)
2086 {
2087 	struct rtable * rth;
2088 	unsigned	hash;
2089 	int iif = dev->ifindex;
2090 
2091 	tos &= IPTOS_RT_MASK;
2092 	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2093 
2094 	rcu_read_lock();
2095 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2096 	     rth = rcu_dereference(rth->u.rt_next)) {
2097 		if (rth->fl.fl4_dst == daddr &&
2098 		    rth->fl.fl4_src == saddr &&
2099 		    rth->fl.iif == iif &&
2100 		    rth->fl.oif == 0 &&
2101 #ifdef CONFIG_IP_ROUTE_FWMARK
2102 		    rth->fl.fl4_fwmark == skb->nfmark &&
2103 #endif
2104 		    rth->fl.fl4_tos == tos) {
2105 			rth->u.dst.lastuse = jiffies;
2106 			dst_hold(&rth->u.dst);
2107 			rth->u.dst.__use++;
2108 			RT_CACHE_STAT_INC(in_hit);
2109 			rcu_read_unlock();
2110 			skb->dst = (struct dst_entry*)rth;
2111 			return 0;
2112 		}
2113 		RT_CACHE_STAT_INC(in_hlist_search);
2114 	}
2115 	rcu_read_unlock();
2116 
2117 	/* Multicast recognition logic has moved from the route cache to here.
2118 	   The problem was that too many Ethernet cards have broken/missing
2119 	   hardware multicast filters :-( As a result, a host on a multicast
2120 	   network acquires a lot of useless route cache entries, e.g. from
2121 	   SDR messages from all over the world. Now we try to get rid of them.
2122 	   Really, provided the software IP multicast filter is organized
2123 	   reasonably (at least, hashed), it does not cause a slowdown
2124 	   compared with route cache reject entries.
2125 	   Note that multicast routers are not affected, because a
2126 	   route cache entry is created eventually.
2127 	 */
2128 	if (MULTICAST(daddr)) {
2129 		struct in_device *in_dev;
2130 
2131 		rcu_read_lock();
2132 		if ((in_dev = __in_dev_get(dev)) != NULL) {
2133 			int our = ip_check_mc(in_dev, daddr, saddr,
2134 				skb->nh.iph->protocol);
2135 			if (our
2136 #ifdef CONFIG_IP_MROUTE
2137 			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2138 #endif
2139 			    ) {
2140 				rcu_read_unlock();
2141 				return ip_route_input_mc(skb, daddr, saddr,
2142 							 tos, dev, our);
2143 			}
2144 		}
2145 		rcu_read_unlock();
2146 		return -EINVAL;
2147 	}
2148 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2149 }
2150 
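/*
 *	Build a routing cache entry (struct rtable) for the output path
 *	described by *res, keyed on the caller's original flow *oldflp.
 *	A reference to the new entry is returned in *result; on failure
 *	nothing is allocated and a negative errno is returned.
 */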
2151 static inline int __mkroute_output(struct rtable **result,
2152 				   struct fib_result* res,
2153 				   const struct flowi *fl,
2154 				   const struct flowi *oldflp,
2155 				   struct net_device *dev_out,
2156 				   unsigned flags)
2157 {
2158 	struct rtable *rth;
2159 	struct in_device *in_dev;
2160 	u32 tos = RT_FL_TOS(oldflp);
2161 	int err = 0;
2162 
2163 	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2164 		return -EINVAL;
2165 
2166 	if (fl->fl4_dst == 0xFFFFFFFF)
2167 		res->type = RTN_BROADCAST;
2168 	else if (MULTICAST(fl->fl4_dst))
2169 		res->type = RTN_MULTICAST;
2170 	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2171 		return -EINVAL;
2172 
2173 	if (dev_out->flags & IFF_LOOPBACK)
2174 		flags |= RTCF_LOCAL;
2175 
2176 	/* get work reference to inet device */
2177 	in_dev = in_dev_get(dev_out);
2178 	if (!in_dev)
2179 		return -EINVAL;
2180 
2181 	if (res->type == RTN_BROADCAST) {
2182 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2183 		if (res->fi) {
2184 			fib_info_put(res->fi);
2185 			res->fi = NULL;
2186 		}
2187 	} else if (res->type == RTN_MULTICAST) {
2188 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2189 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2190 				 oldflp->proto))
2191 			flags &= ~RTCF_LOCAL;
2192 		/* If a multicast route does not exist, use
2193 		   the default one, but do not gateway in this case.
2194 		   Yes, it is a hack.
2195 		 */
2196 		if (res->fi && res->prefixlen < 4) {
2197 			fib_info_put(res->fi);
2198 			res->fi = NULL;
2199 		}
2200 	}
2201 
2202 
2203 	rth = dst_alloc(&ipv4_dst_ops);
2204 	if (!rth) {
2205 		err = -ENOBUFS;
2206 		goto cleanup;
2207 	}
2208 
2209 	rth->u.dst.flags= DST_HOST;
2210 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2211 	if (res->fi) {
2212 		rth->rt_multipath_alg = res->fi->fib_mp_alg;
2213 		if (res->fi->fib_nhs > 1)
2214 			rth->u.dst.flags |= DST_BALANCED;
2215 	}
2216 #endif
2217 	if (in_dev->cnf.no_xfrm)
2218 		rth->u.dst.flags |= DST_NOXFRM;
2219 	if (in_dev->cnf.no_policy)
2220 		rth->u.dst.flags |= DST_NOPOLICY;
2221 
2222 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2223 	rth->fl.fl4_tos	= tos;
2224 	rth->fl.fl4_src	= oldflp->fl4_src;
2225 	rth->fl.oif	= oldflp->oif;
2226 #ifdef CONFIG_IP_ROUTE_FWMARK
2227 	rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2228 #endif
2229 	rth->rt_dst	= fl->fl4_dst;
2230 	rth->rt_src	= fl->fl4_src;
2231 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2232 	/* get references to the devices that are to be held by the routing
2233 	   cache entry */
2234 	rth->u.dst.dev	= dev_out;
2235 	dev_hold(dev_out);
2236 	rth->idev	= in_dev_get(dev_out);
2237 	rth->rt_gateway = fl->fl4_dst;
2238 	rth->rt_spec_dst= fl->fl4_src;
2239 
2240 	rth->u.dst.output=ip_output;
2241 
2242 	RT_CACHE_STAT_INC(out_slow_tot);
2243 
2244 	if (flags & RTCF_LOCAL) {
2245 		rth->u.dst.input = ip_local_deliver;
2246 		rth->rt_spec_dst = fl->fl4_dst;
2247 	}
2248 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2249 		rth->rt_spec_dst = fl->fl4_src;
2250 		if (flags & RTCF_LOCAL &&
2251 		    !(dev_out->flags & IFF_LOOPBACK)) {
2252 			rth->u.dst.output = ip_mc_output;
2253 			RT_CACHE_STAT_INC(out_slow_mc);
2254 		}
2255 #ifdef CONFIG_IP_MROUTE
2256 		if (res->type == RTN_MULTICAST) {
2257 			if (IN_DEV_MFORWARD(in_dev) &&
2258 			    !LOCAL_MCAST(oldflp->fl4_dst)) {
2259 				rth->u.dst.input = ip_mr_input;
2260 				rth->u.dst.output = ip_mc_output;
2261 			}
2262 		}
2263 #endif
2264 	}
2265 
2266 	rt_set_nexthop(rth, res, 0);
2267 
2268 	rth->rt_flags = flags;
2269 
2270 	*result = rth;
2271  cleanup:
2272 	/* release work reference to inet device */
2273 	in_dev_put(in_dev);
2274 
2275 	return err;
2276 }
2277 
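/*
 *	Single-path output case: build one cache entry, take the caller's
 *	reference on it and intern it on the hash chain selected by
 *	(daddr, saddr ^ (oif << 5), tos).
 */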
2278 static inline int ip_mkroute_output_def(struct rtable **rp,
2279 					struct fib_result* res,
2280 					const struct flowi *fl,
2281 					const struct flowi *oldflp,
2282 					struct net_device *dev_out,
2283 					unsigned flags)
2284 {
2285 	struct rtable *rth = NULL;
2286 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2287 	unsigned hash;
2288 	if (err == 0) {
2289 		u32 tos = RT_FL_TOS(oldflp);
2290 
2291 		atomic_set(&rth->u.dst.__refcnt, 1);
2292 
2293 		hash = rt_hash_code(oldflp->fl4_dst,
2294 				    oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2295 		err = rt_intern_hash(hash, rth, rp);
2296 	}
2297 
2298 	return err;
2299 }
2300 
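/*
 *	With CONFIG_IP_ROUTE_MULTIPATH_CACHED, a separate cache entry is
 *	built and interned for every next hop of a multipath route, and the
 *	per-hop network/netmask is handed to the multipath algorithm via
 *	multipath_set_nhinfo(); otherwise this is simply
 *	ip_mkroute_output_def().
 */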
2301 static inline int ip_mkroute_output(struct rtable** rp,
2302 				    struct fib_result* res,
2303 				    const struct flowi *fl,
2304 				    const struct flowi *oldflp,
2305 				    struct net_device *dev_out,
2306 				    unsigned flags)
2307 {
2308 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2309 	u32 tos = RT_FL_TOS(oldflp);
2310 	unsigned char hop;
2311 	unsigned hash;
2312 	int err = -EINVAL;
2313 	struct rtable *rth = NULL;
2314 
2315 	if (res->fi && res->fi->fib_nhs > 1) {
2316 		unsigned char hopcount = res->fi->fib_nhs;
2317 
2318 		for (hop = 0; hop < hopcount; hop++) {
2319 			struct net_device *dev2nexthop;
2320 
2321 			res->nh_sel = hop;
2322 
2323 			/* hold a work reference to the output device */
2324 			dev2nexthop = FIB_RES_DEV(*res);
2325 			dev_hold(dev2nexthop);
2326 
2327 			err = __mkroute_output(&rth, res, fl, oldflp,
2328 					       dev2nexthop, flags);
2329 
2330 			if (err != 0)
2331 				goto cleanup;
2332 
2333 			hash = rt_hash_code(oldflp->fl4_dst,
2334 					    oldflp->fl4_src ^
2335 					    (oldflp->oif << 5), tos);
2336 			err = rt_intern_hash(hash, rth, rp);
2337 
2338 			/* forward hop information to multipath impl. */
2339 			multipath_set_nhinfo(rth,
2340 					     FIB_RES_NETWORK(*res),
2341 					     FIB_RES_NETMASK(*res),
2342 					     res->prefixlen,
2343 					     &FIB_RES_NH(*res));
2344 		cleanup:
2345 			/* release work reference to output device */
2346 			dev_put(dev2nexthop);
2347 
2348 			if (err != 0)
2349 				return err;
2350 		}
2351 		atomic_set(&(*rp)->u.dst.__refcnt, 1);
2352 		return err;
2353 	} else {
2354 		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2355 					     flags);
2356 	}
2357 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2358 	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2359 #endif
2360 }
2361 
2362 /*
2363  * Major route resolver routine.
2364  */
2365 
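/*
 *	In outline: validate any requested source address, honour a bound
 *	output interface (oif) if one was given, handle the loopback/local
 *	special cases, do the FIB lookup (with multipath/default-route
 *	selection), and finally let ip_mkroute_output() build and cache the
 *	resulting rtable.
 */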
2366 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2367 {
2368 	u32 tos	= RT_FL_TOS(oldflp);
2369 	struct flowi fl = { .nl_u = { .ip4_u =
2370 				      { .daddr = oldflp->fl4_dst,
2371 					.saddr = oldflp->fl4_src,
2372 					.tos = tos & IPTOS_RT_MASK,
2373 					.scope = ((tos & RTO_ONLINK) ?
2374 						  RT_SCOPE_LINK :
2375 						  RT_SCOPE_UNIVERSE),
2376 #ifdef CONFIG_IP_ROUTE_FWMARK
2377 					.fwmark = oldflp->fl4_fwmark
2378 #endif
2379 				      } },
2380 			    .iif = loopback_dev.ifindex,
2381 			    .oif = oldflp->oif };
2382 	struct fib_result res;
2383 	unsigned flags = 0;
2384 	struct net_device *dev_out = NULL;
2385 	int free_res = 0;
2386 	int err;
2387 
2388 
2389 	res.fi		= NULL;
2390 #ifdef CONFIG_IP_MULTIPLE_TABLES
2391 	res.r		= NULL;
2392 #endif
2393 
2394 	if (oldflp->fl4_src) {
2395 		err = -EINVAL;
2396 		if (MULTICAST(oldflp->fl4_src) ||
2397 		    BADCLASS(oldflp->fl4_src) ||
2398 		    ZERONET(oldflp->fl4_src))
2399 			goto out;
2400 
2401 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2402 		dev_out = ip_dev_find(oldflp->fl4_src);
2403 		if (dev_out == NULL)
2404 			goto out;
2405 
2406 		/* I removed the check for oif == dev_out->oif here.
2407 		   It was wrong for two reasons:
2408 		   1. ip_dev_find(saddr) can return the wrong iface if saddr
2409 		      is assigned to multiple interfaces.
2410 		   2. Moreover, we are allowed to send packets with the saddr
2411 		      of another iface. --ANK
2412 		 */
2413 
2414 		if (oldflp->oif == 0
2415 		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2416 			/* Special hack: the user can direct multicasts
2417 			   and limited broadcast via the necessary interface
2418 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2419 			   This hack is not just for fun; it allows
2420 			   vic, vat and friends to work.
2421 			   They bind the socket to loopback, set the ttl to zero
2422 			   and expect that this will work.
2423 			   From the viewpoint of the routing cache they are broken,
2424 			   because we are not allowed to build a multicast path
2425 			   with a loopback source address (look, the routing cache
2426 			   cannot know that the ttl is zero, so the packet
2427 			   will not leave this host and the route is valid).
2428 			   Luckily, this hack is a good workaround.
2429 			 */
2430 
2431 			fl.oif = dev_out->ifindex;
2432 			goto make_route;
2433 		}
2434 		if (dev_out)
2435 			dev_put(dev_out);
2436 		dev_out = NULL;
2437 	}
2438 
2439 
2440 	if (oldflp->oif) {
2441 		dev_out = dev_get_by_index(oldflp->oif);
2442 		err = -ENODEV;
2443 		if (dev_out == NULL)
2444 			goto out;
2445 		if (__in_dev_get(dev_out) == NULL) {
2446 			dev_put(dev_out);
2447 			goto out;	/* Wrong error code */
2448 		}
2449 
2450 		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2451 			if (!fl.fl4_src)
2452 				fl.fl4_src = inet_select_addr(dev_out, 0,
2453 							      RT_SCOPE_LINK);
2454 			goto make_route;
2455 		}
2456 		if (!fl.fl4_src) {
2457 			if (MULTICAST(oldflp->fl4_dst))
2458 				fl.fl4_src = inet_select_addr(dev_out, 0,
2459 							      fl.fl4_scope);
2460 			else if (!oldflp->fl4_dst)
2461 				fl.fl4_src = inet_select_addr(dev_out, 0,
2462 							      RT_SCOPE_HOST);
2463 		}
2464 	}
2465 
2466 	if (!fl.fl4_dst) {
2467 		fl.fl4_dst = fl.fl4_src;
2468 		if (!fl.fl4_dst)
2469 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2470 		if (dev_out)
2471 			dev_put(dev_out);
2472 		dev_out = &loopback_dev;
2473 		dev_hold(dev_out);
2474 		fl.oif = loopback_dev.ifindex;
2475 		res.type = RTN_LOCAL;
2476 		flags |= RTCF_LOCAL;
2477 		goto make_route;
2478 	}
2479 
2480 	if (fib_lookup(&fl, &res)) {
2481 		res.fi = NULL;
2482 		if (oldflp->oif) {
2483 			/* Apparently, the routing tables are wrong. Assume
2484 			   that the destination is on-link.
2485 
2486 			   WHY? DW.
2487 			   Because we are allowed to send to an iface
2488 			   even if it has NO routes and NO assigned
2489 			   addresses. When oif is specified, the routing
2490 			   tables are looked up for only one purpose:
2491 			   to detect whether the destination is gatewayed
2492 			   rather than direct. Moreover, if MSG_DONTROUTE is set,
2493 			   we send the packet, ignoring both routing tables
2494 			   and ifaddr state. --ANK
2495 
2496 
2497 			   We could do this even if oif is unknown,
2498 			   as IPv6 likely does, but we do not.
2499 			 */
2500 
2501 			if (fl.fl4_src == 0)
2502 				fl.fl4_src = inet_select_addr(dev_out, 0,
2503 							      RT_SCOPE_LINK);
2504 			res.type = RTN_UNICAST;
2505 			goto make_route;
2506 		}
2507 		if (dev_out)
2508 			dev_put(dev_out);
2509 		err = -ENETUNREACH;
2510 		goto out;
2511 	}
2512 	free_res = 1;
2513 
2514 	if (res.type == RTN_LOCAL) {
2515 		if (!fl.fl4_src)
2516 			fl.fl4_src = fl.fl4_dst;
2517 		if (dev_out)
2518 			dev_put(dev_out);
2519 		dev_out = &loopback_dev;
2520 		dev_hold(dev_out);
2521 		fl.oif = dev_out->ifindex;
2522 		if (res.fi)
2523 			fib_info_put(res.fi);
2524 		res.fi = NULL;
2525 		flags |= RTCF_LOCAL;
2526 		goto make_route;
2527 	}
2528 
2529 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2530 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2531 		fib_select_multipath(&fl, &res);
2532 	else
2533 #endif
2534 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2535 		fib_select_default(&fl, &res);
2536 
2537 	if (!fl.fl4_src)
2538 		fl.fl4_src = FIB_RES_PREFSRC(res);
2539 
2540 	if (dev_out)
2541 		dev_put(dev_out);
2542 	dev_out = FIB_RES_DEV(res);
2543 	dev_hold(dev_out);
2544 	fl.oif = dev_out->ifindex;
2545 
2546 
2547 make_route:
2548 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2549 
2550 
2551 	if (free_res)
2552 		fib_res_put(&res);
2553 	if (dev_out)
2554 		dev_put(dev_out);
2555 out:	return err;
2556 }
2557 
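/*
 *	Output route cache lookup.  The chain is selected by the same hash
 *	used at insertion time and walked under rcu_read_lock_bh(); a miss
 *	falls back to ip_route_output_slow().  A minimal sketch of a caller
 *	building the flow key by hand (illustrative only):
 *
 *		struct flowi fl = { .oif = 0,
 *				    .nl_u = { .ip4_u =
 *					      { .daddr = dst,
 *						.saddr = 0,
 *						.tos = RT_TOS(tos) } } };
 *		struct rtable *rt;
 *
 *		if (__ip_route_output_key(&rt, &fl))
 *			return -EHOSTUNREACH;
 */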
2558 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2559 {
2560 	unsigned hash;
2561 	struct rtable *rth;
2562 
2563 	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2564 
2565 	rcu_read_lock_bh();
2566 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2567 		rth = rcu_dereference(rth->u.rt_next)) {
2568 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2569 		    rth->fl.fl4_src == flp->fl4_src &&
2570 		    rth->fl.iif == 0 &&
2571 		    rth->fl.oif == flp->oif &&
2572 #ifdef CONFIG_IP_ROUTE_FWMARK
2573 		    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2574 #endif
2575 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2576 			    (IPTOS_RT_MASK | RTO_ONLINK))) {
2577 
2578 			/* check for multipath routes and choose one if
2579 			 * necessary
2580 			 */
2581 			if (multipath_select_route(flp, rth, rp)) {
2582 				dst_hold(&(*rp)->u.dst);
2583 				RT_CACHE_STAT_INC(out_hit);
2584 				rcu_read_unlock_bh();
2585 				return 0;
2586 			}
2587 
2588 			rth->u.dst.lastuse = jiffies;
2589 			dst_hold(&rth->u.dst);
2590 			rth->u.dst.__use++;
2591 			RT_CACHE_STAT_INC(out_hit);
2592 			rcu_read_unlock_bh();
2593 			*rp = rth;
2594 			return 0;
2595 		}
2596 		RT_CACHE_STAT_INC(out_hlist_search);
2597 	}
2598 	rcu_read_unlock_bh();
2599 
2600 	return ip_route_output_slow(rp, flp);
2601 }
2602 
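/*
 *	Resolve an output route and, when the flow names a transport
 *	protocol, fill in any still-unset addresses from the result and let
 *	xfrm_lookup() swap the plain route for an IPsec bundle if policy
 *	demands one.
 */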
2603 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2604 {
2605 	int err;
2606 
2607 	if ((err = __ip_route_output_key(rp, flp)) != 0)
2608 		return err;
2609 
2610 	if (flp->proto) {
2611 		if (!flp->fl4_src)
2612 			flp->fl4_src = (*rp)->rt_src;
2613 		if (!flp->fl4_dst)
2614 			flp->fl4_dst = (*rp)->rt_dst;
2615 		return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2616 	}
2617 
2618 	return 0;
2619 }
2620 
2621 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2622 {
2623 	return ip_route_output_flow(rp, flp, NULL, 0);
2624 }
2625 
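/*
 *	Encode the cached route attached to skb->dst as an rtnetlink
 *	message: the rtmsg header, RTA_DST/RTA_SRC/RTA_OIF/RTA_GATEWAY and
 *	friends, the metrics and an RTA_CACHEINFO block.  Returns skb->len
 *	on success and -1 if the message did not fit.
 */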
2626 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2627 			int nowait, unsigned int flags)
2628 {
2629 	struct rtable *rt = (struct rtable*)skb->dst;
2630 	struct rtmsg *r;
2631 	struct nlmsghdr  *nlh;
2632 	unsigned char	 *b = skb->tail;
2633 	struct rta_cacheinfo ci;
2634 #ifdef CONFIG_IP_MROUTE
2635 	struct rtattr *eptr;
2636 #endif
2637 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2638 	r = NLMSG_DATA(nlh);
2639 	r->rtm_family	 = AF_INET;
2640 	r->rtm_dst_len	= 32;
2641 	r->rtm_src_len	= 0;
2642 	r->rtm_tos	= rt->fl.fl4_tos;
2643 	r->rtm_table	= RT_TABLE_MAIN;
2644 	r->rtm_type	= rt->rt_type;
2645 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2646 	r->rtm_protocol = RTPROT_UNSPEC;
2647 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2648 	if (rt->rt_flags & RTCF_NOTIFY)
2649 		r->rtm_flags |= RTM_F_NOTIFY;
2650 	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2651 	if (rt->fl.fl4_src) {
2652 		r->rtm_src_len = 32;
2653 		RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2654 	}
2655 	if (rt->u.dst.dev)
2656 		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2657 #ifdef CONFIG_NET_CLS_ROUTE
2658 	if (rt->u.dst.tclassid)
2659 		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2660 #endif
2661 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2662 	if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2663 		__u32 alg = rt->rt_multipath_alg;
2664 
2665 		RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2666 	}
2667 #endif
2668 	if (rt->fl.iif)
2669 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2670 	else if (rt->rt_src != rt->fl.fl4_src)
2671 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2672 	if (rt->rt_dst != rt->rt_gateway)
2673 		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2674 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2675 		goto rtattr_failure;
2676 	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2677 	ci.rta_used	= rt->u.dst.__use;
2678 	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
2679 	if (rt->u.dst.expires)
2680 		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2681 	else
2682 		ci.rta_expires = 0;
2683 	ci.rta_error	= rt->u.dst.error;
2684 	ci.rta_id	= ci.rta_ts = ci.rta_tsage = 0;
2685 	if (rt->peer) {
2686 		ci.rta_id = rt->peer->ip_id_count;
2687 		if (rt->peer->tcp_ts_stamp) {
2688 			ci.rta_ts = rt->peer->tcp_ts;
2689 			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2690 		}
2691 	}
2692 #ifdef CONFIG_IP_MROUTE
2693 	eptr = (struct rtattr*)skb->tail;
2694 #endif
2695 	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2696 	if (rt->fl.iif) {
2697 #ifdef CONFIG_IP_MROUTE
2698 		u32 dst = rt->rt_dst;
2699 
2700 		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2701 		    ipv4_devconf.mc_forwarding) {
2702 			int err = ipmr_get_route(skb, r, nowait);
2703 			if (err <= 0) {
2704 				if (!nowait) {
2705 					if (err == 0)
2706 						return 0;
2707 					goto nlmsg_failure;
2708 				} else {
2709 					if (err == -EMSGSIZE)
2710 						goto nlmsg_failure;
2711 					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2712 				}
2713 			}
2714 		} else
2715 #endif
2716 			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2717 	}
2718 
2719 	nlh->nlmsg_len = skb->tail - b;
2720 	return skb->len;
2721 
2722 nlmsg_failure:
2723 rtattr_failure:
2724 	skb_trim(skb, b - skb->data);
2725 	return -1;
2726 }
2727 
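/*
 *	RTM_GETROUTE handler: resolve the route for the dst/src/iif/oif
 *	carried in the request (this is what e.g. "ip route get" ends up
 *	exercising via rtnetlink), encode it with rt_fill_info() and
 *	unicast the answer back to the requester.
 */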
2728 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2729 {
2730 	struct rtattr **rta = arg;
2731 	struct rtmsg *rtm = NLMSG_DATA(nlh);
2732 	struct rtable *rt = NULL;
2733 	u32 dst = 0;
2734 	u32 src = 0;
2735 	int iif = 0;
2736 	int err = -ENOBUFS;
2737 	struct sk_buff *skb;
2738 
2739 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2740 	if (!skb)
2741 		goto out;
2742 
2743 	/* Reserve room for dummy headers; this skb can pass
2744 	   through a good chunk of the routing engine.
2745 	 */
2746 	skb->mac.raw = skb->data;
2747 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2748 
2749 	if (rta[RTA_SRC - 1])
2750 		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2751 	if (rta[RTA_DST - 1])
2752 		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2753 	if (rta[RTA_IIF - 1])
2754 		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2755 
2756 	if (iif) {
2757 		struct net_device *dev = __dev_get_by_index(iif);
2758 		err = -ENODEV;
2759 		if (!dev)
2760 			goto out_free;
2761 		skb->protocol	= htons(ETH_P_IP);
2762 		skb->dev	= dev;
2763 		local_bh_disable();
2764 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2765 		local_bh_enable();
2766 		rt = (struct rtable*)skb->dst;
2767 		if (!err && rt->u.dst.error)
2768 			err = -rt->u.dst.error;
2769 	} else {
2770 		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2771 							 .saddr = src,
2772 							 .tos = rtm->rtm_tos } } };
2773 		int oif = 0;
2774 		if (rta[RTA_OIF - 1])
2775 			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2776 		fl.oif = oif;
2777 		err = ip_route_output_key(&rt, &fl);
2778 	}
2779 	if (err)
2780 		goto out_free;
2781 
2782 	skb->dst = &rt->u.dst;
2783 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2784 		rt->rt_flags |= RTCF_NOTIFY;
2785 
2786 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2787 
2788 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2789 				RTM_NEWROUTE, 0, 0);
2790 	if (!err)
2791 		goto out_free;
2792 	if (err < 0) {
2793 		err = -EMSGSIZE;
2794 		goto out_free;
2795 	}
2796 
2797 	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2798 	if (err > 0)
2799 		err = 0;
2800 out:	return err;
2801 
2802 out_free:
2803 	kfree_skb(skb);
2804 	goto out;
2805 }
2806 
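/*
 *	Netlink dump of the entire route cache: walk every hash bucket and
 *	resume from the (bucket, index) position saved in cb->args[].
 */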
2807 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2808 {
2809 	struct rtable *rt;
2810 	int h, s_h;
2811 	int idx, s_idx;
2812 
2813 	s_h = cb->args[0];
2814 	s_idx = idx = cb->args[1];
2815 	for (h = 0; h <= rt_hash_mask; h++) {
2816 		if (h < s_h) continue;
2817 		if (h > s_h)
2818 			s_idx = 0;
2819 		rcu_read_lock_bh();
2820 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2821 		     rt = rcu_dereference(rt->u.rt_next), idx++) {
2822 			if (idx < s_idx)
2823 				continue;
2824 			skb->dst = dst_clone(&rt->u.dst);
2825 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2826 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2827 					 1, NLM_F_MULTI) <= 0) {
2828 				dst_release(xchg(&skb->dst, NULL));
2829 				rcu_read_unlock_bh();
2830 				goto done;
2831 			}
2832 			dst_release(xchg(&skb->dst, NULL));
2833 		}
2834 		rcu_read_unlock_bh();
2835 	}
2836 
2837 done:
2838 	cb->args[0] = h;
2839 	cb->args[1] = idx;
2840 	return skb->len;
2841 }
2842 
2843 void ip_rt_multicast_event(struct in_device *in_dev)
2844 {
2845 	rt_cache_flush(0);
2846 }
2847 
2848 #ifdef CONFIG_SYSCTL
2849 static int flush_delay;
2850 
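/*
 *	Write-only sysctl: the integer written here is handed to
 *	rt_cache_flush() as the flush delay.  Assuming the table is
 *	registered in its usual net.ipv4.route location, user space can
 *	force a flush with e.g.:
 *
 *		echo 0 > /proc/sys/net/ipv4/route/flush
 */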
2851 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2852 					struct file *filp, void __user *buffer,
2853 					size_t *lenp, loff_t *ppos)
2854 {
2855 	if (write) {
2856 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2857 		rt_cache_flush(flush_delay);
2858 		return 0;
2859 	}
2860 
2861 	return -EINVAL;
2862 }
2863 
2864 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2865 						int __user *name,
2866 						int nlen,
2867 						void __user *oldval,
2868 						size_t __user *oldlenp,
2869 						void __user *newval,
2870 						size_t newlen,
2871 						void **context)
2872 {
2873 	int delay;
2874 	if (newlen != sizeof(int))
2875 		return -EINVAL;
2876 	if (get_user(delay, (int __user *)newval))
2877 		return -EFAULT;
2878 	rt_cache_flush(delay);
2879 	return 0;
2880 }
2881 
2882 ctl_table ipv4_route_table[] = {
2883 	{
2884 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2885 		.procname	= "flush",
2886 		.data		= &flush_delay,
2887 		.maxlen		= sizeof(int),
2888 		.mode		= 0200,
2889 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2890 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2891 	},
2892 	{
2893 		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
2894 		.procname	= "min_delay",
2895 		.data		= &ip_rt_min_delay,
2896 		.maxlen		= sizeof(int),
2897 		.mode		= 0644,
2898 		.proc_handler	= &proc_dointvec_jiffies,
2899 		.strategy	= &sysctl_jiffies,
2900 	},
2901 	{
2902 		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
2903 		.procname	= "max_delay",
2904 		.data		= &ip_rt_max_delay,
2905 		.maxlen		= sizeof(int),
2906 		.mode		= 0644,
2907 		.proc_handler	= &proc_dointvec_jiffies,
2908 		.strategy	= &sysctl_jiffies,
2909 	},
2910 	{
2911 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2912 		.procname	= "gc_thresh",
2913 		.data		= &ipv4_dst_ops.gc_thresh,
2914 		.maxlen		= sizeof(int),
2915 		.mode		= 0644,
2916 		.proc_handler	= &proc_dointvec,
2917 	},
2918 	{
2919 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2920 		.procname	= "max_size",
2921 		.data		= &ip_rt_max_size,
2922 		.maxlen		= sizeof(int),
2923 		.mode		= 0644,
2924 		.proc_handler	= &proc_dointvec,
2925 	},
2926 	{
2927 		/*  Deprecated. Use gc_min_interval_ms */
2928 
2929 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2930 		.procname	= "gc_min_interval",
2931 		.data		= &ip_rt_gc_min_interval,
2932 		.maxlen		= sizeof(int),
2933 		.mode		= 0644,
2934 		.proc_handler	= &proc_dointvec_jiffies,
2935 		.strategy	= &sysctl_jiffies,
2936 	},
2937 	{
2938 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2939 		.procname	= "gc_min_interval_ms",
2940 		.data		= &ip_rt_gc_min_interval,
2941 		.maxlen		= sizeof(int),
2942 		.mode		= 0644,
2943 		.proc_handler	= &proc_dointvec_ms_jiffies,
2944 		.strategy	= &sysctl_ms_jiffies,
2945 	},
2946 	{
2947 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2948 		.procname	= "gc_timeout",
2949 		.data		= &ip_rt_gc_timeout,
2950 		.maxlen		= sizeof(int),
2951 		.mode		= 0644,
2952 		.proc_handler	= &proc_dointvec_jiffies,
2953 		.strategy	= &sysctl_jiffies,
2954 	},
2955 	{
2956 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2957 		.procname	= "gc_interval",
2958 		.data		= &ip_rt_gc_interval,
2959 		.maxlen		= sizeof(int),
2960 		.mode		= 0644,
2961 		.proc_handler	= &proc_dointvec_jiffies,
2962 		.strategy	= &sysctl_jiffies,
2963 	},
2964 	{
2965 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2966 		.procname	= "redirect_load",
2967 		.data		= &ip_rt_redirect_load,
2968 		.maxlen		= sizeof(int),
2969 		.mode		= 0644,
2970 		.proc_handler	= &proc_dointvec,
2971 	},
2972 	{
2973 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2974 		.procname	= "redirect_number",
2975 		.data		= &ip_rt_redirect_number,
2976 		.maxlen		= sizeof(int),
2977 		.mode		= 0644,
2978 		.proc_handler	= &proc_dointvec,
2979 	},
2980 	{
2981 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2982 		.procname	= "redirect_silence",
2983 		.data		= &ip_rt_redirect_silence,
2984 		.maxlen		= sizeof(int),
2985 		.mode		= 0644,
2986 		.proc_handler	= &proc_dointvec,
2987 	},
2988 	{
2989 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
2990 		.procname	= "error_cost",
2991 		.data		= &ip_rt_error_cost,
2992 		.maxlen		= sizeof(int),
2993 		.mode		= 0644,
2994 		.proc_handler	= &proc_dointvec,
2995 	},
2996 	{
2997 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
2998 		.procname	= "error_burst",
2999 		.data		= &ip_rt_error_burst,
3000 		.maxlen		= sizeof(int),
3001 		.mode		= 0644,
3002 		.proc_handler	= &proc_dointvec,
3003 	},
3004 	{
3005 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
3006 		.procname	= "gc_elasticity",
3007 		.data		= &ip_rt_gc_elasticity,
3008 		.maxlen		= sizeof(int),
3009 		.mode		= 0644,
3010 		.proc_handler	= &proc_dointvec,
3011 	},
3012 	{
3013 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
3014 		.procname	= "mtu_expires",
3015 		.data		= &ip_rt_mtu_expires,
3016 		.maxlen		= sizeof(int),
3017 		.mode		= 0644,
3018 		.proc_handler	= &proc_dointvec_jiffies,
3019 		.strategy	= &sysctl_jiffies,
3020 	},
3021 	{
3022 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
3023 		.procname	= "min_pmtu",
3024 		.data		= &ip_rt_min_pmtu,
3025 		.maxlen		= sizeof(int),
3026 		.mode		= 0644,
3027 		.proc_handler	= &proc_dointvec,
3028 	},
3029 	{
3030 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
3031 		.procname	= "min_adv_mss",
3032 		.data		= &ip_rt_min_advmss,
3033 		.maxlen		= sizeof(int),
3034 		.mode		= 0644,
3035 		.proc_handler	= &proc_dointvec,
3036 	},
3037 	{
3038 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3039 		.procname	= "secret_interval",
3040 		.data		= &ip_rt_secret_interval,
3041 		.maxlen		= sizeof(int),
3042 		.mode		= 0644,
3043 		.proc_handler	= &proc_dointvec_jiffies,
3044 		.strategy	= &sysctl_jiffies,
3045 	},
3046 	{ .ctl_name = 0 }
3047 };
3048 #endif
3049 
3050 #ifdef CONFIG_NET_CLS_ROUTE
3051 struct ip_rt_acct *ip_rt_acct;
3052 
3053 /* This code sucks.  But you should have seen it before! --RR */
3054 
3055 /* IP route accounting ptr for this logical cpu number. */
3056 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3057 
3058 #ifdef CONFIG_PROC_FS
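/*
 *	/proc read handler (registered below as "rt_acct" under proc_net):
 *	returns the per-CPU route accounting counters summed over all CPUs
 *	as raw u32 words.
 */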
3059 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3060 			   int length, int *eof, void *data)
3061 {
3062 	unsigned int i;
3063 
3064 	if ((offset & 3) || (length & 3))
3065 		return -EIO;
3066 
3067 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
3068 		*eof = 1;
3069 		return 0;
3070 	}
3071 
3072 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3073 		length = sizeof(struct ip_rt_acct) * 256 - offset;
3074 		*eof = 1;
3075 	}
3076 
3077 	offset /= sizeof(u32);
3078 
3079 	if (length > 0) {
3080 		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3081 		u32 *dst = (u32 *) buffer;
3082 
3083 		/* Copy first cpu. */
3084 		*start = buffer;
3085 		memcpy(dst, src, length);
3086 
3087 		/* Add the other cpus in, one int at a time */
3088 		for_each_cpu(i) {
3089 			unsigned int j;
3090 
3091 			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3092 
3093 			for (j = 0; j < length/4; j++)
3094 				dst[j] += src[j];
3095 		}
3096 	}
3097 	return length;
3098 }
3099 #endif /* CONFIG_PROC_FS */
3100 #endif /* CONFIG_NET_CLS_ROUTE */
3101 
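/*
 *	"rhash_entries=" kernel boot parameter: overrides the automatically
 *	sized route cache hash table passed to alloc_large_system_hash() in
 *	ip_rt_init(), e.g. rhash_entries=65536 on the kernel command line.
 */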
3102 static __initdata unsigned long rhash_entries;
3103 static int __init set_rhash_entries(char *str)
3104 {
3105 	if (!str)
3106 		return 0;
3107 	rhash_entries = simple_strtoul(str, &str, 0);
3108 	return 1;
3109 }
3110 __setup("rhash_entries=", set_rhash_entries);
3111 
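/*
 *	Boot-time initialisation: seed the hash secret, create the dst slab
 *	and the route cache hash table, size the GC thresholds from it,
 *	start the flush/expire/secret-rebuild timers and register the
 *	/proc and xfrm hooks.
 */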
3112 int __init ip_rt_init(void)
3113 {
3114 	int rc = 0;
3115 
3116 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3117 			     (jiffies ^ (jiffies >> 7)));
3118 
3119 #ifdef CONFIG_NET_CLS_ROUTE
3120 	{
3121 	int order;
3122 	for (order = 0;
3123 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3124 		/* NOTHING */;
3125 	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3126 	if (!ip_rt_acct)
3127 		panic("IP: failed to allocate ip_rt_acct\n");
3128 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
3129 	}
3130 #endif
3131 
3132 	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3133 						     sizeof(struct rtable),
3134 						     0, SLAB_HWCACHE_ALIGN,
3135 						     NULL, NULL);
3136 
3137 	if (!ipv4_dst_ops.kmem_cachep)
3138 		panic("IP: failed to allocate ip_dst_cache\n");
3139 
3140 	rt_hash_table = (struct rt_hash_bucket *)
3141 		alloc_large_system_hash("IP route cache",
3142 					sizeof(struct rt_hash_bucket),
3143 					rhash_entries,
3144 					(num_physpages >= 128 * 1024) ?
3145 						(27 - PAGE_SHIFT) :
3146 						(29 - PAGE_SHIFT),
3147 					HASH_HIGHMEM,
3148 					&rt_hash_log,
3149 					&rt_hash_mask,
3150 					0);
3151 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3152 	rt_hash_lock_init();
3153 
3154 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3155 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3156 
3157 	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3158 	if (!rt_cache_stat)
3159 		return -ENOMEM;
3160 
3161 	devinet_init();
3162 	ip_fib_init();
3163 
3164 	init_timer(&rt_flush_timer);
3165 	rt_flush_timer.function = rt_run_flush;
3166 	init_timer(&rt_periodic_timer);
3167 	rt_periodic_timer.function = rt_check_expire;
3168 	init_timer(&rt_secret_timer);
3169 	rt_secret_timer.function = rt_secret_rebuild;
3170 
3171 	/* All the timers started at system startup tend
3172 	   to synchronize. Perturb them a bit.
3173 	 */
3174 	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3175 					ip_rt_gc_interval;
3176 	add_timer(&rt_periodic_timer);
3177 
3178 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3179 		ip_rt_secret_interval;
3180 	add_timer(&rt_secret_timer);
3181 
3182 #ifdef CONFIG_PROC_FS
3183 	{
3184 	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3185 	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3186 	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3187 			    		     proc_net_stat))) {
3188 		free_percpu(rt_cache_stat);
3189 		return -ENOMEM;
3190 	}
3191 	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3192 	}
3193 #ifdef CONFIG_NET_CLS_ROUTE
3194 	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3195 #endif
3196 #endif
3197 #ifdef CONFIG_XFRM
3198 	xfrm_init();
3199 	xfrm4_init();
3200 #endif
3201 	return rc;
3202 }
3203 
3204 EXPORT_SYMBOL(__ip_select_ident);
3205 EXPORT_SYMBOL(ip_route_input);
3206 EXPORT_SYMBOL(ip_route_output_key);
3207