xref: /linux/net/ipv4/route.c (revision f24e9f586b377749dff37554696cf3a105540c94)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *		Alan Cox	:	Verify area fixes.
18  *		Alan Cox	:	cli() protects routing changes
19  *		Rui Oliveira	:	ICMP routing table updates
20  *		(rco@di.uminho.pt)	Routing table insertion and update
21  *		Linus Torvalds	:	Rewrote bits to be sensible
22  *		Alan Cox	:	Added BSD route gw semantics
23  *		Alan Cox	:	Super /proc >4K
24  *		Alan Cox	:	MTU in route table
25  *		Alan Cox	: 	MSS actually. Also added the window
26  *					clamper.
27  *		Sam Lantinga	:	Fixed route matching in rt_del()
28  *		Alan Cox	:	Routing cache support.
29  *		Alan Cox	:	Removed compatibility cruft.
30  *		Alan Cox	:	RTF_REJECT support.
31  *		Alan Cox	:	TCP irtt support.
32  *		Jonathan Naylor	:	Added Metric support.
33  *	Miquel van Smoorenburg	:	BSD API fixes.
34  *	Miquel van Smoorenburg	:	Metrics.
35  *		Alan Cox	:	Use __u32 properly
36  *		Alan Cox	:	Aligned routing errors more closely with BSD;
37  *					our system is still very different.
38  *		Alan Cox	:	Faster /proc handling
39  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40  *					routing caches and better behaviour.
41  *
42  *		Olaf Erb	:	irtt wasn't being copied right.
43  *		Bjorn Ekwall	:	Kerneld route support.
44  *		Alan Cox	:	Multicast fixed (I hope)
45  * 		Pavel Krauz	:	Limited broadcast fixed
46  *		Mike McLagan	:	Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
48  *					route.c and rewritten from scratch.
49  *		Andi Kleen	:	Load-limit warning messages.
50  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54  *		Marc Boucher	:	routing by fwmark
55  *	Robert Olsson		:	Added rt_cache statistics
56  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
57  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
58  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
59  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
60  *
61  *		This program is free software; you can redistribute it and/or
62  *		modify it under the terms of the GNU General Public License
63  *		as published by the Free Software Foundation; either version
64  *		2 of the License, or (at your option) any later version.
65  */
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/mm.h>
75 #include <linux/bootmem.h>
76 #include <linux/string.h>
77 #include <linux/socket.h>
78 #include <linux/sockios.h>
79 #include <linux/errno.h>
80 #include <linux/in.h>
81 #include <linux/inet.h>
82 #include <linux/netdevice.h>
83 #include <linux/proc_fs.h>
84 #include <linux/init.h>
85 #include <linux/skbuff.h>
86 #include <linux/rtnetlink.h>
87 #include <linux/inetdevice.h>
88 #include <linux/igmp.h>
89 #include <linux/pkt_sched.h>
90 #include <linux/mroute.h>
91 #include <linux/netfilter_ipv4.h>
92 #include <linux/random.h>
93 #include <linux/jhash.h>
94 #include <linux/rcupdate.h>
95 #include <linux/times.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/ip_mp_alg.h>
107 #include <net/netevent.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 
112 #define RT_FL_TOS(oldflp) \
113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 
115 #define IP_MAX_MTU	0xFFF0
116 
117 #define RT_GC_TIMEOUT (300*HZ)
118 
119 static int ip_rt_min_delay		= 2 * HZ;
120 static int ip_rt_max_delay		= 10 * HZ;
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval		= 60 * HZ;
124 static int ip_rt_gc_min_interval	= HZ / 2;
125 static int ip_rt_redirect_number	= 9;
126 static int ip_rt_redirect_load		= HZ / 50;
127 static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost		= HZ;
129 static int ip_rt_error_burst		= 5 * HZ;
130 static int ip_rt_gc_elasticity		= 8;
131 static int ip_rt_mtu_expires		= 10 * 60 * HZ;
132 static int ip_rt_min_pmtu		= 512 + 20 + 20;
133 static int ip_rt_min_advmss		= 256;
134 static int ip_rt_secret_interval	= 10 * 60 * HZ;
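/*
 * Note on the defaults above: they are expressed in jiffies.  The silence
 * window equals ip_rt_redirect_load << (ip_rt_redirect_number + 1), i.e. the
 * point where the exponential redirect backoff is assumed to have run its
 * course.  Most of these knobs are runtime-tunable; in this kernel they are
 * conventionally exported through the sysctl table further down in this file
 * (not shown in this excerpt), under /proc/sys/net/ipv4/route/.
 */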
135 static unsigned long rt_deadline;
136 
137 #define RTprint(a...)	printk(KERN_DEBUG a)
138 
139 static struct timer_list rt_flush_timer;
140 static struct timer_list rt_periodic_timer;
141 static struct timer_list rt_secret_timer;
142 
143 /*
144  *	Interface to generic destination cache.
145  */
146 
147 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
148 static void		 ipv4_dst_destroy(struct dst_entry *dst);
149 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
150 					 struct net_device *dev, int how);
151 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
152 static void		 ipv4_link_failure(struct sk_buff *skb);
153 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
154 static int rt_garbage_collect(void);
155 
156 
157 static struct dst_ops ipv4_dst_ops = {
158 	.family =		AF_INET,
159 	.protocol =		__constant_htons(ETH_P_IP),
160 	.gc =			rt_garbage_collect,
161 	.check =		ipv4_dst_check,
162 	.destroy =		ipv4_dst_destroy,
163 	.ifdown =		ipv4_dst_ifdown,
164 	.negative_advice =	ipv4_negative_advice,
165 	.link_failure =		ipv4_link_failure,
166 	.update_pmtu =		ip_rt_update_pmtu,
167 	.entry_size =		sizeof(struct rtable),
168 };
169 
170 #define ECN_OR_COST(class)	TC_PRIO_##class
171 
172 __u8 ip_tos2prio[16] = {
173 	TC_PRIO_BESTEFFORT,
174 	ECN_OR_COST(FILLER),
175 	TC_PRIO_BESTEFFORT,
176 	ECN_OR_COST(BESTEFFORT),
177 	TC_PRIO_BULK,
178 	ECN_OR_COST(BULK),
179 	TC_PRIO_BULK,
180 	ECN_OR_COST(BULK),
181 	TC_PRIO_INTERACTIVE,
182 	ECN_OR_COST(INTERACTIVE),
183 	TC_PRIO_INTERACTIVE,
184 	ECN_OR_COST(INTERACTIVE),
185 	TC_PRIO_INTERACTIVE_BULK,
186 	ECN_OR_COST(INTERACTIVE_BULK),
187 	TC_PRIO_INTERACTIVE_BULK,
188 	ECN_OR_COST(INTERACTIVE_BULK)
189 };
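/*
 * The table above maps the 4-bit IP TOS field to a packet scheduler priority
 * band.  For illustration: the rt_tos2priority() helper (in <net/route.h> in
 * this kernel era) indexes it as ip_tos2prio[IPTOS_TOS(tos) >> 1], so e.g.
 * IPTOS_LOWDELAY (0x10) selects TC_PRIO_INTERACTIVE.
 */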
190 
191 
192 /*
193  * Route cache.
194  */
195 
196 /* The locking scheme is rather straightforward:
197  *
198  * 1) Read-Copy Update protects the buckets of the central route hash.
199  * 2) Only writers remove entries, and they hold the lock
200  *    as they look at rtable reference counts.
201  * 3) Only readers acquire references to rtable entries;
202  *    they do so with atomic increments and with the
203  *    lock held.
204  */
205 
206 struct rt_hash_bucket {
207 	struct rtable	*chain;
208 };
209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
210 	defined(CONFIG_PROVE_LOCKING)
211 /*
212  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
213  * The size of this table is a power of two and depends on the number of CPUs.
214  * (With lockdep, spinlock_t is quite big, so keep the table small there.)
215  */
216 #ifdef CONFIG_LOCKDEP
217 # define RT_HASH_LOCK_SZ	256
218 #else
219 # if NR_CPUS >= 32
220 #  define RT_HASH_LOCK_SZ	4096
221 # elif NR_CPUS >= 16
222 #  define RT_HASH_LOCK_SZ	2048
223 # elif NR_CPUS >= 8
224 #  define RT_HASH_LOCK_SZ	1024
225 # elif NR_CPUS >= 4
226 #  define RT_HASH_LOCK_SZ	512
227 # else
228 #  define RT_HASH_LOCK_SZ	256
229 # endif
230 #endif
231 
232 static spinlock_t	*rt_hash_locks;
233 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
234 # define rt_hash_lock_init()	{ \
235 		int i; \
236 		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
237 		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
238 		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
239 			spin_lock_init(&rt_hash_locks[i]); \
240 		}
241 #else
242 # define rt_hash_lock_addr(slot) NULL
243 # define rt_hash_lock_init()
244 #endif
245 
246 static struct rt_hash_bucket 	*rt_hash_table;
247 static unsigned			rt_hash_mask;
248 static int			rt_hash_log;
249 static unsigned int		rt_hash_rnd;
250 
251 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
252 #define RT_CACHE_STAT_INC(field) \
253 	(__raw_get_cpu_var(rt_cache_stat).field++)
254 
255 static int rt_intern_hash(unsigned hash, struct rtable *rth,
256 				struct rtable **res);
257 
258 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
259 {
260 	return (jhash_2words(daddr, saddr, rt_hash_rnd)
261 		& rt_hash_mask);
262 }
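/*
 * rt_hash_rnd is re-randomized on every full cache flush (see rt_run_flush()
 * and rt_secret_rebuild() below), which bounds how long a deliberately
 * colliding set of flows can keep one chain long.  Call sites typically fold
 * the interface index into one of the addresses before hashing, e.g.
 * rt_hash_code(daddr, saddr ^ (dev->ifindex << 5)).
 */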
263 
264 #ifdef CONFIG_PROC_FS
265 struct rt_cache_iter_state {
266 	int bucket;
267 };
268 
269 static struct rtable *rt_cache_get_first(struct seq_file *seq)
270 {
271 	struct rtable *r = NULL;
272 	struct rt_cache_iter_state *st = seq->private;
273 
274 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
275 		rcu_read_lock_bh();
276 		r = rt_hash_table[st->bucket].chain;
277 		if (r)
278 			break;
279 		rcu_read_unlock_bh();
280 	}
281 	return r;
282 }
283 
284 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
285 {
286 	struct rt_cache_iter_state *st = rcu_dereference(seq->private);
287 
288 	r = r->u.rt_next;
289 	while (!r) {
290 		rcu_read_unlock_bh();
291 		if (--st->bucket < 0)
292 			break;
293 		rcu_read_lock_bh();
294 		r = rt_hash_table[st->bucket].chain;
295 	}
296 	return r;
297 }
298 
299 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
300 {
301 	struct rtable *r = rt_cache_get_first(seq);
302 
303 	if (r)
304 		while (pos && (r = rt_cache_get_next(seq, r)))
305 			--pos;
306 	return pos ? NULL : r;
307 }
308 
309 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
310 {
311 	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
312 }
313 
314 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
315 {
316 	struct rtable *r = NULL;
317 
318 	if (v == SEQ_START_TOKEN)
319 		r = rt_cache_get_first(seq);
320 	else
321 		r = rt_cache_get_next(seq, v);
322 	++*pos;
323 	return r;
324 }
325 
326 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
327 {
328 	if (v && v != SEQ_START_TOKEN)
329 		rcu_read_unlock_bh();
330 }
331 
332 static int rt_cache_seq_show(struct seq_file *seq, void *v)
333 {
334 	if (v == SEQ_START_TOKEN)
335 		seq_printf(seq, "%-127s\n",
336 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
337 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
338 			   "HHUptod\tSpecDst");
339 	else {
340 		struct rtable *r = v;
341 		char temp[256];
342 
343 		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
344 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
345 			r->u.dst.dev ? r->u.dst.dev->name : "*",
346 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
347 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
348 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
349 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
350 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
351 			dst_metric(&r->u.dst, RTAX_WINDOW),
352 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
353 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
354 			r->fl.fl4_tos,
355 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
356 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
357 				       dev_queue_xmit) : 0,
358 			r->rt_spec_dst);
359 		seq_printf(seq, "%-127s\n", temp);
360         }
361   	return 0;
362 }
363 
364 static struct seq_operations rt_cache_seq_ops = {
365 	.start  = rt_cache_seq_start,
366 	.next   = rt_cache_seq_next,
367 	.stop   = rt_cache_seq_stop,
368 	.show   = rt_cache_seq_show,
369 };
370 
371 static int rt_cache_seq_open(struct inode *inode, struct file *file)
372 {
373 	struct seq_file *seq;
374 	int rc = -ENOMEM;
375 	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
376 
377 	if (!s)
378 		goto out;
379 	rc = seq_open(file, &rt_cache_seq_ops);
380 	if (rc)
381 		goto out_kfree;
382 	seq          = file->private_data;
383 	seq->private = s;
384 	memset(s, 0, sizeof(*s));
385 out:
386 	return rc;
387 out_kfree:
388 	kfree(s);
389 	goto out;
390 }
391 
392 static struct file_operations rt_cache_seq_fops = {
393 	.owner	 = THIS_MODULE,
394 	.open	 = rt_cache_seq_open,
395 	.read	 = seq_read,
396 	.llseek	 = seq_lseek,
397 	.release = seq_release_private,
398 };
399 
400 
401 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
402 {
403 	int cpu;
404 
405 	if (*pos == 0)
406 		return SEQ_START_TOKEN;
407 
408 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
409 		if (!cpu_possible(cpu))
410 			continue;
411 		*pos = cpu+1;
412 		return &per_cpu(rt_cache_stat, cpu);
413 	}
414 	return NULL;
415 }
416 
417 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
418 {
419 	int cpu;
420 
421 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
422 		if (!cpu_possible(cpu))
423 			continue;
424 		*pos = cpu+1;
425 		return &per_cpu(rt_cache_stat, cpu);
426 	}
427 	return NULL;
428 
429 }
430 
431 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
432 {
433 
434 }
435 
436 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
437 {
438 	struct rt_cache_stat *st = v;
439 
440 	if (v == SEQ_START_TOKEN) {
441 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
442 		return 0;
443 	}
444 
445 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
446 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
447 		   atomic_read(&ipv4_dst_ops.entries),
448 		   st->in_hit,
449 		   st->in_slow_tot,
450 		   st->in_slow_mc,
451 		   st->in_no_route,
452 		   st->in_brd,
453 		   st->in_martian_dst,
454 		   st->in_martian_src,
455 
456 		   st->out_hit,
457 		   st->out_slow_tot,
458 		   st->out_slow_mc,
459 
460 		   st->gc_total,
461 		   st->gc_ignored,
462 		   st->gc_goal_miss,
463 		   st->gc_dst_overflow,
464 		   st->in_hlist_search,
465 		   st->out_hlist_search
466 		);
467 	return 0;
468 }
469 
470 static struct seq_operations rt_cpu_seq_ops = {
471 	.start  = rt_cpu_seq_start,
472 	.next   = rt_cpu_seq_next,
473 	.stop   = rt_cpu_seq_stop,
474 	.show   = rt_cpu_seq_show,
475 };
476 
477 
478 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
479 {
480 	return seq_open(file, &rt_cpu_seq_ops);
481 }
482 
483 static struct file_operations rt_cpu_seq_fops = {
484 	.owner	 = THIS_MODULE,
485 	.open	 = rt_cpu_seq_open,
486 	.read	 = seq_read,
487 	.llseek	 = seq_lseek,
488 	.release = seq_release,
489 };
490 
491 #endif /* CONFIG_PROC_FS */
492 
493 static __inline__ void rt_free(struct rtable *rt)
494 {
495 	multipath_remove(rt);
496 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
497 }
498 
499 static __inline__ void rt_drop(struct rtable *rt)
500 {
501 	multipath_remove(rt);
502 	ip_rt_put(rt);
503 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
504 }
505 
506 static __inline__ int rt_fast_clean(struct rtable *rth)
507 {
508 	/* Kill broadcast/multicast entries very aggressively, if they
509 	   collide in the hash table with more useful entries */
510 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
511 		rth->fl.iif && rth->u.rt_next;
512 }
513 
514 static __inline__ int rt_valuable(struct rtable *rth)
515 {
516 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
517 		rth->u.dst.expires;
518 }
519 
520 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
521 {
522 	unsigned long age;
523 	int ret = 0;
524 
525 	if (atomic_read(&rth->u.dst.__refcnt))
526 		goto out;
527 
528 	ret = 1;
529 	if (rth->u.dst.expires &&
530 	    time_after_eq(jiffies, rth->u.dst.expires))
531 		goto out;
532 
533 	age = jiffies - rth->u.dst.lastuse;
534 	ret = 0;
535 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
536 	    (age <= tmo2 && rt_valuable(rth)))
537 		goto out;
538 	ret = 1;
539 out:	return ret;
540 }
541 
542 /* Bits of score are:
543  * 31: very valuable
544  * 30: not quite useless
545  * 29..0: usage counter
546  */
547 static inline u32 rt_score(struct rtable *rt)
548 {
549 	u32 score = jiffies - rt->u.dst.lastuse;
550 
551 	score = ~score & ~(3<<30);
552 
553 	if (rt_valuable(rt))
554 		score |= (1<<31);
555 
556 	if (!rt->fl.iif ||
557 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
558 		score |= (1<<30);
559 
560 	return score;
561 }
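/*
 * Lower scores mark less valuable entries: rt_intern_hash() below tracks the
 * unreferenced entry with the minimum score and evicts it once a hash chain
 * grows past ip_rt_gc_elasticity.
 */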
562 
563 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
564 {
565 	return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
566 	       fl1->oif     == fl2->oif &&
567 	       fl1->iif     == fl2->iif;
568 }
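/*
 * compare_keys() matches the whole IPv4 flow key (the nl_u.ip4_u union) plus
 * the input and output interface indices.  It is used both to collapse
 * duplicate insertions in rt_intern_hash() and, with multipath caching, to
 * group DST_BALANCED siblings in rt_remove_balanced_route().
 */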
569 
570 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
571 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
572 						struct rtable *expentry,
573 						int *removed_count)
574 {
575 	int passedexpired = 0;
576 	struct rtable **nextstep = NULL;
577 	struct rtable **rthp = chain_head;
578 	struct rtable *rth;
579 
580 	if (removed_count)
581 		*removed_count = 0;
582 
583 	while ((rth = *rthp) != NULL) {
584 		if (rth == expentry)
585 			passedexpired = 1;
586 
587 		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
588 		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
589 			if (*rthp == expentry) {
590 				*rthp = rth->u.rt_next;
591 				continue;
592 			} else {
593 				*rthp = rth->u.rt_next;
594 				rt_free(rth);
595 				if (removed_count)
596 					++(*removed_count);
597 			}
598 		} else {
599 			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
600 			    passedexpired && !nextstep)
601 				nextstep = &rth->u.rt_next;
602 
603 			rthp = &rth->u.rt_next;
604 		}
605 	}
606 
607 	rt_free(expentry);
608 	if (removed_count)
609 		++(*removed_count);
610 
611 	return nextstep;
612 }
613 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
614 
615 
616 /* This runs via a timer and thus is always in BH context. */
617 static void rt_check_expire(unsigned long dummy)
618 {
619 	static unsigned int rover;
620 	unsigned int i = rover, goal;
621 	struct rtable *rth, **rthp;
622 	unsigned long now = jiffies;
623 	u64 mult;
624 
625 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
626 	if (ip_rt_gc_timeout > 1)
627 		do_div(mult, ip_rt_gc_timeout);
628 	goal = (unsigned int)mult;
629 	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
630 	for (; goal > 0; goal--) {
631 		unsigned long tmo = ip_rt_gc_timeout;
632 
633 		i = (i + 1) & rt_hash_mask;
634 		rthp = &rt_hash_table[i].chain;
635 
636 		if (*rthp == 0)
637 			continue;
638 		spin_lock(rt_hash_lock_addr(i));
639 		while ((rth = *rthp) != NULL) {
640 			if (rth->u.dst.expires) {
641 				/* Entry is expired even if it is in use */
642 				if (time_before_eq(now, rth->u.dst.expires)) {
643 					tmo >>= 1;
644 					rthp = &rth->u.rt_next;
645 					continue;
646 				}
647 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
648 				tmo >>= 1;
649 				rthp = &rth->u.rt_next;
650 				continue;
651 			}
652 
653 			/* Cleanup aged off entries. */
654 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
655 			/* remove all related balanced entries if necessary */
656 			if (rth->u.dst.flags & DST_BALANCED) {
657 				rthp = rt_remove_balanced_route(
658 					&rt_hash_table[i].chain,
659 					rth, NULL);
660 				if (!rthp)
661 					break;
662 			} else {
663 				*rthp = rth->u.rt_next;
664 				rt_free(rth);
665 			}
666 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
667  			*rthp = rth->u.rt_next;
668  			rt_free(rth);
669 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
670 		}
671 		spin_unlock(rt_hash_lock_addr(i));
672 
673 		/* Fallback loop breaker. */
674 		if (time_after(jiffies, now))
675 			break;
676 	}
677 	rover = i;
678 	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
679 }
680 
681 /* This can run from both BH and non-BH contexts, the latter
682  * in the case of a forced flush event.
683  */
684 static void rt_run_flush(unsigned long dummy)
685 {
686 	int i;
687 	struct rtable *rth, *next;
688 
689 	rt_deadline = 0;
690 
691 	get_random_bytes(&rt_hash_rnd, 4);
692 
693 	for (i = rt_hash_mask; i >= 0; i--) {
694 		spin_lock_bh(rt_hash_lock_addr(i));
695 		rth = rt_hash_table[i].chain;
696 		if (rth)
697 			rt_hash_table[i].chain = NULL;
698 		spin_unlock_bh(rt_hash_lock_addr(i));
699 
700 		for (; rth; rth = next) {
701 			next = rth->u.rt_next;
702 			rt_free(rth);
703 		}
704 	}
705 }
706 
707 static DEFINE_SPINLOCK(rt_flush_lock);
708 
709 void rt_cache_flush(int delay)
710 {
711 	unsigned long now = jiffies;
712 	int user_mode = !in_softirq();
713 
714 	if (delay < 0)
715 		delay = ip_rt_min_delay;
716 
717 	/* flush existing multipath state*/
718 	multipath_flush();
719 
720 	spin_lock_bh(&rt_flush_lock);
721 
722 	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
723 		long tmo = (long)(rt_deadline - now);
724 
725 		/* If the flush timer is already running
726 		   and the flush request is not immediate (delay > 0):
727 
728 		   if the deadline has not been reached yet, prolong the timer to "delay",
729 		   otherwise fire it at the deadline time.
730 		 */
731 
732 		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
733 			tmo = 0;
734 
735 		if (delay > tmo)
736 			delay = tmo;
737 	}
738 
739 	if (delay <= 0) {
740 		spin_unlock_bh(&rt_flush_lock);
741 		rt_run_flush(0);
742 		return;
743 	}
744 
745 	if (rt_deadline == 0)
746 		rt_deadline = now + ip_rt_max_delay;
747 
748 	mod_timer(&rt_flush_timer, now+delay);
749 	spin_unlock_bh(&rt_flush_lock);
750 }
751 
752 static void rt_secret_rebuild(unsigned long dummy)
753 {
754 	unsigned long now = jiffies;
755 
756 	rt_cache_flush(0);
757 	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
758 }
759 
760 /*
761    Short description of GC goals.
762 
763    We want to build an algorithm which keeps the routing cache
764    at some equilibrium point, where the number of aged-off entries
765    stays approximately equal to the number of newly generated ones.
766 
767    The current expiration strength is the variable "expire".
768    We try to adjust it dynamically, so that when the network
769    is idle "expire" is large enough to keep plenty of warm entries,
770    and when load increases it shrinks to limit the cache size.
771  */
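/*
 * Concretely, as implemented in rt_garbage_collect() below: "expire" starts
 * at RT_GC_TIMEOUT, is halved on every pass that misses its goal, and grows
 * back by ip_rt_gc_min_interval (reset to ip_rt_gc_timeout once it exceeds
 * it, or while the cache is below gc_thresh) each time the goal is met.
 */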
772 
773 static int rt_garbage_collect(void)
774 {
775 	static unsigned long expire = RT_GC_TIMEOUT;
776 	static unsigned long last_gc;
777 	static int rover;
778 	static int equilibrium;
779 	struct rtable *rth, **rthp;
780 	unsigned long now = jiffies;
781 	int goal;
782 
783 	/*
784 	 * Garbage collection is pretty expensive,
785 	 * do not make it too frequently.
786 	 */
787 
788 	RT_CACHE_STAT_INC(gc_total);
789 
790 	if (now - last_gc < ip_rt_gc_min_interval &&
791 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
792 		RT_CACHE_STAT_INC(gc_ignored);
793 		goto out;
794 	}
795 
796 	/* Calculate the number of entries which we want to expire now. */
797 	goal = atomic_read(&ipv4_dst_ops.entries) -
798 		(ip_rt_gc_elasticity << rt_hash_log);
799 	if (goal <= 0) {
800 		if (equilibrium < ipv4_dst_ops.gc_thresh)
801 			equilibrium = ipv4_dst_ops.gc_thresh;
802 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
803 		if (goal > 0) {
804 			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
805 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
806 		}
807 	} else {
808 		/* We are in a dangerous area. Try to reduce the cache really
809 		 * aggressively.
810 		 */
811 		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
812 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
813 	}
814 
815 	if (now - last_gc >= ip_rt_gc_min_interval)
816 		last_gc = now;
817 
818 	if (goal <= 0) {
819 		equilibrium += goal;
820 		goto work_done;
821 	}
822 
823 	do {
824 		int i, k;
825 
826 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
827 			unsigned long tmo = expire;
828 
829 			k = (k + 1) & rt_hash_mask;
830 			rthp = &rt_hash_table[k].chain;
831 			spin_lock_bh(rt_hash_lock_addr(k));
832 			while ((rth = *rthp) != NULL) {
833 				if (!rt_may_expire(rth, tmo, expire)) {
834 					tmo >>= 1;
835 					rthp = &rth->u.rt_next;
836 					continue;
837 				}
838 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
839 				/* remove all related balanced entries
840 				 * if necessary
841 				 */
842 				if (rth->u.dst.flags & DST_BALANCED) {
843 					int r;
844 
845 					rthp = rt_remove_balanced_route(
846 						&rt_hash_table[k].chain,
847 						rth,
848 						&r);
849 					goal -= r;
850 					if (!rthp)
851 						break;
852 				} else {
853 					*rthp = rth->u.rt_next;
854 					rt_free(rth);
855 					goal--;
856 				}
857 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
858 				*rthp = rth->u.rt_next;
859 				rt_free(rth);
860 				goal--;
861 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
862 			}
863 			spin_unlock_bh(rt_hash_lock_addr(k));
864 			if (goal <= 0)
865 				break;
866 		}
867 		rover = k;
868 
869 		if (goal <= 0)
870 			goto work_done;
871 
872 		/* Goal is not achieved. We stop the process if:
873 
874 		   - expire has been reduced to zero (otherwise expire is halved);
875 		   - the table is not full;
876 		   - we are called from interrupt context;
877 		   - the jiffies check is just a fallback/debug loop breaker.
878 		     We will not spin here for a long time in any case.
879 		 */
880 
881 		RT_CACHE_STAT_INC(gc_goal_miss);
882 
883 		if (expire == 0)
884 			break;
885 
886 		expire >>= 1;
887 #if RT_CACHE_DEBUG >= 2
888 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
889 				atomic_read(&ipv4_dst_ops.entries), goal, i);
890 #endif
891 
892 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
893 			goto out;
894 	} while (!in_softirq() && time_before_eq(jiffies, now));
895 
896 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
897 		goto out;
898 	if (net_ratelimit())
899 		printk(KERN_WARNING "dst cache overflow\n");
900 	RT_CACHE_STAT_INC(gc_dst_overflow);
901 	return 1;
902 
903 work_done:
904 	expire += ip_rt_gc_min_interval;
905 	if (expire > ip_rt_gc_timeout ||
906 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
907 		expire = ip_rt_gc_timeout;
908 #if RT_CACHE_DEBUG >= 2
909 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
910 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
911 #endif
912 out:	return 0;
913 }
914 
915 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
916 {
917 	struct rtable	*rth, **rthp;
918 	unsigned long	now;
919 	struct rtable *cand, **candp;
920 	u32 		min_score;
921 	int		chain_length;
922 	int attempts = !in_softirq();
923 
924 restart:
925 	chain_length = 0;
926 	min_score = ~(u32)0;
927 	cand = NULL;
928 	candp = NULL;
929 	now = jiffies;
930 
931 	rthp = &rt_hash_table[hash].chain;
932 
933 	spin_lock_bh(rt_hash_lock_addr(hash));
934 	while ((rth = *rthp) != NULL) {
935 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
936 		if (!(rth->u.dst.flags & DST_BALANCED) &&
937 		    compare_keys(&rth->fl, &rt->fl)) {
938 #else
939 		if (compare_keys(&rth->fl, &rt->fl)) {
940 #endif
941 			/* Put it first */
942 			*rthp = rth->u.rt_next;
943 			/*
944 			 * Since lookup is lockfree, the deletion
945 			 * must be visible to another weakly ordered CPU before
946 			 * the insertion at the start of the hash chain.
947 			 */
948 			rcu_assign_pointer(rth->u.rt_next,
949 					   rt_hash_table[hash].chain);
950 			/*
951 			 * Since lookup is lockfree, the update writes
952 			 * must be ordered for consistency on SMP.
953 			 */
954 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
955 
956 			rth->u.dst.__use++;
957 			dst_hold(&rth->u.dst);
958 			rth->u.dst.lastuse = now;
959 			spin_unlock_bh(rt_hash_lock_addr(hash));
960 
961 			rt_drop(rt);
962 			*rp = rth;
963 			return 0;
964 		}
965 
966 		if (!atomic_read(&rth->u.dst.__refcnt)) {
967 			u32 score = rt_score(rth);
968 
969 			if (score <= min_score) {
970 				cand = rth;
971 				candp = rthp;
972 				min_score = score;
973 			}
974 		}
975 
976 		chain_length++;
977 
978 		rthp = &rth->u.rt_next;
979 	}
980 
981 	if (cand) {
982 		/* ip_rt_gc_elasticity used to be the average chain length;
983 		 * when it is exceeded, gc becomes really aggressive.
984 		 *
985 		 * The second limit is less certain. At the moment it allows
986 		 * only 2 entries per bucket. We will see.
987 		 */
988 		if (chain_length > ip_rt_gc_elasticity) {
989 			*candp = cand->u.rt_next;
990 			rt_free(cand);
991 		}
992 	}
993 
994 	/* Try to bind the route to an ARP neighbour only if it is an output
995 	   route or a unicast forwarding path.
996 	 */
997 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
998 		int err = arp_bind_neighbour(&rt->u.dst);
999 		if (err) {
1000 			spin_unlock_bh(rt_hash_lock_addr(hash));
1001 
1002 			if (err != -ENOBUFS) {
1003 				rt_drop(rt);
1004 				return err;
1005 			}
1006 
1007 			/* Neighbour tables are full and nothing
1008 			   can be released. Try to shrink the route cache;
1009 			   it most likely holds some neighbour records.
1010 			 */
1011 			if (attempts-- > 0) {
1012 				int saved_elasticity = ip_rt_gc_elasticity;
1013 				int saved_int = ip_rt_gc_min_interval;
1014 				ip_rt_gc_elasticity	= 1;
1015 				ip_rt_gc_min_interval	= 0;
1016 				rt_garbage_collect();
1017 				ip_rt_gc_min_interval	= saved_int;
1018 				ip_rt_gc_elasticity	= saved_elasticity;
1019 				goto restart;
1020 			}
1021 
1022 			if (net_ratelimit())
1023 				printk(KERN_WARNING "Neighbour table overflow.\n");
1024 			rt_drop(rt);
1025 			return -ENOBUFS;
1026 		}
1027 	}
1028 
1029 	rt->u.rt_next = rt_hash_table[hash].chain;
1030 #if RT_CACHE_DEBUG >= 2
1031 	if (rt->u.rt_next) {
1032 		struct rtable *trt;
1033 		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1034 		       NIPQUAD(rt->rt_dst));
1035 		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1036 			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1037 		printk("\n");
1038 	}
1039 #endif
1040 	rt_hash_table[hash].chain = rt;
1041 	spin_unlock_bh(rt_hash_lock_addr(hash));
1042 	*rp = rt;
1043 	return 0;
1044 }
1045 
1046 void rt_bind_peer(struct rtable *rt, int create)
1047 {
1048 	static DEFINE_SPINLOCK(rt_peer_lock);
1049 	struct inet_peer *peer;
1050 
1051 	peer = inet_getpeer(rt->rt_dst, create);
1052 
1053 	spin_lock_bh(&rt_peer_lock);
1054 	if (rt->peer == NULL) {
1055 		rt->peer = peer;
1056 		peer = NULL;
1057 	}
1058 	spin_unlock_bh(&rt_peer_lock);
1059 	if (peer)
1060 		inet_putpeer(peer);
1061 }
1062 
1063 /*
1064  * Peer allocation may fail only in serious out-of-memory conditions.  However
1065  * we can still generate some output.
1066  * Random ID selection looks a bit dangerous because we have no chance of
1067  * selecting an ID that stays unique for a reasonable period of time.
1068  * But a broken packet identifier may be better than no packet at all.
1069  */
1070 static void ip_select_fb_ident(struct iphdr *iph)
1071 {
1072 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1073 	static u32 ip_fallback_id;
1074 	u32 salt;
1075 
1076 	spin_lock_bh(&ip_fb_id_lock);
1077 	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1078 	iph->id = htons(salt & 0xFFFF);
1079 	ip_fallback_id = salt;
1080 	spin_unlock_bh(&ip_fb_id_lock);
1081 }
1082 
1083 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1084 {
1085 	struct rtable *rt = (struct rtable *) dst;
1086 
1087 	if (rt) {
1088 		if (rt->peer == NULL)
1089 			rt_bind_peer(rt, 1);
1090 
1091 		/* If peer is attached to destination, it is never detached,
1092 		   so we need not grab a lock to dereference it.
1093 		 */
1094 		if (rt->peer) {
1095 			iph->id = htons(inet_getid(rt->peer, more));
1096 			return;
1097 		}
1098 	} else
1099 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1100 		       __builtin_return_address(0));
1101 
1102 	ip_select_fb_ident(iph);
1103 }
1104 
1105 static void rt_del(unsigned hash, struct rtable *rt)
1106 {
1107 	struct rtable **rthp;
1108 
1109 	spin_lock_bh(rt_hash_lock_addr(hash));
1110 	ip_rt_put(rt);
1111 	for (rthp = &rt_hash_table[hash].chain; *rthp;
1112 	     rthp = &(*rthp)->u.rt_next)
1113 		if (*rthp == rt) {
1114 			*rthp = rt->u.rt_next;
1115 			rt_free(rt);
1116 			break;
1117 		}
1118 	spin_unlock_bh(rt_hash_lock_addr(hash));
1119 }
1120 
1121 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1122 		    u32 saddr, struct net_device *dev)
1123 {
1124 	int i, k;
1125 	struct in_device *in_dev = in_dev_get(dev);
1126 	struct rtable *rth, **rthp;
1127 	u32  skeys[2] = { saddr, 0 };
1128 	int  ikeys[2] = { dev->ifindex, 0 };
1129 	struct netevent_redirect netevent;
1130 
1131 	if (!in_dev)
1132 		return;
1133 
1134 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1135 	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1136 		goto reject_redirect;
1137 
1138 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1139 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1140 			goto reject_redirect;
1141 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1142 			goto reject_redirect;
1143 	} else {
1144 		if (inet_addr_type(new_gw) != RTN_UNICAST)
1145 			goto reject_redirect;
1146 	}
1147 
1148 	for (i = 0; i < 2; i++) {
1149 		for (k = 0; k < 2; k++) {
1150 			unsigned hash = rt_hash_code(daddr,
1151 						     skeys[i] ^ (ikeys[k] << 5));
1152 
1153 			rthp=&rt_hash_table[hash].chain;
1154 
1155 			rcu_read_lock();
1156 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1157 				struct rtable *rt;
1158 
1159 				if (rth->fl.fl4_dst != daddr ||
1160 				    rth->fl.fl4_src != skeys[i] ||
1161 				    rth->fl.oif != ikeys[k] ||
1162 				    rth->fl.iif != 0) {
1163 					rthp = &rth->u.rt_next;
1164 					continue;
1165 				}
1166 
1167 				if (rth->rt_dst != daddr ||
1168 				    rth->rt_src != saddr ||
1169 				    rth->u.dst.error ||
1170 				    rth->rt_gateway != old_gw ||
1171 				    rth->u.dst.dev != dev)
1172 					break;
1173 
1174 				dst_hold(&rth->u.dst);
1175 				rcu_read_unlock();
1176 
1177 				rt = dst_alloc(&ipv4_dst_ops);
1178 				if (rt == NULL) {
1179 					ip_rt_put(rth);
1180 					in_dev_put(in_dev);
1181 					return;
1182 				}
1183 
1184 				/* Copy all the information. */
1185 				*rt = *rth;
1186  				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1187 				rt->u.dst.__use		= 1;
1188 				atomic_set(&rt->u.dst.__refcnt, 1);
1189 				rt->u.dst.child		= NULL;
1190 				if (rt->u.dst.dev)
1191 					dev_hold(rt->u.dst.dev);
1192 				if (rt->idev)
1193 					in_dev_hold(rt->idev);
1194 				rt->u.dst.obsolete	= 0;
1195 				rt->u.dst.lastuse	= jiffies;
1196 				rt->u.dst.path		= &rt->u.dst;
1197 				rt->u.dst.neighbour	= NULL;
1198 				rt->u.dst.hh		= NULL;
1199 				rt->u.dst.xfrm		= NULL;
1200 
1201 				rt->rt_flags		|= RTCF_REDIRECTED;
1202 
1203 				/* Gateway is different ... */
1204 				rt->rt_gateway		= new_gw;
1205 
1206 				/* Redirect received -> path was valid */
1207 				dst_confirm(&rth->u.dst);
1208 
1209 				if (rt->peer)
1210 					atomic_inc(&rt->peer->refcnt);
1211 
1212 				if (arp_bind_neighbour(&rt->u.dst) ||
1213 				    !(rt->u.dst.neighbour->nud_state &
1214 					    NUD_VALID)) {
1215 					if (rt->u.dst.neighbour)
1216 						neigh_event_send(rt->u.dst.neighbour, NULL);
1217 					ip_rt_put(rth);
1218 					rt_drop(rt);
1219 					goto do_next;
1220 				}
1221 
1222 				netevent.old = &rth->u.dst;
1223 				netevent.new = &rt->u.dst;
1224 				call_netevent_notifiers(NETEVENT_REDIRECT,
1225 						        &netevent);
1226 
1227 				rt_del(hash, rth);
1228 				if (!rt_intern_hash(hash, rt, &rt))
1229 					ip_rt_put(rt);
1230 				goto do_next;
1231 			}
1232 			rcu_read_unlock();
1233 		do_next:
1234 			;
1235 		}
1236 	}
1237 	in_dev_put(in_dev);
1238 	return;
1239 
1240 reject_redirect:
1241 #ifdef CONFIG_IP_ROUTE_VERBOSE
1242 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1243 		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1244 			"%u.%u.%u.%u ignored.\n"
1245 			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1246 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1247 		       NIPQUAD(saddr), NIPQUAD(daddr));
1248 #endif
1249 	in_dev_put(in_dev);
1250 }
1251 
1252 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1253 {
1254 	struct rtable *rt = (struct rtable*)dst;
1255 	struct dst_entry *ret = dst;
1256 
1257 	if (rt) {
1258 		if (dst->obsolete) {
1259 			ip_rt_put(rt);
1260 			ret = NULL;
1261 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1262 			   rt->u.dst.expires) {
1263 			unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1264 						     rt->fl.fl4_src ^
1265 							(rt->fl.oif << 5));
1266 #if RT_CACHE_DEBUG >= 1
1267 			printk(KERN_DEBUG "ip_rt_advice: redirect to "
1268 					  "%u.%u.%u.%u/%02x dropped\n",
1269 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1270 #endif
1271 			rt_del(hash, rt);
1272 			ret = NULL;
1273 		}
1274 	}
1275 	return ret;
1276 }
1277 
1278 /*
1279  * Algorithm:
1280  *	1. The first ip_rt_redirect_number redirects are sent
1281  *	   with exponential backoff, then we stop sending them at all,
1282  *	   assuming that the host ignores our redirects.
1283  *	2. If we did not see packets requiring redirects
1284  *	   during ip_rt_redirect_silence, we assume that the host
1285  *	   forgot the redirected route and start sending redirects again.
1286  *
1287  * This algorithm is much cheaper and more intelligent than dumb load limiting
1288  * in icmp.c.
1289  *
1290  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1291  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1292  */
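/*
 * With the defaults above (ip_rt_redirect_number = 9, ip_rt_redirect_load =
 * HZ/50): each successive redirect to a host is held back for at least
 * ip_rt_redirect_load << rate_tokens jiffies, so the gap doubles every time;
 * after 9 redirects we stop entirely until the host produces no
 * redirect-worthy traffic for ip_rt_redirect_silence = (HZ/50) << 10 jiffies,
 * at which point rate_tokens is reset and the sequence starts over.
 */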
1293 
1294 void ip_rt_send_redirect(struct sk_buff *skb)
1295 {
1296 	struct rtable *rt = (struct rtable*)skb->dst;
1297 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1298 
1299 	if (!in_dev)
1300 		return;
1301 
1302 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1303 		goto out;
1304 
1305 	/* No redirected packets during ip_rt_redirect_silence;
1306 	 * reset the algorithm.
1307 	 */
1308 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1309 		rt->u.dst.rate_tokens = 0;
1310 
1311 	/* Too many ignored redirects; do not send anything;
1312 	 * just set u.dst.rate_last to the last seen redirected packet.
1313 	 */
1314 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1315 		rt->u.dst.rate_last = jiffies;
1316 		goto out;
1317 	}
1318 
1319 	/* Check for load limit; set rate_last to the latest sent
1320 	 * redirect.
1321 	 */
1322 	if (time_after(jiffies,
1323 		       (rt->u.dst.rate_last +
1324 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1325 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1326 		rt->u.dst.rate_last = jiffies;
1327 		++rt->u.dst.rate_tokens;
1328 #ifdef CONFIG_IP_ROUTE_VERBOSE
1329 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1330 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1331 		    net_ratelimit())
1332 			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1333 				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1334 				NIPQUAD(rt->rt_src), rt->rt_iif,
1335 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1336 #endif
1337 	}
1338 out:
1339         in_dev_put(in_dev);
1340 }
1341 
1342 static int ip_error(struct sk_buff *skb)
1343 {
1344 	struct rtable *rt = (struct rtable*)skb->dst;
1345 	unsigned long now;
1346 	int code;
1347 
1348 	switch (rt->u.dst.error) {
1349 		case EINVAL:
1350 		default:
1351 			goto out;
1352 		case EHOSTUNREACH:
1353 			code = ICMP_HOST_UNREACH;
1354 			break;
1355 		case ENETUNREACH:
1356 			code = ICMP_NET_UNREACH;
1357 			break;
1358 		case EACCES:
1359 			code = ICMP_PKT_FILTERED;
1360 			break;
1361 	}
1362 
1363 	now = jiffies;
1364 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1365 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1366 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1367 	rt->u.dst.rate_last = now;
1368 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1369 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1370 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1371 	}
1372 
1373 out:	kfree_skb(skb);
1374 	return 0;
1375 }
1376 
1377 /*
1378  *	The last two values are not from the RFC but
1379  *	are needed for AMPRnet AX.25 paths.
1380  */
1381 
1382 static const unsigned short mtu_plateau[] =
1383 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1384 
1385 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1386 {
1387 	int i;
1388 
1389 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1390 		if (old_mtu > mtu_plateau[i])
1391 			return mtu_plateau[i];
1392 	return 68;
1393 }
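/*
 * Worked example: an ICMP "frag needed" quoting a zero next-hop MTU for a
 * 1500-byte packet yields guess_mtu(1500) == 1492, the next plateau down;
 * an old MTU of 128 or less falls through to 68, the IPv4 minimum.
 */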
1394 
1395 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1396 {
1397 	int i;
1398 	unsigned short old_mtu = ntohs(iph->tot_len);
1399 	struct rtable *rth;
1400 	u32  skeys[2] = { iph->saddr, 0, };
1401 	u32  daddr = iph->daddr;
1402 	unsigned short est_mtu = 0;
1403 
1404 	if (ipv4_config.no_pmtu_disc)
1405 		return 0;
1406 
1407 	for (i = 0; i < 2; i++) {
1408 		unsigned hash = rt_hash_code(daddr, skeys[i]);
1409 
1410 		rcu_read_lock();
1411 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1412 		     rth = rcu_dereference(rth->u.rt_next)) {
1413 			if (rth->fl.fl4_dst == daddr &&
1414 			    rth->fl.fl4_src == skeys[i] &&
1415 			    rth->rt_dst  == daddr &&
1416 			    rth->rt_src  == iph->saddr &&
1417 			    rth->fl.iif == 0 &&
1418 			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1419 				unsigned short mtu = new_mtu;
1420 
1421 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1422 
1423 					/* BSD 4.2 compatibility hack :-( */
1424 					if (mtu == 0 &&
1425 					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1426 					    old_mtu >= 68 + (iph->ihl << 2))
1427 						old_mtu -= iph->ihl << 2;
1428 
1429 					mtu = guess_mtu(old_mtu);
1430 				}
1431 				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1432 					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1433 						dst_confirm(&rth->u.dst);
1434 						if (mtu < ip_rt_min_pmtu) {
1435 							mtu = ip_rt_min_pmtu;
1436 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1437 								(1 << RTAX_MTU);
1438 						}
1439 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1440 						dst_set_expires(&rth->u.dst,
1441 							ip_rt_mtu_expires);
1442 					}
1443 					est_mtu = mtu;
1444 				}
1445 			}
1446 		}
1447 		rcu_read_unlock();
1448 	}
1449 	return est_mtu ? : new_mtu;
1450 }
1451 
1452 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1453 {
1454 	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1455 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1456 		if (mtu < ip_rt_min_pmtu) {
1457 			mtu = ip_rt_min_pmtu;
1458 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1459 		}
1460 		dst->metrics[RTAX_MTU-1] = mtu;
1461 		dst_set_expires(dst, ip_rt_mtu_expires);
1462 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1463 	}
1464 }
1465 
1466 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1467 {
1468 	return NULL;
1469 }
1470 
1471 static void ipv4_dst_destroy(struct dst_entry *dst)
1472 {
1473 	struct rtable *rt = (struct rtable *) dst;
1474 	struct inet_peer *peer = rt->peer;
1475 	struct in_device *idev = rt->idev;
1476 
1477 	if (peer) {
1478 		rt->peer = NULL;
1479 		inet_putpeer(peer);
1480 	}
1481 
1482 	if (idev) {
1483 		rt->idev = NULL;
1484 		in_dev_put(idev);
1485 	}
1486 }
1487 
1488 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1489 			    int how)
1490 {
1491 	struct rtable *rt = (struct rtable *) dst;
1492 	struct in_device *idev = rt->idev;
1493 	if (dev != &loopback_dev && idev && idev->dev == dev) {
1494 		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1495 		if (loopback_idev) {
1496 			rt->idev = loopback_idev;
1497 			in_dev_put(idev);
1498 		}
1499 	}
1500 }
1501 
1502 static void ipv4_link_failure(struct sk_buff *skb)
1503 {
1504 	struct rtable *rt;
1505 
1506 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1507 
1508 	rt = (struct rtable *) skb->dst;
1509 	if (rt)
1510 		dst_set_expires(&rt->u.dst, 0);
1511 }
1512 
1513 static int ip_rt_bug(struct sk_buff *skb)
1514 {
1515 	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1516 		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1517 		skb->dev ? skb->dev->name : "?");
1518 	kfree_skb(skb);
1519 	return 0;
1520 }
1521 
1522 /*
1523    We do not cache the source address of the outgoing interface,
1524    because it is used only by the IP RR, TS and SRR options,
1525    so it is out of the fast path.
1526 
1527    BTW remember: "addr" is allowed to be unaligned
1528    in IP options!
1529  */
1530 
1531 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1532 {
1533 	u32 src;
1534 	struct fib_result res;
1535 
1536 	if (rt->fl.iif == 0)
1537 		src = rt->rt_src;
1538 	else if (fib_lookup(&rt->fl, &res) == 0) {
1539 		src = FIB_RES_PREFSRC(res);
1540 		fib_res_put(&res);
1541 	} else
1542 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1543 					RT_SCOPE_UNIVERSE);
1544 	memcpy(addr, &src, 4);
1545 }
1546 
1547 #ifdef CONFIG_NET_CLS_ROUTE
1548 static void set_class_tag(struct rtable *rt, u32 tag)
1549 {
1550 	if (!(rt->u.dst.tclassid & 0xFFFF))
1551 		rt->u.dst.tclassid |= tag & 0xFFFF;
1552 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1553 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1554 }
1555 #endif
1556 
1557 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1558 {
1559 	struct fib_info *fi = res->fi;
1560 
1561 	if (fi) {
1562 		if (FIB_RES_GW(*res) &&
1563 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1564 			rt->rt_gateway = FIB_RES_GW(*res);
1565 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1566 		       sizeof(rt->u.dst.metrics));
1567 		if (fi->fib_mtu == 0) {
1568 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1569 			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1570 			    rt->rt_gateway != rt->rt_dst &&
1571 			    rt->u.dst.dev->mtu > 576)
1572 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1573 		}
1574 #ifdef CONFIG_NET_CLS_ROUTE
1575 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1576 #endif
1577 	} else
1578 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1579 
1580 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1581 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1582 	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1583 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1584 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1585 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1586 				       ip_rt_min_advmss);
1587 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1588 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1589 
1590 #ifdef CONFIG_NET_CLS_ROUTE
1591 #ifdef CONFIG_IP_MULTIPLE_TABLES
1592 	set_class_tag(rt, fib_rules_tclass(res));
1593 #endif
1594 	set_class_tag(rt, itag);
1595 #endif
1596         rt->rt_type = res->type;
1597 }
1598 
1599 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1600 				u8 tos, struct net_device *dev, int our)
1601 {
1602 	unsigned hash;
1603 	struct rtable *rth;
1604 	u32 spec_dst;
1605 	struct in_device *in_dev = in_dev_get(dev);
1606 	u32 itag = 0;
1607 
1608 	/* Primary sanity checks. */
1609 
1610 	if (in_dev == NULL)
1611 		return -EINVAL;
1612 
1613 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1614 	    skb->protocol != htons(ETH_P_IP))
1615 		goto e_inval;
1616 
1617 	if (ZERONET(saddr)) {
1618 		if (!LOCAL_MCAST(daddr))
1619 			goto e_inval;
1620 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1621 	} else if (fib_validate_source(saddr, 0, tos, 0,
1622 					dev, &spec_dst, &itag) < 0)
1623 		goto e_inval;
1624 
1625 	rth = dst_alloc(&ipv4_dst_ops);
1626 	if (!rth)
1627 		goto e_nobufs;
1628 
1629 	rth->u.dst.output= ip_rt_bug;
1630 
1631 	atomic_set(&rth->u.dst.__refcnt, 1);
1632 	rth->u.dst.flags= DST_HOST;
1633 	if (in_dev->cnf.no_policy)
1634 		rth->u.dst.flags |= DST_NOPOLICY;
1635 	rth->fl.fl4_dst	= daddr;
1636 	rth->rt_dst	= daddr;
1637 	rth->fl.fl4_tos	= tos;
1638 #ifdef CONFIG_IP_ROUTE_FWMARK
1639 	rth->fl.fl4_fwmark= skb->nfmark;
1640 #endif
1641 	rth->fl.fl4_src	= saddr;
1642 	rth->rt_src	= saddr;
1643 #ifdef CONFIG_NET_CLS_ROUTE
1644 	rth->u.dst.tclassid = itag;
1645 #endif
1646 	rth->rt_iif	=
1647 	rth->fl.iif	= dev->ifindex;
1648 	rth->u.dst.dev	= &loopback_dev;
1649 	dev_hold(rth->u.dst.dev);
1650 	rth->idev	= in_dev_get(rth->u.dst.dev);
1651 	rth->fl.oif	= 0;
1652 	rth->rt_gateway	= daddr;
1653 	rth->rt_spec_dst= spec_dst;
1654 	rth->rt_type	= RTN_MULTICAST;
1655 	rth->rt_flags	= RTCF_MULTICAST;
1656 	if (our) {
1657 		rth->u.dst.input= ip_local_deliver;
1658 		rth->rt_flags |= RTCF_LOCAL;
1659 	}
1660 
1661 #ifdef CONFIG_IP_MROUTE
1662 	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1663 		rth->u.dst.input = ip_mr_input;
1664 #endif
1665 	RT_CACHE_STAT_INC(in_slow_mc);
1666 
1667 	in_dev_put(in_dev);
1668 	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5));
1669 	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1670 
1671 e_nobufs:
1672 	in_dev_put(in_dev);
1673 	return -ENOBUFS;
1674 
1675 e_inval:
1676 	in_dev_put(in_dev);
1677 	return -EINVAL;
1678 }
1679 
1680 
1681 static void ip_handle_martian_source(struct net_device *dev,
1682 				     struct in_device *in_dev,
1683 				     struct sk_buff *skb,
1684 				     u32 daddr,
1685 				     u32 saddr)
1686 {
1687 	RT_CACHE_STAT_INC(in_martian_src);
1688 #ifdef CONFIG_IP_ROUTE_VERBOSE
1689 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1690 		/*
1691 		 *	RFC 1812 recommendation: if the source is martian,
1692 		 *	the only hint is the MAC header.
1693 		 */
1694 		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1695 			"%u.%u.%u.%u, on dev %s\n",
1696 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1697 		if (dev->hard_header_len && skb->mac.raw) {
1698 			int i;
1699 			unsigned char *p = skb->mac.raw;
1700 			printk(KERN_WARNING "ll header: ");
1701 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1702 				printk("%02x", *p);
1703 				if (i < (dev->hard_header_len - 1))
1704 					printk(":");
1705 			}
1706 			printk("\n");
1707 		}
1708 	}
1709 #endif
1710 }
1711 
1712 static inline int __mkroute_input(struct sk_buff *skb,
1713 				  struct fib_result* res,
1714 				  struct in_device *in_dev,
1715 				  u32 daddr, u32 saddr, u32 tos,
1716 				  struct rtable **result)
1717 {
1718 
1719 	struct rtable *rth;
1720 	int err;
1721 	struct in_device *out_dev;
1722 	unsigned flags = 0;
1723 	u32 spec_dst, itag;
1724 
1725 	/* get a working reference to the output device */
1726 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1727 	if (out_dev == NULL) {
1728 		if (net_ratelimit())
1729 			printk(KERN_CRIT "Bug in ip_route_input" \
1730 			       "_slow(). Please, report\n");
1731 		return -EINVAL;
1732 	}
1733 
1734 
1735 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1736 				  in_dev->dev, &spec_dst, &itag);
1737 	if (err < 0) {
1738 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1739 					 saddr);
1740 
1741 		err = -EINVAL;
1742 		goto cleanup;
1743 	}
1744 
1745 	if (err)
1746 		flags |= RTCF_DIRECTSRC;
1747 
1748 	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1749 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1750 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1751 		flags |= RTCF_DOREDIRECT;
1752 
1753 	if (skb->protocol != htons(ETH_P_IP)) {
1754 		/* Not IP (e.g. ARP). Do not create a route if it is
1755 		 * invalid for proxy arp. DNAT routes are always valid.
1756 		 */
1757 		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1758 			err = -EINVAL;
1759 			goto cleanup;
1760 		}
1761 	}
1762 
1763 
1764 	rth = dst_alloc(&ipv4_dst_ops);
1765 	if (!rth) {
1766 		err = -ENOBUFS;
1767 		goto cleanup;
1768 	}
1769 
1770 	atomic_set(&rth->u.dst.__refcnt, 1);
1771 	rth->u.dst.flags= DST_HOST;
1772 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1773 	if (res->fi->fib_nhs > 1)
1774 		rth->u.dst.flags |= DST_BALANCED;
1775 #endif
1776 	if (in_dev->cnf.no_policy)
1777 		rth->u.dst.flags |= DST_NOPOLICY;
1778 	if (in_dev->cnf.no_xfrm)
1779 		rth->u.dst.flags |= DST_NOXFRM;
1780 	rth->fl.fl4_dst	= daddr;
1781 	rth->rt_dst	= daddr;
1782 	rth->fl.fl4_tos	= tos;
1783 #ifdef CONFIG_IP_ROUTE_FWMARK
1784 	rth->fl.fl4_fwmark= skb->nfmark;
1785 #endif
1786 	rth->fl.fl4_src	= saddr;
1787 	rth->rt_src	= saddr;
1788 	rth->rt_gateway	= daddr;
1789 	rth->rt_iif 	=
1790 		rth->fl.iif	= in_dev->dev->ifindex;
1791 	rth->u.dst.dev	= (out_dev)->dev;
1792 	dev_hold(rth->u.dst.dev);
1793 	rth->idev	= in_dev_get(rth->u.dst.dev);
1794 	rth->fl.oif 	= 0;
1795 	rth->rt_spec_dst= spec_dst;
1796 
1797 	rth->u.dst.input = ip_forward;
1798 	rth->u.dst.output = ip_output;
1799 
1800 	rt_set_nexthop(rth, res, itag);
1801 
1802 	rth->rt_flags = flags;
1803 
1804 	*result = rth;
1805 	err = 0;
1806  cleanup:
1807 	/* release the working reference to the output device */
1808 	in_dev_put(out_dev);
1809 	return err;
1810 }
1811 
1812 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1813 				       struct fib_result* res,
1814 				       const struct flowi *fl,
1815 				       struct in_device *in_dev,
1816 				       u32 daddr, u32 saddr, u32 tos)
1817 {
1818 	struct rtable* rth = NULL;
1819 	int err;
1820 	unsigned hash;
1821 
1822 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1823 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1824 		fib_select_multipath(fl, res);
1825 #endif
1826 
1827 	/* create a routing cache entry */
1828 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1829 	if (err)
1830 		return err;
1831 
1832 	/* put it into the cache */
1833 	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
1834 	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1835 }
1836 
1837 static inline int ip_mkroute_input(struct sk_buff *skb,
1838 				   struct fib_result* res,
1839 				   const struct flowi *fl,
1840 				   struct in_device *in_dev,
1841 				   u32 daddr, u32 saddr, u32 tos)
1842 {
1843 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1844 	struct rtable* rth = NULL, *rtres;
1845 	unsigned char hop, hopcount;
1846 	int err = -EINVAL;
1847 	unsigned int hash;
1848 
1849 	if (res->fi)
1850 		hopcount = res->fi->fib_nhs;
1851 	else
1852 		hopcount = 1;
1853 
1854 	/* distinguish between multipath and singlepath */
1855 	if (hopcount < 2)
1856 		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1857 					    saddr, tos);
1858 
1859 	/* add all alternatives to the routing cache */
1860 	for (hop = 0; hop < hopcount; hop++) {
1861 		res->nh_sel = hop;
1862 
1863 		/* put reference to previous result */
1864 		if (hop)
1865 			ip_rt_put(rtres);
1866 
1867 		/* create a routing cache entry */
1868 		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1869 				      &rth);
1870 		if (err)
1871 			return err;
1872 
1873 		/* put it into the cache */
1874 		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
1875 		err = rt_intern_hash(hash, rth, &rtres);
1876 		if (err)
1877 			return err;
1878 
1879 		/* forward hop information to multipath impl. */
1880 		multipath_set_nhinfo(rth,
1881 				     FIB_RES_NETWORK(*res),
1882 				     FIB_RES_NETMASK(*res),
1883 				     res->prefixlen,
1884 				     &FIB_RES_NH(*res));
1885 	}
1886 	skb->dst = &rtres->u.dst;
1887 	return err;
1888 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1889 	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1890 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1891 }
1892 
1893 
1894 /*
1895  *	NOTE. We drop all packets that have local source
1896  *	addresses, because every properly looped-back packet
1897  *	must already have the correct destination attached by the output routine.
1898  *
1899  *	Such an approach solves two big problems:
1900  *	1. Non-simplex devices are handled properly.
1901  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1902  */
1903 
1904 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1905 			       u8 tos, struct net_device *dev)
1906 {
1907 	struct fib_result res;
1908 	struct in_device *in_dev = in_dev_get(dev);
1909 	struct flowi fl = { .nl_u = { .ip4_u =
1910 				      { .daddr = daddr,
1911 					.saddr = saddr,
1912 					.tos = tos,
1913 					.scope = RT_SCOPE_UNIVERSE,
1914 #ifdef CONFIG_IP_ROUTE_FWMARK
1915 					.fwmark = skb->nfmark
1916 #endif
1917 				      } },
1918 			    .iif = dev->ifindex };
1919 	unsigned	flags = 0;
1920 	u32		itag = 0;
1921 	struct rtable * rth;
1922 	unsigned	hash;
1923 	u32		spec_dst;
1924 	int		err = -EINVAL;
1925 	int		free_res = 0;
1926 
1927 	/* IP on this device is disabled. */
1928 
1929 	if (!in_dev)
1930 		goto out;
1931 
1932 	/* Check for the most weird martians, which cannot be detected
1933 	   by fib_lookup.
1934 	 */
1935 
1936 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1937 		goto martian_source;
1938 
1939 	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1940 		goto brd_input;
1941 
1942 	/* Accept zero addresses only for limited broadcast;
1943 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1944 	 */
1945 	if (ZERONET(saddr))
1946 		goto martian_source;
1947 
1948 	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1949 		goto martian_destination;
1950 
1951 	/*
1952 	 *	Now we are ready to route the packet.
1953 	 */
1954 	if ((err = fib_lookup(&fl, &res)) != 0) {
1955 		if (!IN_DEV_FORWARD(in_dev))
1956 			goto e_hostunreach;
1957 		goto no_route;
1958 	}
1959 	free_res = 1;
1960 
1961 	RT_CACHE_STAT_INC(in_slow_tot);
1962 
1963 	if (res.type == RTN_BROADCAST)
1964 		goto brd_input;
1965 
1966 	if (res.type == RTN_LOCAL) {
1967 		int result;
1968 		result = fib_validate_source(saddr, daddr, tos,
1969 					     loopback_dev.ifindex,
1970 					     dev, &spec_dst, &itag);
1971 		if (result < 0)
1972 			goto martian_source;
1973 		if (result)
1974 			flags |= RTCF_DIRECTSRC;
1975 		spec_dst = daddr;
1976 		goto local_input;
1977 	}
1978 
1979 	if (!IN_DEV_FORWARD(in_dev))
1980 		goto e_hostunreach;
1981 	if (res.type != RTN_UNICAST)
1982 		goto martian_destination;
1983 
1984 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1985 	if (err == -ENOBUFS)
1986 		goto e_nobufs;
1987 	if (err == -EINVAL)
1988 		goto e_inval;
1989 
1990 done:
1991 	in_dev_put(in_dev);
1992 	if (free_res)
1993 		fib_res_put(&res);
1994 out:	return err;
1995 
1996 brd_input:
1997 	if (skb->protocol != htons(ETH_P_IP))
1998 		goto e_inval;
1999 
2000 	if (ZERONET(saddr))
2001 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2002 	else {
2003 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2004 					  &itag);
2005 		if (err < 0)
2006 			goto martian_source;
2007 		if (err)
2008 			flags |= RTCF_DIRECTSRC;
2009 	}
2010 	flags |= RTCF_BROADCAST;
2011 	res.type = RTN_BROADCAST;
2012 	RT_CACHE_STAT_INC(in_brd);
2013 
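/*
 * Deliver to this host (local, broadcast and the no_route/unreachable cases):
 * the cache entry routes via loopback_dev and uses ip_local_deliver as the
 * input handler, or ip_error when the lookup found no route.
 */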
2014 local_input:
2015 	rth = dst_alloc(&ipv4_dst_ops);
2016 	if (!rth)
2017 		goto e_nobufs;
2018 
2019 	rth->u.dst.output= ip_rt_bug;
2020 
2021 	atomic_set(&rth->u.dst.__refcnt, 1);
2022 	rth->u.dst.flags= DST_HOST;
2023 	if (in_dev->cnf.no_policy)
2024 		rth->u.dst.flags |= DST_NOPOLICY;
2025 	rth->fl.fl4_dst	= daddr;
2026 	rth->rt_dst	= daddr;
2027 	rth->fl.fl4_tos	= tos;
2028 #ifdef CONFIG_IP_ROUTE_FWMARK
2029 	rth->fl.fl4_fwmark= skb->nfmark;
2030 #endif
2031 	rth->fl.fl4_src	= saddr;
2032 	rth->rt_src	= saddr;
2033 #ifdef CONFIG_NET_CLS_ROUTE
2034 	rth->u.dst.tclassid = itag;
2035 #endif
2036 	rth->rt_iif	=
2037 	rth->fl.iif	= dev->ifindex;
2038 	rth->u.dst.dev	= &loopback_dev;
2039 	dev_hold(rth->u.dst.dev);
2040 	rth->idev	= in_dev_get(rth->u.dst.dev);
2041 	rth->rt_gateway	= daddr;
2042 	rth->rt_spec_dst= spec_dst;
2043 	rth->u.dst.input= ip_local_deliver;
2044 	rth->rt_flags 	= flags|RTCF_LOCAL;
2045 	if (res.type == RTN_UNREACHABLE) {
2046 		rth->u.dst.input= ip_error;
2047 		rth->u.dst.error= -err;
2048 		rth->rt_flags 	&= ~RTCF_LOCAL;
2049 	}
2050 	rth->rt_type	= res.type;
2051 	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5));
2052 	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2053 	goto done;
2054 
2055 no_route:
2056 	RT_CACHE_STAT_INC(in_no_route);
2057 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2058 	res.type = RTN_UNREACHABLE;
2059 	goto local_input;
2060 
2061 	/*
2062 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2063 	 */
2064 martian_destination:
2065 	RT_CACHE_STAT_INC(in_martian_dst);
2066 #ifdef CONFIG_IP_ROUTE_VERBOSE
2067 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2068 		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2069 			"%u.%u.%u.%u, dev %s\n",
2070 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2071 #endif
2072 
2073 e_hostunreach:
2074 	err = -EHOSTUNREACH;
2075 	goto done;
2076 
2077 e_inval:
2078 	err = -EINVAL;
2079 	goto done;
2080 
2081 e_nobufs:
2082 	err = -ENOBUFS;
2083 	goto done;
2084 
2085 martian_source:
2086 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2087 	goto e_inval;
2088 }
2089 
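/*
 * Input route lookup: scan the hash chain for a (daddr, saddr, iif, tos and,
 * when configured, fwmark) match under rcu_read_lock(); on a hit the cached
 * dst is attached to the skb.  Multicast destinations are checked with
 * ip_check_mc() and either handed to ip_route_input_mc() or dropped; anything
 * else that misses the cache goes to ip_route_input_slow().
 */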
2090 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2091 		   u8 tos, struct net_device *dev)
2092 {
2093 	struct rtable * rth;
2094 	unsigned	hash;
2095 	int iif = dev->ifindex;
2096 
2097 	tos &= IPTOS_RT_MASK;
2098 	hash = rt_hash_code(daddr, saddr ^ (iif << 5));
2099 
2100 	rcu_read_lock();
2101 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2102 	     rth = rcu_dereference(rth->u.rt_next)) {
2103 		if (rth->fl.fl4_dst == daddr &&
2104 		    rth->fl.fl4_src == saddr &&
2105 		    rth->fl.iif == iif &&
2106 		    rth->fl.oif == 0 &&
2107 #ifdef CONFIG_IP_ROUTE_FWMARK
2108 		    rth->fl.fl4_fwmark == skb->nfmark &&
2109 #endif
2110 		    rth->fl.fl4_tos == tos) {
2111 			rth->u.dst.lastuse = jiffies;
2112 			dst_hold(&rth->u.dst);
2113 			rth->u.dst.__use++;
2114 			RT_CACHE_STAT_INC(in_hit);
2115 			rcu_read_unlock();
2116 			skb->dst = (struct dst_entry*)rth;
2117 			return 0;
2118 		}
2119 		RT_CACHE_STAT_INC(in_hlist_search);
2120 	}
2121 	rcu_read_unlock();
2122 
2123 	/* Multicast recognition logic has been moved from the route cache to here.
2124 	   The problem was that too many Ethernet cards have broken/missing
2125 	   hardware multicast filters :-( As a result, a host on a multicast
2126 	   network acquires a lot of useless route cache entries, e.g. for
2127 	   SDR messages from all over the world. Now we try to get rid of them.
2128 	   Really, provided the software IP multicast filter is organized
2129 	   reasonably (at least, hashed), this does not result in a slowdown
2130 	   compared with route cache reject entries.
2131 	   Note that multicast routers are not affected, because a
2132 	   route cache entry is created eventually.
2133 	 */
2134 	if (MULTICAST(daddr)) {
2135 		struct in_device *in_dev;
2136 
2137 		rcu_read_lock();
2138 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2139 			int our = ip_check_mc(in_dev, daddr, saddr,
2140 				skb->nh.iph->protocol);
2141 			if (our
2142 #ifdef CONFIG_IP_MROUTE
2143 			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2144 #endif
2145 			    ) {
2146 				rcu_read_unlock();
2147 				return ip_route_input_mc(skb, daddr, saddr,
2148 							 tos, dev, our);
2149 			}
2150 		}
2151 		rcu_read_unlock();
2152 		return -EINVAL;
2153 	}
2154 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2155 }
2156 
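/*
 * Build (but do not hash) a single output routing cache entry for the fib
 * lookup result: classify broadcast/multicast destinations, allocate the dst,
 * copy the flow keys, take device references and select the input/output
 * handlers.
 */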
2157 static inline int __mkroute_output(struct rtable **result,
2158 				   struct fib_result* res,
2159 				   const struct flowi *fl,
2160 				   const struct flowi *oldflp,
2161 				   struct net_device *dev_out,
2162 				   unsigned flags)
2163 {
2164 	struct rtable *rth;
2165 	struct in_device *in_dev;
2166 	u32 tos = RT_FL_TOS(oldflp);
2167 	int err = 0;
2168 
2169 	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2170 		return -EINVAL;
2171 
2172 	if (fl->fl4_dst == 0xFFFFFFFF)
2173 		res->type = RTN_BROADCAST;
2174 	else if (MULTICAST(fl->fl4_dst))
2175 		res->type = RTN_MULTICAST;
2176 	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2177 		return -EINVAL;
2178 
2179 	if (dev_out->flags & IFF_LOOPBACK)
2180 		flags |= RTCF_LOCAL;
2181 
2182 	/* get a working reference to the inet device */
2183 	in_dev = in_dev_get(dev_out);
2184 	if (!in_dev)
2185 		return -EINVAL;
2186 
2187 	if (res->type == RTN_BROADCAST) {
2188 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2189 		if (res->fi) {
2190 			fib_info_put(res->fi);
2191 			res->fi = NULL;
2192 		}
2193 	} else if (res->type == RTN_MULTICAST) {
2194 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2195 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2196 				 oldflp->proto))
2197 			flags &= ~RTCF_LOCAL;
2198 		/* If a multicast route does not exist, use
2199 		   the default one, but do not use a gateway in this case.
2200 		   Yes, it is a hack.
2201 		 */
2202 		if (res->fi && res->prefixlen < 4) {
2203 			fib_info_put(res->fi);
2204 			res->fi = NULL;
2205 		}
2206 	}
2207 
2208 
2209 	rth = dst_alloc(&ipv4_dst_ops);
2210 	if (!rth) {
2211 		err = -ENOBUFS;
2212 		goto cleanup;
2213 	}
2214 
2215 	atomic_set(&rth->u.dst.__refcnt, 1);
2216 	rth->u.dst.flags= DST_HOST;
2217 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2218 	if (res->fi) {
2219 		rth->rt_multipath_alg = res->fi->fib_mp_alg;
2220 		if (res->fi->fib_nhs > 1)
2221 			rth->u.dst.flags |= DST_BALANCED;
2222 	}
2223 #endif
2224 	if (in_dev->cnf.no_xfrm)
2225 		rth->u.dst.flags |= DST_NOXFRM;
2226 	if (in_dev->cnf.no_policy)
2227 		rth->u.dst.flags |= DST_NOPOLICY;
2228 
2229 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2230 	rth->fl.fl4_tos	= tos;
2231 	rth->fl.fl4_src	= oldflp->fl4_src;
2232 	rth->fl.oif	= oldflp->oif;
2233 #ifdef CONFIG_IP_ROUTE_FWMARK
2234 	rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2235 #endif
2236 	rth->rt_dst	= fl->fl4_dst;
2237 	rth->rt_src	= fl->fl4_src;
2238 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2239 	/* get references to the devices that are to be held by the routing
2240 	   cache entry */
2241 	rth->u.dst.dev	= dev_out;
2242 	dev_hold(dev_out);
2243 	rth->idev	= in_dev_get(dev_out);
2244 	rth->rt_gateway = fl->fl4_dst;
2245 	rth->rt_spec_dst= fl->fl4_src;
2246 
2247 	rth->u.dst.output=ip_output;
2248 
2249 	RT_CACHE_STAT_INC(out_slow_tot);
2250 
2251 	if (flags & RTCF_LOCAL) {
2252 		rth->u.dst.input = ip_local_deliver;
2253 		rth->rt_spec_dst = fl->fl4_dst;
2254 	}
2255 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2256 		rth->rt_spec_dst = fl->fl4_src;
2257 		if (flags & RTCF_LOCAL &&
2258 		    !(dev_out->flags & IFF_LOOPBACK)) {
2259 			rth->u.dst.output = ip_mc_output;
2260 			RT_CACHE_STAT_INC(out_slow_mc);
2261 		}
2262 #ifdef CONFIG_IP_MROUTE
2263 		if (res->type == RTN_MULTICAST) {
2264 			if (IN_DEV_MFORWARD(in_dev) &&
2265 			    !LOCAL_MCAST(oldflp->fl4_dst)) {
2266 				rth->u.dst.input = ip_mr_input;
2267 				rth->u.dst.output = ip_mc_output;
2268 			}
2269 		}
2270 #endif
2271 	}
2272 
2273 	rt_set_nexthop(rth, res, 0);
2274 
2275 	rth->rt_flags = flags;
2276 
2277 	*result = rth;
2278  cleanup:
2279 	/* release the working reference to the inet device */
2280 	in_dev_put(in_dev);
2281 
2282 	return err;
2283 }
2284 
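/*
 * Default output case: build one cache entry and insert it into the hash
 * bucket derived from the original flow (dst, src, oif).
 */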
2285 static inline int ip_mkroute_output_def(struct rtable **rp,
2286 					struct fib_result* res,
2287 					const struct flowi *fl,
2288 					const struct flowi *oldflp,
2289 					struct net_device *dev_out,
2290 					unsigned flags)
2291 {
2292 	struct rtable *rth = NULL;
2293 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2294 	unsigned hash;
2295 	if (err == 0) {
2296 		hash = rt_hash_code(oldflp->fl4_dst,
2297 				    oldflp->fl4_src ^ (oldflp->oif << 5));
2298 		err = rt_intern_hash(hash, rth, rp);
2299 	}
2300 
2301 	return err;
2302 }
2303 
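/*
 * With CONFIG_IP_ROUTE_MULTIPATH_CACHED a separate cache entry is created
 * and hashed for every next hop of a multipath route; otherwise defer to
 * ip_mkroute_output_def().
 */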
2304 static inline int ip_mkroute_output(struct rtable** rp,
2305 				    struct fib_result* res,
2306 				    const struct flowi *fl,
2307 				    const struct flowi *oldflp,
2308 				    struct net_device *dev_out,
2309 				    unsigned flags)
2310 {
2311 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2312 	unsigned char hop;
2313 	unsigned hash;
2314 	int err = -EINVAL;
2315 	struct rtable *rth = NULL;
2316 
2317 	if (res->fi && res->fi->fib_nhs > 1) {
2318 		unsigned char hopcount = res->fi->fib_nhs;
2319 
2320 		for (hop = 0; hop < hopcount; hop++) {
2321 			struct net_device *dev2nexthop;
2322 
2323 			res->nh_sel = hop;
2324 
2325 			/* hold a working reference to the output device */
2326 			dev2nexthop = FIB_RES_DEV(*res);
2327 			dev_hold(dev2nexthop);
2328 
2329 			/* put reference to previous result */
2330 			if (hop)
2331 				ip_rt_put(*rp);
2332 
2333 			err = __mkroute_output(&rth, res, fl, oldflp,
2334 					       dev2nexthop, flags);
2335 
2336 			if (err != 0)
2337 				goto cleanup;
2338 
2339 			hash = rt_hash_code(oldflp->fl4_dst,
2340 					    oldflp->fl4_src ^
2341 					    (oldflp->oif << 5));
2342 			err = rt_intern_hash(hash, rth, rp);
2343 
2344 			/* forward hop information to multipath impl. */
2345 			multipath_set_nhinfo(rth,
2346 					     FIB_RES_NETWORK(*res),
2347 					     FIB_RES_NETMASK(*res),
2348 					     res->prefixlen,
2349 					     &FIB_RES_NH(*res));
2350 		cleanup:
2351 			/* release the working reference to the output device */
2352 			dev_put(dev2nexthop);
2353 
2354 			if (err != 0)
2355 				return err;
2356 		}
2357 		return err;
2358 	} else {
2359 		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2360 					     flags);
2361 	}
2362 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2363 	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2364 #endif
2365 }
2366 
2367 /*
2368  * Major route resolver routine.
2369  */
2370 
2371 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2372 {
2373 	u32 tos	= RT_FL_TOS(oldflp);
2374 	struct flowi fl = { .nl_u = { .ip4_u =
2375 				      { .daddr = oldflp->fl4_dst,
2376 					.saddr = oldflp->fl4_src,
2377 					.tos = tos & IPTOS_RT_MASK,
2378 					.scope = ((tos & RTO_ONLINK) ?
2379 						  RT_SCOPE_LINK :
2380 						  RT_SCOPE_UNIVERSE),
2381 #ifdef CONFIG_IP_ROUTE_FWMARK
2382 					.fwmark = oldflp->fl4_fwmark
2383 #endif
2384 				      } },
2385 			    .iif = loopback_dev.ifindex,
2386 			    .oif = oldflp->oif };
2387 	struct fib_result res;
2388 	unsigned flags = 0;
2389 	struct net_device *dev_out = NULL;
2390 	int free_res = 0;
2391 	int err;
2392 
2393 
2394 	res.fi		= NULL;
2395 #ifdef CONFIG_IP_MULTIPLE_TABLES
2396 	res.r		= NULL;
2397 #endif
2398 
2399 	if (oldflp->fl4_src) {
2400 		err = -EINVAL;
2401 		if (MULTICAST(oldflp->fl4_src) ||
2402 		    BADCLASS(oldflp->fl4_src) ||
2403 		    ZERONET(oldflp->fl4_src))
2404 			goto out;
2405 
2406 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2407 		dev_out = ip_dev_find(oldflp->fl4_src);
2408 		if (dev_out == NULL)
2409 			goto out;
2410 
2411 		/* I removed the check for oif == dev_out->oif here.
2412 		   It was wrong for two reasons:
2413 		   1. ip_dev_find(saddr) can return the wrong iface if saddr is
2414 		      assigned to multiple interfaces.
2415 		   2. Moreover, we are allowed to send packets with the saddr
2416 		      of another iface. --ANK
2417 		 */
2418 
2419 		if (oldflp->oif == 0
2420 		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2421 			/* Special hack: the user can direct multicasts
2422 			   and limited broadcast via the necessary interface
2423 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2424 			   This hack is not just for fun, it allows
2425 			   vic, vat and friends to work.
2426 			   They bind the socket to loopback, set ttl to zero
2427 			   and expect that it will work.
2428 			   From the viewpoint of the routing cache they are broken,
2429 			   because we are not allowed to build a multicast path
2430 			   with a loopback source addr (look, the routing cache
2431 			   cannot know that ttl is zero, so that the packet
2432 			   will not leave this host and the route is valid).
2433 			   Luckily, this hack is a good workaround.
2434 			 */
2435 
2436 			fl.oif = dev_out->ifindex;
2437 			goto make_route;
2438 		}
2439 		if (dev_out)
2440 			dev_put(dev_out);
2441 		dev_out = NULL;
2442 	}
2443 
2444 
2445 	if (oldflp->oif) {
2446 		dev_out = dev_get_by_index(oldflp->oif);
2447 		err = -ENODEV;
2448 		if (dev_out == NULL)
2449 			goto out;
2450 
2451 		/* RACE: Check return value of inet_select_addr instead. */
2452 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2453 			dev_put(dev_out);
2454 			goto out;	/* Wrong error code */
2455 		}
2456 
2457 		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2458 			if (!fl.fl4_src)
2459 				fl.fl4_src = inet_select_addr(dev_out, 0,
2460 							      RT_SCOPE_LINK);
2461 			goto make_route;
2462 		}
2463 		if (!fl.fl4_src) {
2464 			if (MULTICAST(oldflp->fl4_dst))
2465 				fl.fl4_src = inet_select_addr(dev_out, 0,
2466 							      fl.fl4_scope);
2467 			else if (!oldflp->fl4_dst)
2468 				fl.fl4_src = inet_select_addr(dev_out, 0,
2469 							      RT_SCOPE_HOST);
2470 		}
2471 	}
2472 
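	/* No destination given: route the packet to the local host via loopback. */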
2473 	if (!fl.fl4_dst) {
2474 		fl.fl4_dst = fl.fl4_src;
2475 		if (!fl.fl4_dst)
2476 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2477 		if (dev_out)
2478 			dev_put(dev_out);
2479 		dev_out = &loopback_dev;
2480 		dev_hold(dev_out);
2481 		fl.oif = loopback_dev.ifindex;
2482 		res.type = RTN_LOCAL;
2483 		flags |= RTCF_LOCAL;
2484 		goto make_route;
2485 	}
2486 
2487 	if (fib_lookup(&fl, &res)) {
2488 		res.fi = NULL;
2489 		if (oldflp->oif) {
2490 			/* Apparently, the routing tables are wrong. Assume
2491 			   that the destination is on-link.
2492 
2493 			   WHY? DW.
2494 			   Because we are allowed to send to an iface
2495 			   even if it has NO routes and NO assigned
2496 			   addresses. When oif is specified, the routing
2497 			   tables are looked up with only one purpose:
2498 			   to catch if the destination is gatewayed, rather than
2499 			   direct. Moreover, if MSG_DONTROUTE is set,
2500 			   we send the packet, ignoring both the routing tables
2501 			   and the ifaddr state. --ANK
2502 
2503 
2504 			   We could do this even if oif is unknown,
2505 			   as IPv6 likely does, but we do not.
2506 			 */
2507 
2508 			if (fl.fl4_src == 0)
2509 				fl.fl4_src = inet_select_addr(dev_out, 0,
2510 							      RT_SCOPE_LINK);
2511 			res.type = RTN_UNICAST;
2512 			goto make_route;
2513 		}
2514 		if (dev_out)
2515 			dev_put(dev_out);
2516 		err = -ENETUNREACH;
2517 		goto out;
2518 	}
2519 	free_res = 1;
2520 
2521 	if (res.type == RTN_LOCAL) {
2522 		if (!fl.fl4_src)
2523 			fl.fl4_src = fl.fl4_dst;
2524 		if (dev_out)
2525 			dev_put(dev_out);
2526 		dev_out = &loopback_dev;
2527 		dev_hold(dev_out);
2528 		fl.oif = dev_out->ifindex;
2529 		if (res.fi)
2530 			fib_info_put(res.fi);
2531 		res.fi = NULL;
2532 		flags |= RTCF_LOCAL;
2533 		goto make_route;
2534 	}
2535 
2536 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2537 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2538 		fib_select_multipath(&fl, &res);
2539 	else
2540 #endif
2541 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2542 		fib_select_default(&fl, &res);
2543 
2544 	if (!fl.fl4_src)
2545 		fl.fl4_src = FIB_RES_PREFSRC(res);
2546 
2547 	if (dev_out)
2548 		dev_put(dev_out);
2549 	dev_out = FIB_RES_DEV(res);
2550 	dev_hold(dev_out);
2551 	fl.oif = dev_out->ifindex;
2552 
2553 
2554 make_route:
2555 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2556 
2557 
2558 	if (free_res)
2559 		fib_res_put(&res);
2560 	if (dev_out)
2561 		dev_put(dev_out);
2562 out:	return err;
2563 }
2564 
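/*
 * Output route lookup: search the cache under rcu_read_lock_bh() for an entry
 * matching (dst, src, oif, fwmark) with a compatible TOS/RTO_ONLINK value,
 * letting multipath_select_route() substitute an alternative next hop when one
 * applies; on a miss resolve the route with ip_route_output_slow().
 */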
2565 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2566 {
2567 	unsigned hash;
2568 	struct rtable *rth;
2569 
2570 	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5));
2571 
2572 	rcu_read_lock_bh();
2573 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2574 		rth = rcu_dereference(rth->u.rt_next)) {
2575 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2576 		    rth->fl.fl4_src == flp->fl4_src &&
2577 		    rth->fl.iif == 0 &&
2578 		    rth->fl.oif == flp->oif &&
2579 #ifdef CONFIG_IP_ROUTE_FWMARK
2580 		    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2581 #endif
2582 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2583 			    (IPTOS_RT_MASK | RTO_ONLINK))) {
2584 
2585 			/* check for multipath routes and choose one if
2586 			 * necessary
2587 			 */
2588 			if (multipath_select_route(flp, rth, rp)) {
2589 				dst_hold(&(*rp)->u.dst);
2590 				RT_CACHE_STAT_INC(out_hit);
2591 				rcu_read_unlock_bh();
2592 				return 0;
2593 			}
2594 
2595 			rth->u.dst.lastuse = jiffies;
2596 			dst_hold(&rth->u.dst);
2597 			rth->u.dst.__use++;
2598 			RT_CACHE_STAT_INC(out_hit);
2599 			rcu_read_unlock_bh();
2600 			*rp = rth;
2601 			return 0;
2602 		}
2603 		RT_CACHE_STAT_INC(out_hlist_search);
2604 	}
2605 	rcu_read_unlock_bh();
2606 
2607 	return ip_route_output_slow(rp, flp);
2608 }
2609 
2610 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2611 
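/*
 * Like __ip_route_output_key(), but when a transport protocol is given any
 * missing flow addresses are filled in from the route and the result is run
 * through xfrm_lookup() for IPsec policy resolution.
 */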
2612 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2613 {
2614 	int err;
2615 
2616 	if ((err = __ip_route_output_key(rp, flp)) != 0)
2617 		return err;
2618 
2619 	if (flp->proto) {
2620 		if (!flp->fl4_src)
2621 			flp->fl4_src = (*rp)->rt_src;
2622 		if (!flp->fl4_dst)
2623 			flp->fl4_dst = (*rp)->rt_dst;
2624 		return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2625 	}
2626 
2627 	return 0;
2628 }
2629 
2630 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2631 
2632 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2633 {
2634 	return ip_route_output_flow(rp, flp, NULL, 0);
2635 }
2636 
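/*
 * Encode the route attached to skb->dst as an RTM_NEWROUTE netlink message:
 * an rtmsg header followed by RTA_* attributes (addresses, iif/oif, metrics
 * and RTA_CACHEINFO).
 */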
2637 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2638 			int nowait, unsigned int flags)
2639 {
2640 	struct rtable *rt = (struct rtable*)skb->dst;
2641 	struct rtmsg *r;
2642 	struct nlmsghdr  *nlh;
2643 	unsigned char	 *b = skb->tail;
2644 	struct rta_cacheinfo ci;
2645 #ifdef CONFIG_IP_MROUTE
2646 	struct rtattr *eptr;
2647 #endif
2648 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2649 	r = NLMSG_DATA(nlh);
2650 	r->rtm_family	 = AF_INET;
2651 	r->rtm_dst_len	= 32;
2652 	r->rtm_src_len	= 0;
2653 	r->rtm_tos	= rt->fl.fl4_tos;
2654 	r->rtm_table	= RT_TABLE_MAIN;
2655 	r->rtm_type	= rt->rt_type;
2656 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2657 	r->rtm_protocol = RTPROT_UNSPEC;
2658 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2659 	if (rt->rt_flags & RTCF_NOTIFY)
2660 		r->rtm_flags |= RTM_F_NOTIFY;
2661 	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2662 	if (rt->fl.fl4_src) {
2663 		r->rtm_src_len = 32;
2664 		RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2665 	}
2666 	if (rt->u.dst.dev)
2667 		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2668 #ifdef CONFIG_NET_CLS_ROUTE
2669 	if (rt->u.dst.tclassid)
2670 		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2671 #endif
2672 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2673 	if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2674 		__u32 alg = rt->rt_multipath_alg;
2675 
2676 		RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2677 	}
2678 #endif
2679 	if (rt->fl.iif)
2680 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2681 	else if (rt->rt_src != rt->fl.fl4_src)
2682 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2683 	if (rt->rt_dst != rt->rt_gateway)
2684 		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2685 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2686 		goto rtattr_failure;
2687 	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2688 	ci.rta_used	= rt->u.dst.__use;
2689 	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
2690 	if (rt->u.dst.expires)
2691 		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2692 	else
2693 		ci.rta_expires = 0;
2694 	ci.rta_error	= rt->u.dst.error;
2695 	ci.rta_id	= ci.rta_ts = ci.rta_tsage = 0;
2696 	if (rt->peer) {
2697 		ci.rta_id = rt->peer->ip_id_count;
2698 		if (rt->peer->tcp_ts_stamp) {
2699 			ci.rta_ts = rt->peer->tcp_ts;
2700 			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2701 		}
2702 	}
2703 #ifdef CONFIG_IP_MROUTE
2704 	eptr = (struct rtattr*)skb->tail;
2705 #endif
2706 	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2707 	if (rt->fl.iif) {
2708 #ifdef CONFIG_IP_MROUTE
2709 		u32 dst = rt->rt_dst;
2710 
2711 		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2712 		    ipv4_devconf.mc_forwarding) {
2713 			int err = ipmr_get_route(skb, r, nowait);
2714 			if (err <= 0) {
2715 				if (!nowait) {
2716 					if (err == 0)
2717 						return 0;
2718 					goto nlmsg_failure;
2719 				} else {
2720 					if (err == -EMSGSIZE)
2721 						goto nlmsg_failure;
2722 					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2723 				}
2724 			}
2725 		} else
2726 #endif
2727 			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2728 	}
2729 
2730 	nlh->nlmsg_len = skb->tail - b;
2731 	return skb->len;
2732 
2733 nlmsg_failure:
2734 rtattr_failure:
2735 	skb_trim(skb, b - skb->data);
2736 	return -1;
2737 }
2738 
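/*
 * RTM_GETROUTE handler: build a dummy skb, resolve the requested route
 * (through the input path when RTA_IIF is supplied, through
 * ip_route_output_key() otherwise) and unicast the rt_fill_info() reply back
 * to the requester.
 */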
2739 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2740 {
2741 	struct rtattr **rta = arg;
2742 	struct rtmsg *rtm = NLMSG_DATA(nlh);
2743 	struct rtable *rt = NULL;
2744 	u32 dst = 0;
2745 	u32 src = 0;
2746 	int iif = 0;
2747 	int err = -ENOBUFS;
2748 	struct sk_buff *skb;
2749 
2750 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2751 	if (!skb)
2752 		goto out;
2753 
2754 	/* Reserve room for dummy headers; this skb can pass
2755 	   through a good chunk of the routing engine.
2756 	 */
2757 	skb->mac.raw = skb->nh.raw = skb->data;
2758 
2759 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2760 	skb->nh.iph->protocol = IPPROTO_ICMP;
2761 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2762 
2763 	if (rta[RTA_SRC - 1])
2764 		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2765 	if (rta[RTA_DST - 1])
2766 		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2767 	if (rta[RTA_IIF - 1])
2768 		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2769 
2770 	if (iif) {
2771 		struct net_device *dev = __dev_get_by_index(iif);
2772 		err = -ENODEV;
2773 		if (!dev)
2774 			goto out_free;
2775 		skb->protocol	= htons(ETH_P_IP);
2776 		skb->dev	= dev;
2777 		local_bh_disable();
2778 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2779 		local_bh_enable();
2780 		rt = (struct rtable*)skb->dst;
2781 		if (!err && rt->u.dst.error)
2782 			err = -rt->u.dst.error;
2783 	} else {
2784 		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2785 							 .saddr = src,
2786 							 .tos = rtm->rtm_tos } } };
2787 		int oif = 0;
2788 		if (rta[RTA_OIF - 1])
2789 			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2790 		fl.oif = oif;
2791 		err = ip_route_output_key(&rt, &fl);
2792 	}
2793 	if (err)
2794 		goto out_free;
2795 
2796 	skb->dst = &rt->u.dst;
2797 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2798 		rt->rt_flags |= RTCF_NOTIFY;
2799 
2800 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2801 
2802 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2803 				RTM_NEWROUTE, 0, 0);
2804 	if (!err)
2805 		goto out_free;
2806 	if (err < 0) {
2807 		err = -EMSGSIZE;
2808 		goto out_free;
2809 	}
2810 
2811 	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2812 	if (err > 0)
2813 		err = 0;
2814 out:	return err;
2815 
2816 out_free:
2817 	kfree_skb(skb);
2818 	goto out;
2819 }
2820 
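/*
 * Netlink dump callback: walk every hash bucket and emit one RTM_NEWROUTE
 * message per cached route, resuming from the position saved in cb->args[].
 */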
2821 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2822 {
2823 	struct rtable *rt;
2824 	int h, s_h;
2825 	int idx, s_idx;
2826 
2827 	s_h = cb->args[0];
2828 	s_idx = idx = cb->args[1];
2829 	for (h = 0; h <= rt_hash_mask; h++) {
2830 		if (h < s_h) continue;
2831 		if (h > s_h)
2832 			s_idx = 0;
2833 		rcu_read_lock_bh();
2834 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2835 		     rt = rcu_dereference(rt->u.rt_next), idx++) {
2836 			if (idx < s_idx)
2837 				continue;
2838 			skb->dst = dst_clone(&rt->u.dst);
2839 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2840 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2841 					 1, NLM_F_MULTI) <= 0) {
2842 				dst_release(xchg(&skb->dst, NULL));
2843 				rcu_read_unlock_bh();
2844 				goto done;
2845 			}
2846 			dst_release(xchg(&skb->dst, NULL));
2847 		}
2848 		rcu_read_unlock_bh();
2849 	}
2850 
2851 done:
2852 	cb->args[0] = h;
2853 	cb->args[1] = idx;
2854 	return skb->len;
2855 }
2856 
2857 void ip_rt_multicast_event(struct in_device *in_dev)
2858 {
2859 	rt_cache_flush(0);
2860 }
2861 
2862 #ifdef CONFIG_SYSCTL
2863 static int flush_delay;
2864 
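/*
 * Writing a delay value to the "flush" sysctl triggers rt_cache_flush();
 * the entry is write-only and reads fail with -EINVAL.
 */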
2865 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2866 					struct file *filp, void __user *buffer,
2867 					size_t *lenp, loff_t *ppos)
2868 {
2869 	if (write) {
2870 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2871 		rt_cache_flush(flush_delay);
2872 		return 0;
2873 	}
2874 
2875 	return -EINVAL;
2876 }
2877 
2878 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2879 						int __user *name,
2880 						int nlen,
2881 						void __user *oldval,
2882 						size_t __user *oldlenp,
2883 						void __user *newval,
2884 						size_t newlen,
2885 						void **context)
2886 {
2887 	int delay;
2888 	if (newlen != sizeof(int))
2889 		return -EINVAL;
2890 	if (get_user(delay, (int __user *)newval))
2891 		return -EFAULT;
2892 	rt_cache_flush(delay);
2893 	return 0;
2894 }
2895 
2896 ctl_table ipv4_route_table[] = {
2897 	{
2898 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2899 		.procname	= "flush",
2900 		.data		= &flush_delay,
2901 		.maxlen		= sizeof(int),
2902 		.mode		= 0200,
2903 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2904 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2905 	},
2906 	{
2907 		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
2908 		.procname	= "min_delay",
2909 		.data		= &ip_rt_min_delay,
2910 		.maxlen		= sizeof(int),
2911 		.mode		= 0644,
2912 		.proc_handler	= &proc_dointvec_jiffies,
2913 		.strategy	= &sysctl_jiffies,
2914 	},
2915 	{
2916 		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
2917 		.procname	= "max_delay",
2918 		.data		= &ip_rt_max_delay,
2919 		.maxlen		= sizeof(int),
2920 		.mode		= 0644,
2921 		.proc_handler	= &proc_dointvec_jiffies,
2922 		.strategy	= &sysctl_jiffies,
2923 	},
2924 	{
2925 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2926 		.procname	= "gc_thresh",
2927 		.data		= &ipv4_dst_ops.gc_thresh,
2928 		.maxlen		= sizeof(int),
2929 		.mode		= 0644,
2930 		.proc_handler	= &proc_dointvec,
2931 	},
2932 	{
2933 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2934 		.procname	= "max_size",
2935 		.data		= &ip_rt_max_size,
2936 		.maxlen		= sizeof(int),
2937 		.mode		= 0644,
2938 		.proc_handler	= &proc_dointvec,
2939 	},
2940 	{
2941 		/*  Deprecated. Use gc_min_interval_ms */
2942 
2943 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2944 		.procname	= "gc_min_interval",
2945 		.data		= &ip_rt_gc_min_interval,
2946 		.maxlen		= sizeof(int),
2947 		.mode		= 0644,
2948 		.proc_handler	= &proc_dointvec_jiffies,
2949 		.strategy	= &sysctl_jiffies,
2950 	},
2951 	{
2952 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2953 		.procname	= "gc_min_interval_ms",
2954 		.data		= &ip_rt_gc_min_interval,
2955 		.maxlen		= sizeof(int),
2956 		.mode		= 0644,
2957 		.proc_handler	= &proc_dointvec_ms_jiffies,
2958 		.strategy	= &sysctl_ms_jiffies,
2959 	},
2960 	{
2961 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2962 		.procname	= "gc_timeout",
2963 		.data		= &ip_rt_gc_timeout,
2964 		.maxlen		= sizeof(int),
2965 		.mode		= 0644,
2966 		.proc_handler	= &proc_dointvec_jiffies,
2967 		.strategy	= &sysctl_jiffies,
2968 	},
2969 	{
2970 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2971 		.procname	= "gc_interval",
2972 		.data		= &ip_rt_gc_interval,
2973 		.maxlen		= sizeof(int),
2974 		.mode		= 0644,
2975 		.proc_handler	= &proc_dointvec_jiffies,
2976 		.strategy	= &sysctl_jiffies,
2977 	},
2978 	{
2979 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2980 		.procname	= "redirect_load",
2981 		.data		= &ip_rt_redirect_load,
2982 		.maxlen		= sizeof(int),
2983 		.mode		= 0644,
2984 		.proc_handler	= &proc_dointvec,
2985 	},
2986 	{
2987 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2988 		.procname	= "redirect_number",
2989 		.data		= &ip_rt_redirect_number,
2990 		.maxlen		= sizeof(int),
2991 		.mode		= 0644,
2992 		.proc_handler	= &proc_dointvec,
2993 	},
2994 	{
2995 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2996 		.procname	= "redirect_silence",
2997 		.data		= &ip_rt_redirect_silence,
2998 		.maxlen		= sizeof(int),
2999 		.mode		= 0644,
3000 		.proc_handler	= &proc_dointvec,
3001 	},
3002 	{
3003 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
3004 		.procname	= "error_cost",
3005 		.data		= &ip_rt_error_cost,
3006 		.maxlen		= sizeof(int),
3007 		.mode		= 0644,
3008 		.proc_handler	= &proc_dointvec,
3009 	},
3010 	{
3011 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
3012 		.procname	= "error_burst",
3013 		.data		= &ip_rt_error_burst,
3014 		.maxlen		= sizeof(int),
3015 		.mode		= 0644,
3016 		.proc_handler	= &proc_dointvec,
3017 	},
3018 	{
3019 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
3020 		.procname	= "gc_elasticity",
3021 		.data		= &ip_rt_gc_elasticity,
3022 		.maxlen		= sizeof(int),
3023 		.mode		= 0644,
3024 		.proc_handler	= &proc_dointvec,
3025 	},
3026 	{
3027 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
3028 		.procname	= "mtu_expires",
3029 		.data		= &ip_rt_mtu_expires,
3030 		.maxlen		= sizeof(int),
3031 		.mode		= 0644,
3032 		.proc_handler	= &proc_dointvec_jiffies,
3033 		.strategy	= &sysctl_jiffies,
3034 	},
3035 	{
3036 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
3037 		.procname	= "min_pmtu",
3038 		.data		= &ip_rt_min_pmtu,
3039 		.maxlen		= sizeof(int),
3040 		.mode		= 0644,
3041 		.proc_handler	= &proc_dointvec,
3042 	},
3043 	{
3044 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
3045 		.procname	= "min_adv_mss",
3046 		.data		= &ip_rt_min_advmss,
3047 		.maxlen		= sizeof(int),
3048 		.mode		= 0644,
3049 		.proc_handler	= &proc_dointvec,
3050 	},
3051 	{
3052 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3053 		.procname	= "secret_interval",
3054 		.data		= &ip_rt_secret_interval,
3055 		.maxlen		= sizeof(int),
3056 		.mode		= 0644,
3057 		.proc_handler	= &proc_dointvec_jiffies,
3058 		.strategy	= &sysctl_jiffies,
3059 	},
3060 	{ .ctl_name = 0 }
3061 };
3062 #endif
3063 
3064 #ifdef CONFIG_NET_CLS_ROUTE
3065 struct ip_rt_acct *ip_rt_acct;
3066 
3067 /* This code sucks.  But you should have seen it before! --RR */
3068 
3069 /* IP route accounting ptr for this logical cpu number. */
3070 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3071 
3072 #ifdef CONFIG_PROC_FS
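/*
 * /proc read handler for "rt_acct": copy CPU 0's accounting table into the
 * buffer and then add in the tables of all possible CPUs, one u32 counter
 * at a time.  Offsets and lengths must be multiples of 4.
 */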
3073 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3074 			   int length, int *eof, void *data)
3075 {
3076 	unsigned int i;
3077 
3078 	if ((offset & 3) || (length & 3))
3079 		return -EIO;
3080 
3081 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
3082 		*eof = 1;
3083 		return 0;
3084 	}
3085 
3086 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3087 		length = sizeof(struct ip_rt_acct) * 256 - offset;
3088 		*eof = 1;
3089 	}
3090 
3091 	offset /= sizeof(u32);
3092 
3093 	if (length > 0) {
3094 		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3095 		u32 *dst = (u32 *) buffer;
3096 
3097 		/* Copy first cpu. */
3098 		*start = buffer;
3099 		memcpy(dst, src, length);
3100 
3101 		/* Add the other cpus in, one int at a time */
3102 		for_each_possible_cpu(i) {
3103 			unsigned int j;
3104 
3105 			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3106 
3107 			for (j = 0; j < length/4; j++)
3108 				dst[j] += src[j];
3109 		}
3110 	}
3111 	return length;
3112 }
3113 #endif /* CONFIG_PROC_FS */
3114 #endif /* CONFIG_NET_CLS_ROUTE */
3115 
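/*
 * "rhash_entries=" boot parameter: requested size of the route cache hash
 * table, passed to alloc_large_system_hash() in ip_rt_init().
 */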
3116 static __initdata unsigned long rhash_entries;
3117 static int __init set_rhash_entries(char *str)
3118 {
3119 	if (!str)
3120 		return 0;
3121 	rhash_entries = simple_strtoul(str, &str, 0);
3122 	return 1;
3123 }
3124 __setup("rhash_entries=", set_rhash_entries);
3125 
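/*
 * Boot-time initialisation: seed the hash secret, allocate the dst slab cache
 * and the route cache hash table, start the flush/expire/secret timers,
 * create the /proc entries and initialise devinet, fib and (optionally) xfrm.
 */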
3126 int __init ip_rt_init(void)
3127 {
3128 	int rc = 0;
3129 
3130 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3131 			     (jiffies ^ (jiffies >> 7)));
3132 
3133 #ifdef CONFIG_NET_CLS_ROUTE
3134 	{
3135 	int order;
3136 	for (order = 0;
3137 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3138 		/* NOTHING */;
3139 	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3140 	if (!ip_rt_acct)
3141 		panic("IP: failed to allocate ip_rt_acct\n");
3142 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
3143 	}
3144 #endif
3145 
3146 	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3147 						     sizeof(struct rtable),
3148 						     0, SLAB_HWCACHE_ALIGN,
3149 						     NULL, NULL);
3150 
3151 	if (!ipv4_dst_ops.kmem_cachep)
3152 		panic("IP: failed to allocate ip_dst_cache\n");
3153 
3154 	rt_hash_table = (struct rt_hash_bucket *)
3155 		alloc_large_system_hash("IP route cache",
3156 					sizeof(struct rt_hash_bucket),
3157 					rhash_entries,
3158 					(num_physpages >= 128 * 1024) ?
3159 					15 : 17,
3160 					0,
3161 					&rt_hash_log,
3162 					&rt_hash_mask,
3163 					0);
3164 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3165 	rt_hash_lock_init();
3166 
3167 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3168 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3169 
3170 	devinet_init();
3171 	ip_fib_init();
3172 
3173 	init_timer(&rt_flush_timer);
3174 	rt_flush_timer.function = rt_run_flush;
3175 	init_timer(&rt_periodic_timer);
3176 	rt_periodic_timer.function = rt_check_expire;
3177 	init_timer(&rt_secret_timer);
3178 	rt_secret_timer.function = rt_secret_rebuild;
3179 
3180 	/* All the timers started at system startup tend
3181 	   to synchronize. Perturb them a bit.
3182 	 */
3183 	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3184 					ip_rt_gc_interval;
3185 	add_timer(&rt_periodic_timer);
3186 
3187 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3188 		ip_rt_secret_interval;
3189 	add_timer(&rt_secret_timer);
3190 
3191 #ifdef CONFIG_PROC_FS
3192 	{
3193 	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3194 	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3195 	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3196 			    		     proc_net_stat))) {
3197 		return -ENOMEM;
3198 	}
3199 	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3200 	}
3201 #ifdef CONFIG_NET_CLS_ROUTE
3202 	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3203 #endif
3204 #endif
3205 #ifdef CONFIG_XFRM
3206 	xfrm_init();
3207 	xfrm4_init();
3208 #endif
3209 	return rc;
3210 }
3211 
3212 EXPORT_SYMBOL(__ip_select_ident);
3213 EXPORT_SYMBOL(ip_route_input);
3214 EXPORT_SYMBOL(ip_route_output_key);
3215