xref: /linux/net/ipv4/route.c (revision 14b42963f64b98ab61fa9723c03d71aa5ef4f862)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *		Alan Cox	:	Verify area fixes.
18  *		Alan Cox	:	cli() protects routing changes
19  *		Rui Oliveira	:	ICMP routing table updates
20  *		(rco@di.uminho.pt)	Routing table insertion and update
21  *		Linus Torvalds	:	Rewrote bits to be sensible
22  *		Alan Cox	:	Added BSD route gw semantics
23  *		Alan Cox	:	Super /proc >4K
24  *		Alan Cox	:	MTU in route table
25  *		Alan Cox	: 	MSS actually. Also added the window
26  *					clamper.
27  *		Sam Lantinga	:	Fixed route matching in rt_del()
28  *		Alan Cox	:	Routing cache support.
29  *		Alan Cox	:	Removed compatibility cruft.
30  *		Alan Cox	:	RTF_REJECT support.
31  *		Alan Cox	:	TCP irtt support.
32  *		Jonathan Naylor	:	Added Metric support.
33  *	Miquel van Smoorenburg	:	BSD API fixes.
34  *	Miquel van Smoorenburg	:	Metrics.
35  *		Alan Cox	:	Use __u32 properly
36  *		Alan Cox	:	Aligned routing errors more closely with BSD;
37  *					our system is still very different.
38  *		Alan Cox	:	Faster /proc handling
39  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40  *					routing caches and better behaviour.
41  *
42  *		Olaf Erb	:	irtt wasn't being copied right.
43  *		Bjorn Ekwall	:	Kerneld route support.
44  *		Alan Cox	:	Multicast fixed (I hope)
45  * 		Pavel Krauz	:	Limited broadcast fixed
46  *		Mike McLagan	:	Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
48  *					route.c and rewritten from scratch.
49  *		Andi Kleen	:	Load-limit warning messages.
50  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54  *		Marc Boucher	:	routing by fwmark
55  *	Robert Olsson		:	Added rt_cache statistics
56  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
57  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
58  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
59  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
60  *
61  *		This program is free software; you can redistribute it and/or
62  *		modify it under the terms of the GNU General Public License
63  *		as published by the Free Software Foundation; either version
64  *		2 of the License, or (at your option) any later version.
65  */
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/mm.h>
75 #include <linux/bootmem.h>
76 #include <linux/string.h>
77 #include <linux/socket.h>
78 #include <linux/sockios.h>
79 #include <linux/errno.h>
80 #include <linux/in.h>
81 #include <linux/inet.h>
82 #include <linux/netdevice.h>
83 #include <linux/proc_fs.h>
84 #include <linux/init.h>
85 #include <linux/skbuff.h>
86 #include <linux/rtnetlink.h>
87 #include <linux/inetdevice.h>
88 #include <linux/igmp.h>
89 #include <linux/pkt_sched.h>
90 #include <linux/mroute.h>
91 #include <linux/netfilter_ipv4.h>
92 #include <linux/random.h>
93 #include <linux/jhash.h>
94 #include <linux/rcupdate.h>
95 #include <linux/times.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/ip_mp_alg.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 
111 #define RT_FL_TOS(oldflp) \
112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113 
114 #define IP_MAX_MTU	0xFFF0
115 
116 #define RT_GC_TIMEOUT (300*HZ)
117 
118 static int ip_rt_min_delay		= 2 * HZ;
119 static int ip_rt_max_delay		= 10 * HZ;
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval		= 60 * HZ;
123 static int ip_rt_gc_min_interval	= HZ / 2;
124 static int ip_rt_redirect_number	= 9;
125 static int ip_rt_redirect_load		= HZ / 50;
126 static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost		= HZ;
128 static int ip_rt_error_burst		= 5 * HZ;
129 static int ip_rt_gc_elasticity		= 8;
130 static int ip_rt_mtu_expires		= 10 * 60 * HZ;
131 static int ip_rt_min_pmtu		= 512 + 20 + 20;
132 static int ip_rt_min_advmss		= 256;
133 static int ip_rt_secret_interval	= 10 * 60 * HZ;
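/*
 * The timing parameters above are expressed in jiffies, hence the HZ scaling.
 * Note that with these defaults ip_rt_redirect_silence equals
 * ip_rt_redirect_load << (ip_rt_redirect_number + 1); see the exponential
 * backoff in ip_rt_send_redirect() below.
 */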
134 static unsigned long rt_deadline;
135 
136 #define RTprint(a...)	printk(KERN_DEBUG a)
137 
138 static struct timer_list rt_flush_timer;
139 static struct timer_list rt_periodic_timer;
140 static struct timer_list rt_secret_timer;
141 
142 /*
143  *	Interface to generic destination cache.
144  */
145 
146 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
147 static void		 ipv4_dst_destroy(struct dst_entry *dst);
148 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
149 					 struct net_device *dev, int how);
150 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
151 static void		 ipv4_link_failure(struct sk_buff *skb);
152 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
153 static int rt_garbage_collect(void);
154 
155 
156 static struct dst_ops ipv4_dst_ops = {
157 	.family =		AF_INET,
158 	.protocol =		__constant_htons(ETH_P_IP),
159 	.gc =			rt_garbage_collect,
160 	.check =		ipv4_dst_check,
161 	.destroy =		ipv4_dst_destroy,
162 	.ifdown =		ipv4_dst_ifdown,
163 	.negative_advice =	ipv4_negative_advice,
164 	.link_failure =		ipv4_link_failure,
165 	.update_pmtu =		ip_rt_update_pmtu,
166 	.entry_size =		sizeof(struct rtable),
167 };
168 
169 #define ECN_OR_COST(class)	TC_PRIO_##class
170 
171 __u8 ip_tos2prio[16] = {
172 	TC_PRIO_BESTEFFORT,
173 	ECN_OR_COST(FILLER),
174 	TC_PRIO_BESTEFFORT,
175 	ECN_OR_COST(BESTEFFORT),
176 	TC_PRIO_BULK,
177 	ECN_OR_COST(BULK),
178 	TC_PRIO_BULK,
179 	ECN_OR_COST(BULK),
180 	TC_PRIO_INTERACTIVE,
181 	ECN_OR_COST(INTERACTIVE),
182 	TC_PRIO_INTERACTIVE,
183 	ECN_OR_COST(INTERACTIVE),
184 	TC_PRIO_INTERACTIVE_BULK,
185 	ECN_OR_COST(INTERACTIVE_BULK),
186 	TC_PRIO_INTERACTIVE_BULK,
187 	ECN_OR_COST(INTERACTIVE_BULK)
188 };
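/*
 * This table is indexed by the four TOS bits shifted right by one, i.e.
 * ip_tos2prio[IPTOS_TOS(tos) >> 1] (see rt_tos2priority() in
 * include/net/route.h); in this revision ECN_OR_COST(class) simply expands
 * to TC_PRIO_##class.  A minimal illustrative lookup, assuming the IPTOS_*
 * and TC_PRIO_* values from <linux/ip.h> and <linux/pkt_sched.h>:
 *
 *	u8 tos = 0x10;                              (IPTOS_LOWDELAY)
 *	int prio = ip_tos2prio[(tos & 0x1E) >> 1];  (yields TC_PRIO_INTERACTIVE)
 */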
189 
190 
191 /*
192  * Route cache.
193  */
194 
195 /* The locking scheme is rather straightforward:
196  *
197  * 1) Read-Copy Update protects the buckets of the central route hash.
198  * 2) Only writers remove entries, and they hold the lock
199  *    as they look at rtable reference counts.
200  * 3) Only readers acquire references to rtable entries;
201  *    they do so with atomic increments and with the
202  *    lock held.
203  */
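/*
 * A minimal sketch of that scheme, mirroring the patterns used later in this
 * file (ip_rt_frag_needed() for the read side, rt_del() for the write side);
 * it is illustrative only and relies on rt_hash_table, rt_hash_lock_addr()
 * and the rtable chaining defined below:
 *
 *	reader - RCU only, no bucket lock:
 *		rcu_read_lock();
 *		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *		     rth = rcu_dereference(rth->u.rt_next))
 *			;	inspect rth, dst_hold(&rth->u.dst) if a reference is needed
 *		rcu_read_unlock();
 *
 *	writer - bucket spinlock, entries freed via call_rcu_bh():
 *		spin_lock_bh(rt_hash_lock_addr(hash));
 *		unlink an rtable from the chain, then rt_free() it
 *		spin_unlock_bh(rt_hash_lock_addr(hash));
 */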
204 
205 struct rt_hash_bucket {
206 	struct rtable	*chain;
207 };
208 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
209 	defined(CONFIG_PROVE_LOCKING)
210 /*
211  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
212  * The size of this table is a power of two and depends on the number of CPUs.
213  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
214  */
215 #ifdef CONFIG_LOCKDEP
216 # define RT_HASH_LOCK_SZ	256
217 #else
218 # if NR_CPUS >= 32
219 #  define RT_HASH_LOCK_SZ	4096
220 # elif NR_CPUS >= 16
221 #  define RT_HASH_LOCK_SZ	2048
222 # elif NR_CPUS >= 8
223 #  define RT_HASH_LOCK_SZ	1024
224 # elif NR_CPUS >= 4
225 #  define RT_HASH_LOCK_SZ	512
226 # else
227 #  define RT_HASH_LOCK_SZ	256
228 # endif
229 #endif
230 
231 static spinlock_t	*rt_hash_locks;
232 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
233 # define rt_hash_lock_init()	{ \
234 		int i; \
235 		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
236 		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
237 		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
238 			spin_lock_init(&rt_hash_locks[i]); \
239 		}
240 #else
241 # define rt_hash_lock_addr(slot) NULL
242 # define rt_hash_lock_init()
243 #endif
244 
245 static struct rt_hash_bucket 	*rt_hash_table;
246 static unsigned			rt_hash_mask;
247 static int			rt_hash_log;
248 static unsigned int		rt_hash_rnd;
249 
250 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
251 #define RT_CACHE_STAT_INC(field) \
252 	(__raw_get_cpu_var(rt_cache_stat).field++)
253 
254 static int rt_intern_hash(unsigned hash, struct rtable *rth,
255 				struct rtable **res);
256 
257 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
258 {
259 	return (jhash_2words(daddr, saddr, rt_hash_rnd)
260 		& rt_hash_mask);
261 }
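/*
 * Callers do not hash the raw (daddr, saddr) pair: input paths fold the
 * interface index into the source key, e.g.
 *
 *	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5));
 *
 * (see ip_route_input_mc() below), and output paths use
 * fl4_src ^ (oif << 5) in the same way (see ipv4_negative_advice()).
 * rt_hash_rnd is re-seeded in rt_run_flush(), so bucket placement changes
 * across full cache flushes.
 */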
262 
263 #ifdef CONFIG_PROC_FS
264 struct rt_cache_iter_state {
265 	int bucket;
266 };
267 
268 static struct rtable *rt_cache_get_first(struct seq_file *seq)
269 {
270 	struct rtable *r = NULL;
271 	struct rt_cache_iter_state *st = seq->private;
272 
273 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
274 		rcu_read_lock_bh();
275 		r = rt_hash_table[st->bucket].chain;
276 		if (r)
277 			break;
278 		rcu_read_unlock_bh();
279 	}
280 	return r;
281 }
282 
283 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
284 {
285 	struct rt_cache_iter_state *st = rcu_dereference(seq->private);
286 
287 	r = r->u.rt_next;
288 	while (!r) {
289 		rcu_read_unlock_bh();
290 		if (--st->bucket < 0)
291 			break;
292 		rcu_read_lock_bh();
293 		r = rt_hash_table[st->bucket].chain;
294 	}
295 	return r;
296 }
297 
298 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
299 {
300 	struct rtable *r = rt_cache_get_first(seq);
301 
302 	if (r)
303 		while (pos && (r = rt_cache_get_next(seq, r)))
304 			--pos;
305 	return pos ? NULL : r;
306 }
307 
308 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
309 {
310 	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
311 }
312 
313 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
314 {
315 	struct rtable *r = NULL;
316 
317 	if (v == SEQ_START_TOKEN)
318 		r = rt_cache_get_first(seq);
319 	else
320 		r = rt_cache_get_next(seq, v);
321 	++*pos;
322 	return r;
323 }
324 
325 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
326 {
327 	if (v && v != SEQ_START_TOKEN)
328 		rcu_read_unlock_bh();
329 }
330 
331 static int rt_cache_seq_show(struct seq_file *seq, void *v)
332 {
333 	if (v == SEQ_START_TOKEN)
334 		seq_printf(seq, "%-127s\n",
335 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
336 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
337 			   "HHUptod\tSpecDst");
338 	else {
339 		struct rtable *r = v;
340 		char temp[256];
341 
342 		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
343 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
344 			r->u.dst.dev ? r->u.dst.dev->name : "*",
345 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
346 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
347 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
348 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
349 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
350 			dst_metric(&r->u.dst, RTAX_WINDOW),
351 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
352 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
353 			r->fl.fl4_tos,
354 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
355 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
356 				       dev_queue_xmit) : 0,
357 			r->rt_spec_dst);
358 		seq_printf(seq, "%-127s\n", temp);
359         }
360   	return 0;
361 }
362 
363 static struct seq_operations rt_cache_seq_ops = {
364 	.start  = rt_cache_seq_start,
365 	.next   = rt_cache_seq_next,
366 	.stop   = rt_cache_seq_stop,
367 	.show   = rt_cache_seq_show,
368 };
369 
370 static int rt_cache_seq_open(struct inode *inode, struct file *file)
371 {
372 	struct seq_file *seq;
373 	int rc = -ENOMEM;
374 	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
375 
376 	if (!s)
377 		goto out;
378 	rc = seq_open(file, &rt_cache_seq_ops);
379 	if (rc)
380 		goto out_kfree;
381 	seq          = file->private_data;
382 	seq->private = s;
383 	memset(s, 0, sizeof(*s));
384 out:
385 	return rc;
386 out_kfree:
387 	kfree(s);
388 	goto out;
389 }
390 
391 static struct file_operations rt_cache_seq_fops = {
392 	.owner	 = THIS_MODULE,
393 	.open	 = rt_cache_seq_open,
394 	.read	 = seq_read,
395 	.llseek	 = seq_lseek,
396 	.release = seq_release_private,
397 };
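/*
 * These fops back /proc/net/rt_cache.  The actual registration happens in
 * ip_rt_init(), outside this excerpt; a minimal sketch of how such a
 * seq_file is typically wired up in this era (treat the use of
 * proc_net_fops_create() here as an assumption, not a quote of the init
 * code):
 *
 *	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops))
 *		return -ENOMEM;
 */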
398 
399 
400 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
401 {
402 	int cpu;
403 
404 	if (*pos == 0)
405 		return SEQ_START_TOKEN;
406 
407 	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
408 		if (!cpu_possible(cpu))
409 			continue;
410 		*pos = cpu+1;
411 		return &per_cpu(rt_cache_stat, cpu);
412 	}
413 	return NULL;
414 }
415 
416 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
417 {
418 	int cpu;
419 
420 	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
421 		if (!cpu_possible(cpu))
422 			continue;
423 		*pos = cpu+1;
424 		return &per_cpu(rt_cache_stat, cpu);
425 	}
426 	return NULL;
427 
428 }
429 
430 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
431 {
432 
433 }
434 
435 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
436 {
437 	struct rt_cache_stat *st = v;
438 
439 	if (v == SEQ_START_TOKEN) {
440 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
441 		return 0;
442 	}
443 
444 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
445 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
446 		   atomic_read(&ipv4_dst_ops.entries),
447 		   st->in_hit,
448 		   st->in_slow_tot,
449 		   st->in_slow_mc,
450 		   st->in_no_route,
451 		   st->in_brd,
452 		   st->in_martian_dst,
453 		   st->in_martian_src,
454 
455 		   st->out_hit,
456 		   st->out_slow_tot,
457 		   st->out_slow_mc,
458 
459 		   st->gc_total,
460 		   st->gc_ignored,
461 		   st->gc_goal_miss,
462 		   st->gc_dst_overflow,
463 		   st->in_hlist_search,
464 		   st->out_hlist_search
465 		);
466 	return 0;
467 }
468 
469 static struct seq_operations rt_cpu_seq_ops = {
470 	.start  = rt_cpu_seq_start,
471 	.next   = rt_cpu_seq_next,
472 	.stop   = rt_cpu_seq_stop,
473 	.show   = rt_cpu_seq_show,
474 };
475 
476 
477 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
478 {
479 	return seq_open(file, &rt_cpu_seq_ops);
480 }
481 
482 static struct file_operations rt_cpu_seq_fops = {
483 	.owner	 = THIS_MODULE,
484 	.open	 = rt_cpu_seq_open,
485 	.read	 = seq_read,
486 	.llseek	 = seq_lseek,
487 	.release = seq_release,
488 };
489 
490 #endif /* CONFIG_PROC_FS */
491 
492 static __inline__ void rt_free(struct rtable *rt)
493 {
494 	multipath_remove(rt);
495 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
496 }
497 
498 static __inline__ void rt_drop(struct rtable *rt)
499 {
500 	multipath_remove(rt);
501 	ip_rt_put(rt);
502 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
503 }
504 
505 static __inline__ int rt_fast_clean(struct rtable *rth)
506 {
507 	/* Kill broadcast/multicast entries very aggressively if they
508 	   collide in the hash table with more useful entries */
509 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
510 		rth->fl.iif && rth->u.rt_next;
511 }
512 
513 static __inline__ int rt_valuable(struct rtable *rth)
514 {
515 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
516 		rth->u.dst.expires;
517 }
518 
519 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
520 {
521 	unsigned long age;
522 	int ret = 0;
523 
524 	if (atomic_read(&rth->u.dst.__refcnt))
525 		goto out;
526 
527 	ret = 1;
528 	if (rth->u.dst.expires &&
529 	    time_after_eq(jiffies, rth->u.dst.expires))
530 		goto out;
531 
532 	age = jiffies - rth->u.dst.lastuse;
533 	ret = 0;
534 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
535 	    (age <= tmo2 && rt_valuable(rth)))
536 		goto out;
537 	ret = 1;
538 out:	return ret;
539 }
540 
541 /* Bits of score are:
542  * 31: very valuable
543  * 30: not quite useless
544  * 29..0: usage counter
545  */
546 static inline u32 rt_score(struct rtable *rt)
547 {
548 	u32 score = jiffies - rt->u.dst.lastuse;
549 
550 	score = ~score & ~(3<<30);
551 
552 	if (rt_valuable(rt))
553 		score |= (1<<31);
554 
555 	if (!rt->fl.iif ||
556 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
557 		score |= (1<<30);
558 
559 	return score;
560 }
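/*
 * Example: an entry last used this jiffy scores 0x3FFFFFFF in the usage
 * field, and the score drops as the entry ages.  An output route (or any
 * route that is not broadcast/multicast/local) additionally gets bit 30,
 * and a "valuable" one (redirected, notify pending, or carrying an expiry)
 * gets bit 31.  rt_intern_hash() below evicts the unreferenced entry with
 * the lowest score once a chain grows past ip_rt_gc_elasticity.
 */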
561 
562 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
563 {
564 	return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
565 	       fl1->oif     == fl2->oif &&
566 	       fl1->iif     == fl2->iif;
567 }
568 
569 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
570 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
571 						struct rtable *expentry,
572 						int *removed_count)
573 {
574 	int passedexpired = 0;
575 	struct rtable **nextstep = NULL;
576 	struct rtable **rthp = chain_head;
577 	struct rtable *rth;
578 
579 	if (removed_count)
580 		*removed_count = 0;
581 
582 	while ((rth = *rthp) != NULL) {
583 		if (rth == expentry)
584 			passedexpired = 1;
585 
586 		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
587 		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
588 			if (*rthp == expentry) {
589 				*rthp = rth->u.rt_next;
590 				continue;
591 			} else {
592 				*rthp = rth->u.rt_next;
593 				rt_free(rth);
594 				if (removed_count)
595 					++(*removed_count);
596 			}
597 		} else {
598 			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
599 			    passedexpired && !nextstep)
600 				nextstep = &rth->u.rt_next;
601 
602 			rthp = &rth->u.rt_next;
603 		}
604 	}
605 
606 	rt_free(expentry);
607 	if (removed_count)
608 		++(*removed_count);
609 
610 	return nextstep;
611 }
612 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
613 
614 
615 /* This runs via a timer and thus is always in BH context. */
616 static void rt_check_expire(unsigned long dummy)
617 {
618 	static unsigned int rover;
619 	unsigned int i = rover, goal;
620 	struct rtable *rth, **rthp;
621 	unsigned long now = jiffies;
622 	u64 mult;
623 
624 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
625 	if (ip_rt_gc_timeout > 1)
626 		do_div(mult, ip_rt_gc_timeout);
627 	goal = (unsigned int)mult;
628 	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
629 	for (; goal > 0; goal--) {
630 		unsigned long tmo = ip_rt_gc_timeout;
631 
632 		i = (i + 1) & rt_hash_mask;
633 		rthp = &rt_hash_table[i].chain;
634 
635 		if (*rthp == NULL)
636 			continue;
637 		spin_lock(rt_hash_lock_addr(i));
638 		while ((rth = *rthp) != NULL) {
639 			if (rth->u.dst.expires) {
640 				/* Entry is expired even if it is in use */
641 				if (time_before_eq(now, rth->u.dst.expires)) {
642 					tmo >>= 1;
643 					rthp = &rth->u.rt_next;
644 					continue;
645 				}
646 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
647 				tmo >>= 1;
648 				rthp = &rth->u.rt_next;
649 				continue;
650 			}
651 
652 			/* Clean up aged-off entries. */
653 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
654 			/* remove all related balanced entries if necessary */
655 			if (rth->u.dst.flags & DST_BALANCED) {
656 				rthp = rt_remove_balanced_route(
657 					&rt_hash_table[i].chain,
658 					rth, NULL);
659 				if (!rthp)
660 					break;
661 			} else {
662 				*rthp = rth->u.rt_next;
663 				rt_free(rth);
664 			}
665 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
666  			*rthp = rth->u.rt_next;
667  			rt_free(rth);
668 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
669 		}
670 		spin_unlock(rt_hash_lock_addr(i));
671 
672 		/* Fallback loop breaker. */
673 		if (time_after(jiffies, now))
674 			break;
675 	}
676 	rover = i;
677 	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
678 }
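/*
 * Worked example for the goal computation above: with the defaults
 * ip_rt_gc_interval = 60*HZ and ip_rt_gc_timeout = 300*HZ, goal becomes
 * (60*HZ << rt_hash_log) / (300*HZ), i.e. one fifth of the hash buckets per
 * run, so the whole table is swept roughly once per gc_timeout period.
 */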
679 
680 /* This can run from both BH and non-BH contexts, the latter
681  * in the case of a forced flush event.
682  */
683 static void rt_run_flush(unsigned long dummy)
684 {
685 	int i;
686 	struct rtable *rth, *next;
687 
688 	rt_deadline = 0;
689 
690 	get_random_bytes(&rt_hash_rnd, 4);
691 
692 	for (i = rt_hash_mask; i >= 0; i--) {
693 		spin_lock_bh(rt_hash_lock_addr(i));
694 		rth = rt_hash_table[i].chain;
695 		if (rth)
696 			rt_hash_table[i].chain = NULL;
697 		spin_unlock_bh(rt_hash_lock_addr(i));
698 
699 		for (; rth; rth = next) {
700 			next = rth->u.rt_next;
701 			rt_free(rth);
702 		}
703 	}
704 }
705 
706 static DEFINE_SPINLOCK(rt_flush_lock);
707 
708 void rt_cache_flush(int delay)
709 {
710 	unsigned long now = jiffies;
711 	int user_mode = !in_softirq();
712 
713 	if (delay < 0)
714 		delay = ip_rt_min_delay;
715 
716 	/* flush existing multipath state */
717 	multipath_flush();
718 
719 	spin_lock_bh(&rt_flush_lock);
720 
721 	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
722 		long tmo = (long)(rt_deadline - now);
723 
724 		/* If the flush timer is already running
725 		   and the flush request is not immediate (delay > 0):
726 
727 		   if the deadline has not been reached, prolong the timer to "delay",
728 		   otherwise fire it at the deadline time.
729 		 */
730 
731 		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
732 			tmo = 0;
733 
734 		if (delay > tmo)
735 			delay = tmo;
736 	}
737 
738 	if (delay <= 0) {
739 		spin_unlock_bh(&rt_flush_lock);
740 		rt_run_flush(0);
741 		return;
742 	}
743 
744 	if (rt_deadline == 0)
745 		rt_deadline = now + ip_rt_max_delay;
746 
747 	mod_timer(&rt_flush_timer, now+delay);
748 	spin_unlock_bh(&rt_flush_lock);
749 }
750 
751 static void rt_secret_rebuild(unsigned long dummy)
752 {
753 	unsigned long now = jiffies;
754 
755 	rt_cache_flush(0);
756 	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
757 }
758 
759 /*
760    Short description of GC goals.
761 
762    We want an algorithm that keeps the routing cache
763    at an equilibrium point, where the number of aged-off entries
764    stays approximately equal to the number of newly generated ones.
765 
766    The current expiration strength is the variable "expire".
767    We try to adjust it dynamically, so that when the network
768    is idle "expire" is large enough to keep enough warm entries,
769    and when load increases it shrinks to limit the cache size.
770  */
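/*
 * In rt_garbage_collect() below this works out to a simple feedback loop,
 * roughly:
 *
 *	goal missed on a pass:		expire >>= 1;	(be more aggressive)
 *	goal met ("work_done"):		expire += ip_rt_gc_min_interval,
 *					capped at ip_rt_gc_timeout.
 */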
771 
772 static int rt_garbage_collect(void)
773 {
774 	static unsigned long expire = RT_GC_TIMEOUT;
775 	static unsigned long last_gc;
776 	static int rover;
777 	static int equilibrium;
778 	struct rtable *rth, **rthp;
779 	unsigned long now = jiffies;
780 	int goal;
781 
782 	/*
783 	 * Garbage collection is pretty expensive,
784 	 * so do not run it too frequently.
785 	 */
786 
787 	RT_CACHE_STAT_INC(gc_total);
788 
789 	if (now - last_gc < ip_rt_gc_min_interval &&
790 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
791 		RT_CACHE_STAT_INC(gc_ignored);
792 		goto out;
793 	}
794 
795 	/* Calculate the number of entries we want to expire now. */
796 	goal = atomic_read(&ipv4_dst_ops.entries) -
797 		(ip_rt_gc_elasticity << rt_hash_log);
798 	if (goal <= 0) {
799 		if (equilibrium < ipv4_dst_ops.gc_thresh)
800 			equilibrium = ipv4_dst_ops.gc_thresh;
801 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
802 		if (goal > 0) {
803 			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
804 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
805 		}
806 	} else {
807 		/* We are in a dangerous area. Try to reduce the cache really
808 		 * aggressively.
809 		 */
810 		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
811 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
812 	}
813 
814 	if (now - last_gc >= ip_rt_gc_min_interval)
815 		last_gc = now;
816 
817 	if (goal <= 0) {
818 		equilibrium += goal;
819 		goto work_done;
820 	}
821 
822 	do {
823 		int i, k;
824 
825 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
826 			unsigned long tmo = expire;
827 
828 			k = (k + 1) & rt_hash_mask;
829 			rthp = &rt_hash_table[k].chain;
830 			spin_lock_bh(rt_hash_lock_addr(k));
831 			while ((rth = *rthp) != NULL) {
832 				if (!rt_may_expire(rth, tmo, expire)) {
833 					tmo >>= 1;
834 					rthp = &rth->u.rt_next;
835 					continue;
836 				}
837 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
838 				/* remove all related balanced entries
839 				 * if necessary
840 				 */
841 				if (rth->u.dst.flags & DST_BALANCED) {
842 					int r;
843 
844 					rthp = rt_remove_balanced_route(
845 						&rt_hash_table[k].chain,
846 						rth,
847 						&r);
848 					goal -= r;
849 					if (!rthp)
850 						break;
851 				} else {
852 					*rthp = rth->u.rt_next;
853 					rt_free(rth);
854 					goal--;
855 				}
856 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
857 				*rthp = rth->u.rt_next;
858 				rt_free(rth);
859 				goal--;
860 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
861 			}
862 			spin_unlock_bh(rt_hash_lock_addr(k));
863 			if (goal <= 0)
864 				break;
865 		}
866 		rover = k;
867 
868 		if (goal <= 0)
869 			goto work_done;
870 
871 		/* The goal was not achieved. We stop the process if:
872 
873 		   - expire has been reduced to zero; otherwise, expire is halved.
874 		   - the table is not full.
875 		   - we are called from interrupt (softirq) context.
876 		   - the jiffies check is just a fallback/debug loop breaker.
877 		     We will not spin here for a long time in any case.
878 		 */
879 
880 		RT_CACHE_STAT_INC(gc_goal_miss);
881 
882 		if (expire == 0)
883 			break;
884 
885 		expire >>= 1;
886 #if RT_CACHE_DEBUG >= 2
887 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
888 				atomic_read(&ipv4_dst_ops.entries), goal, i);
889 #endif
890 
891 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
892 			goto out;
893 	} while (!in_softirq() && time_before_eq(jiffies, now));
894 
895 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
896 		goto out;
897 	if (net_ratelimit())
898 		printk(KERN_WARNING "dst cache overflow\n");
899 	RT_CACHE_STAT_INC(gc_dst_overflow);
900 	return 1;
901 
902 work_done:
903 	expire += ip_rt_gc_min_interval;
904 	if (expire > ip_rt_gc_timeout ||
905 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
906 		expire = ip_rt_gc_timeout;
907 #if RT_CACHE_DEBUG >= 2
908 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
909 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
910 #endif
911 out:	return 0;
912 }
913 
914 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
915 {
916 	struct rtable	*rth, **rthp;
917 	unsigned long	now;
918 	struct rtable *cand, **candp;
919 	u32 		min_score;
920 	int		chain_length;
921 	int attempts = !in_softirq();
922 
923 restart:
924 	chain_length = 0;
925 	min_score = ~(u32)0;
926 	cand = NULL;
927 	candp = NULL;
928 	now = jiffies;
929 
930 	rthp = &rt_hash_table[hash].chain;
931 
932 	spin_lock_bh(rt_hash_lock_addr(hash));
933 	while ((rth = *rthp) != NULL) {
934 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
935 		if (!(rth->u.dst.flags & DST_BALANCED) &&
936 		    compare_keys(&rth->fl, &rt->fl)) {
937 #else
938 		if (compare_keys(&rth->fl, &rt->fl)) {
939 #endif
940 			/* Put it first */
941 			*rthp = rth->u.rt_next;
942 			/*
943 			 * Since lookup is lockfree, the deletion
944 			 * must be visible to another weakly ordered CPU before
945 			 * the insertion at the start of the hash chain.
946 			 */
947 			rcu_assign_pointer(rth->u.rt_next,
948 					   rt_hash_table[hash].chain);
949 			/*
950 			 * Since lookup is lockfree, the update writes
951 			 * must be ordered for consistency on SMP.
952 			 */
953 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
954 
955 			rth->u.dst.__use++;
956 			dst_hold(&rth->u.dst);
957 			rth->u.dst.lastuse = now;
958 			spin_unlock_bh(rt_hash_lock_addr(hash));
959 
960 			rt_drop(rt);
961 			*rp = rth;
962 			return 0;
963 		}
964 
965 		if (!atomic_read(&rth->u.dst.__refcnt)) {
966 			u32 score = rt_score(rth);
967 
968 			if (score <= min_score) {
969 				cand = rth;
970 				candp = rthp;
971 				min_score = score;
972 			}
973 		}
974 
975 		chain_length++;
976 
977 		rthp = &rth->u.rt_next;
978 	}
979 
980 	if (cand) {
981 		/* ip_rt_gc_elasticity used to be the average chain length;
982 		 * when it is exceeded, gc becomes really aggressive.
983 		 *
984 		 * The second limit is less certain. At the moment it allows
985 		 * only 2 entries per bucket. We will see.
986 		 */
987 		if (chain_length > ip_rt_gc_elasticity) {
988 			*candp = cand->u.rt_next;
989 			rt_free(cand);
990 		}
991 	}
992 
993 	/* Try to bind the route to an ARP neighbour only if it is an
994 	   output route or on a unicast forwarding path.
995 	 */
996 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
997 		int err = arp_bind_neighbour(&rt->u.dst);
998 		if (err) {
999 			spin_unlock_bh(rt_hash_lock_addr(hash));
1000 
1001 			if (err != -ENOBUFS) {
1002 				rt_drop(rt);
1003 				return err;
1004 			}
1005 
1006 			/* Neighbour tables are full and nothing
1007 			   can be released. Try to shrink the route cache;
1008 			   it most likely holds some neighbour records.
1009 			 */
1010 			if (attempts-- > 0) {
1011 				int saved_elasticity = ip_rt_gc_elasticity;
1012 				int saved_int = ip_rt_gc_min_interval;
1013 				ip_rt_gc_elasticity	= 1;
1014 				ip_rt_gc_min_interval	= 0;
1015 				rt_garbage_collect();
1016 				ip_rt_gc_min_interval	= saved_int;
1017 				ip_rt_gc_elasticity	= saved_elasticity;
1018 				goto restart;
1019 			}
1020 
1021 			if (net_ratelimit())
1022 				printk(KERN_WARNING "Neighbour table overflow.\n");
1023 			rt_drop(rt);
1024 			return -ENOBUFS;
1025 		}
1026 	}
1027 
1028 	rt->u.rt_next = rt_hash_table[hash].chain;
1029 #if RT_CACHE_DEBUG >= 2
1030 	if (rt->u.rt_next) {
1031 		struct rtable *trt;
1032 		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1033 		       NIPQUAD(rt->rt_dst));
1034 		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1035 			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1036 		printk("\n");
1037 	}
1038 #endif
1039 	rt_hash_table[hash].chain = rt;
1040 	spin_unlock_bh(rt_hash_lock_addr(hash));
1041 	*rp = rt;
1042 	return 0;
1043 }
1044 
1045 void rt_bind_peer(struct rtable *rt, int create)
1046 {
1047 	static DEFINE_SPINLOCK(rt_peer_lock);
1048 	struct inet_peer *peer;
1049 
1050 	peer = inet_getpeer(rt->rt_dst, create);
1051 
1052 	spin_lock_bh(&rt_peer_lock);
1053 	if (rt->peer == NULL) {
1054 		rt->peer = peer;
1055 		peer = NULL;
1056 	}
1057 	spin_unlock_bh(&rt_peer_lock);
1058 	if (peer)
1059 		inet_putpeer(peer);
1060 }
1061 
1062 /*
1063  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1064  * we can still generate some output.
1065  * Random ID selection looks a bit dangerous because we have no chance of
1066  * selecting an ID that is unique over a reasonable period of time.
1067  * But a broken packet identifier may be better than no packet at all.
1068  */
1069 static void ip_select_fb_ident(struct iphdr *iph)
1070 {
1071 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1072 	static u32 ip_fallback_id;
1073 	u32 salt;
1074 
1075 	spin_lock_bh(&ip_fb_id_lock);
1076 	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1077 	iph->id = htons(salt & 0xFFFF);
1078 	ip_fallback_id = salt;
1079 	spin_unlock_bh(&ip_fb_id_lock);
1080 }
1081 
1082 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1083 {
1084 	struct rtable *rt = (struct rtable *) dst;
1085 
1086 	if (rt) {
1087 		if (rt->peer == NULL)
1088 			rt_bind_peer(rt, 1);
1089 
1090 		/* If peer is attached to destination, it is never detached,
1091 		   so we do not need to grab a lock to dereference it.
1092 		 */
1093 		if (rt->peer) {
1094 			iph->id = htons(inet_getid(rt->peer, more));
1095 			return;
1096 		}
1097 	} else
1098 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1099 		       __builtin_return_address(0));
1100 
1101 	ip_select_fb_ident(iph);
1102 }
1103 
1104 static void rt_del(unsigned hash, struct rtable *rt)
1105 {
1106 	struct rtable **rthp;
1107 
1108 	spin_lock_bh(rt_hash_lock_addr(hash));
1109 	ip_rt_put(rt);
1110 	for (rthp = &rt_hash_table[hash].chain; *rthp;
1111 	     rthp = &(*rthp)->u.rt_next)
1112 		if (*rthp == rt) {
1113 			*rthp = rt->u.rt_next;
1114 			rt_free(rt);
1115 			break;
1116 		}
1117 	spin_unlock_bh(rt_hash_lock_addr(hash));
1118 }
1119 
1120 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1121 		    u32 saddr, struct net_device *dev)
1122 {
1123 	int i, k;
1124 	struct in_device *in_dev = in_dev_get(dev);
1125 	struct rtable *rth, **rthp;
1126 	u32  skeys[2] = { saddr, 0 };
1127 	int  ikeys[2] = { dev->ifindex, 0 };
1128 
1129 	if (!in_dev)
1130 		return;
1131 
1132 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1133 	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1134 		goto reject_redirect;
1135 
1136 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1137 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1138 			goto reject_redirect;
1139 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1140 			goto reject_redirect;
1141 	} else {
1142 		if (inet_addr_type(new_gw) != RTN_UNICAST)
1143 			goto reject_redirect;
1144 	}
1145 
1146 	for (i = 0; i < 2; i++) {
1147 		for (k = 0; k < 2; k++) {
1148 			unsigned hash = rt_hash_code(daddr,
1149 						     skeys[i] ^ (ikeys[k] << 5));
1150 
1151 			rthp=&rt_hash_table[hash].chain;
1152 
1153 			rcu_read_lock();
1154 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1155 				struct rtable *rt;
1156 
1157 				if (rth->fl.fl4_dst != daddr ||
1158 				    rth->fl.fl4_src != skeys[i] ||
1159 				    rth->fl.oif != ikeys[k] ||
1160 				    rth->fl.iif != 0) {
1161 					rthp = &rth->u.rt_next;
1162 					continue;
1163 				}
1164 
1165 				if (rth->rt_dst != daddr ||
1166 				    rth->rt_src != saddr ||
1167 				    rth->u.dst.error ||
1168 				    rth->rt_gateway != old_gw ||
1169 				    rth->u.dst.dev != dev)
1170 					break;
1171 
1172 				dst_hold(&rth->u.dst);
1173 				rcu_read_unlock();
1174 
1175 				rt = dst_alloc(&ipv4_dst_ops);
1176 				if (rt == NULL) {
1177 					ip_rt_put(rth);
1178 					in_dev_put(in_dev);
1179 					return;
1180 				}
1181 
1182 				/* Copy all the information. */
1183 				*rt = *rth;
1184  				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1185 				rt->u.dst.__use		= 1;
1186 				atomic_set(&rt->u.dst.__refcnt, 1);
1187 				rt->u.dst.child		= NULL;
1188 				if (rt->u.dst.dev)
1189 					dev_hold(rt->u.dst.dev);
1190 				if (rt->idev)
1191 					in_dev_hold(rt->idev);
1192 				rt->u.dst.obsolete	= 0;
1193 				rt->u.dst.lastuse	= jiffies;
1194 				rt->u.dst.path		= &rt->u.dst;
1195 				rt->u.dst.neighbour	= NULL;
1196 				rt->u.dst.hh		= NULL;
1197 				rt->u.dst.xfrm		= NULL;
1198 
1199 				rt->rt_flags		|= RTCF_REDIRECTED;
1200 
1201 				/* Gateway is different ... */
1202 				rt->rt_gateway		= new_gw;
1203 
1204 				/* Redirect received -> path was valid */
1205 				dst_confirm(&rth->u.dst);
1206 
1207 				if (rt->peer)
1208 					atomic_inc(&rt->peer->refcnt);
1209 
1210 				if (arp_bind_neighbour(&rt->u.dst) ||
1211 				    !(rt->u.dst.neighbour->nud_state &
1212 					    NUD_VALID)) {
1213 					if (rt->u.dst.neighbour)
1214 						neigh_event_send(rt->u.dst.neighbour, NULL);
1215 					ip_rt_put(rth);
1216 					rt_drop(rt);
1217 					goto do_next;
1218 				}
1219 
1220 				rt_del(hash, rth);
1221 				if (!rt_intern_hash(hash, rt, &rt))
1222 					ip_rt_put(rt);
1223 				goto do_next;
1224 			}
1225 			rcu_read_unlock();
1226 		do_next:
1227 			;
1228 		}
1229 	}
1230 	in_dev_put(in_dev);
1231 	return;
1232 
1233 reject_redirect:
1234 #ifdef CONFIG_IP_ROUTE_VERBOSE
1235 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1236 		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1237 			"%u.%u.%u.%u ignored.\n"
1238 			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1239 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1240 		       NIPQUAD(saddr), NIPQUAD(daddr));
1241 #endif
1242 	in_dev_put(in_dev);
1243 }
1244 
1245 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1246 {
1247 	struct rtable *rt = (struct rtable*)dst;
1248 	struct dst_entry *ret = dst;
1249 
1250 	if (rt) {
1251 		if (dst->obsolete) {
1252 			ip_rt_put(rt);
1253 			ret = NULL;
1254 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1255 			   rt->u.dst.expires) {
1256 			unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1257 						     rt->fl.fl4_src ^
1258 							(rt->fl.oif << 5));
1259 #if RT_CACHE_DEBUG >= 1
1260 			printk(KERN_DEBUG "ip_rt_advice: redirect to "
1261 					  "%u.%u.%u.%u/%02x dropped\n",
1262 				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1263 #endif
1264 			rt_del(hash, rt);
1265 			ret = NULL;
1266 		}
1267 	}
1268 	return ret;
1269 }
1270 
1271 /*
1272  * Algorithm:
1273  *	1. The first ip_rt_redirect_number redirects are sent
1274  *	   with exponential backoff, then we stop sending them at all,
1275  *	   assuming that the host ignores our redirects.
1276  *	2. If we did not see packets requiring redirects
1277  *	   during ip_rt_redirect_silence, we assume that the host
1278  *	   forgot redirected route and start to send redirects again.
1279  *
1280  * This algorithm is much cheaper and more intelligent than dumb load limiting
1281  * in icmp.c.
1282  *
1283  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1284  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1285  */
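/*
 * Concretely, with the token counter kept in u.dst.rate_tokens, the next
 * redirect is sent only once
 *
 *	jiffies > rate_last + (ip_rt_redirect_load << rate_tokens)
 *
 * and after ip_rt_redirect_number redirects nothing more is sent until the
 * flow stays quiet for ip_rt_redirect_silence, at which point the counter
 * is reset.
 */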
1286 
1287 void ip_rt_send_redirect(struct sk_buff *skb)
1288 {
1289 	struct rtable *rt = (struct rtable*)skb->dst;
1290 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1291 
1292 	if (!in_dev)
1293 		return;
1294 
1295 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1296 		goto out;
1297 
1298 	/* No redirected packets during ip_rt_redirect_silence;
1299 	 * reset the algorithm.
1300 	 */
1301 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1302 		rt->u.dst.rate_tokens = 0;
1303 
1304 	/* Too many ignored redirects; do not send anything.
1305 	 * Set u.dst.rate_last to the last seen redirected packet.
1306 	 */
1307 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1308 		rt->u.dst.rate_last = jiffies;
1309 		goto out;
1310 	}
1311 
1312 	/* Check for load limit; set rate_last to the latest sent
1313 	 * redirect.
1314 	 */
1315 	if (time_after(jiffies,
1316 		       (rt->u.dst.rate_last +
1317 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1318 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1319 		rt->u.dst.rate_last = jiffies;
1320 		++rt->u.dst.rate_tokens;
1321 #ifdef CONFIG_IP_ROUTE_VERBOSE
1322 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1323 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1324 		    net_ratelimit())
1325 			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1326 				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1327 				NIPQUAD(rt->rt_src), rt->rt_iif,
1328 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1329 #endif
1330 	}
1331 out:
1332         in_dev_put(in_dev);
1333 }
1334 
1335 static int ip_error(struct sk_buff *skb)
1336 {
1337 	struct rtable *rt = (struct rtable*)skb->dst;
1338 	unsigned long now;
1339 	int code;
1340 
1341 	switch (rt->u.dst.error) {
1342 		case EINVAL:
1343 		default:
1344 			goto out;
1345 		case EHOSTUNREACH:
1346 			code = ICMP_HOST_UNREACH;
1347 			break;
1348 		case ENETUNREACH:
1349 			code = ICMP_NET_UNREACH;
1350 			break;
1351 		case EACCES:
1352 			code = ICMP_PKT_FILTERED;
1353 			break;
1354 	}
1355 
1356 	now = jiffies;
1357 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1358 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1359 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1360 	rt->u.dst.rate_last = now;
1361 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1362 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1363 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1364 	}
1365 
1366 out:	kfree_skb(skb);
1367 	return 0;
1368 }
1369 
1370 /*
1371  *	The last two values are not from the RFC but
1372  *	are needed for AMPRnet AX.25 paths.
1373  */
1374 
1375 static const unsigned short mtu_plateau[] =
1376 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1377 
1378 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1379 {
1380 	int i;
1381 
1382 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1383 		if (old_mtu > mtu_plateau[i])
1384 			return mtu_plateau[i];
1385 	return 68;
1386 }
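/*
 * Example: a Fragmentation Needed message carrying no next-hop MTU
 * (new_mtu == 0) for a datagram with tot_len 1500 falls back to the next
 * plateau below, 1492; tot_len 600 yields 576, and anything at or below 128
 * bottoms out at the 68-byte minimum.  ip_rt_frag_needed() additionally
 * clamps the result to ip_rt_min_pmtu and locks the MTU metric when it does.
 */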
1387 
1388 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1389 {
1390 	int i;
1391 	unsigned short old_mtu = ntohs(iph->tot_len);
1392 	struct rtable *rth;
1393 	u32  skeys[2] = { iph->saddr, 0, };
1394 	u32  daddr = iph->daddr;
1395 	unsigned short est_mtu = 0;
1396 
1397 	if (ipv4_config.no_pmtu_disc)
1398 		return 0;
1399 
1400 	for (i = 0; i < 2; i++) {
1401 		unsigned hash = rt_hash_code(daddr, skeys[i]);
1402 
1403 		rcu_read_lock();
1404 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1405 		     rth = rcu_dereference(rth->u.rt_next)) {
1406 			if (rth->fl.fl4_dst == daddr &&
1407 			    rth->fl.fl4_src == skeys[i] &&
1408 			    rth->rt_dst  == daddr &&
1409 			    rth->rt_src  == iph->saddr &&
1410 			    rth->fl.iif == 0 &&
1411 			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1412 				unsigned short mtu = new_mtu;
1413 
1414 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1415 
1416 					/* BSD 4.2 compatibility hack :-( */
1417 					if (mtu == 0 &&
1418 					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1419 					    old_mtu >= 68 + (iph->ihl << 2))
1420 						old_mtu -= iph->ihl << 2;
1421 
1422 					mtu = guess_mtu(old_mtu);
1423 				}
1424 				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1425 					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1426 						dst_confirm(&rth->u.dst);
1427 						if (mtu < ip_rt_min_pmtu) {
1428 							mtu = ip_rt_min_pmtu;
1429 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1430 								(1 << RTAX_MTU);
1431 						}
1432 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1433 						dst_set_expires(&rth->u.dst,
1434 							ip_rt_mtu_expires);
1435 					}
1436 					est_mtu = mtu;
1437 				}
1438 			}
1439 		}
1440 		rcu_read_unlock();
1441 	}
1442 	return est_mtu ? : new_mtu;
1443 }
1444 
1445 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1446 {
1447 	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1448 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1449 		if (mtu < ip_rt_min_pmtu) {
1450 			mtu = ip_rt_min_pmtu;
1451 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1452 		}
1453 		dst->metrics[RTAX_MTU-1] = mtu;
1454 		dst_set_expires(dst, ip_rt_mtu_expires);
1455 	}
1456 }
1457 
1458 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1459 {
1460 	return NULL;
1461 }
1462 
1463 static void ipv4_dst_destroy(struct dst_entry *dst)
1464 {
1465 	struct rtable *rt = (struct rtable *) dst;
1466 	struct inet_peer *peer = rt->peer;
1467 	struct in_device *idev = rt->idev;
1468 
1469 	if (peer) {
1470 		rt->peer = NULL;
1471 		inet_putpeer(peer);
1472 	}
1473 
1474 	if (idev) {
1475 		rt->idev = NULL;
1476 		in_dev_put(idev);
1477 	}
1478 }
1479 
1480 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1481 			    int how)
1482 {
1483 	struct rtable *rt = (struct rtable *) dst;
1484 	struct in_device *idev = rt->idev;
1485 	if (dev != &loopback_dev && idev && idev->dev == dev) {
1486 		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1487 		if (loopback_idev) {
1488 			rt->idev = loopback_idev;
1489 			in_dev_put(idev);
1490 		}
1491 	}
1492 }
1493 
1494 static void ipv4_link_failure(struct sk_buff *skb)
1495 {
1496 	struct rtable *rt;
1497 
1498 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1499 
1500 	rt = (struct rtable *) skb->dst;
1501 	if (rt)
1502 		dst_set_expires(&rt->u.dst, 0);
1503 }
1504 
1505 static int ip_rt_bug(struct sk_buff *skb)
1506 {
1507 	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1508 		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1509 		skb->dev ? skb->dev->name : "?");
1510 	kfree_skb(skb);
1511 	return 0;
1512 }
1513 
1514 /*
1515    We do not cache the source address of the outgoing interface,
1516    because it is used only by the IP RR, TS and SRR options,
1517    so it is out of the fast path.
1518 
1519    BTW remember: "addr" is allowed to be unaligned
1520    in IP options!
1521  */
1522 
1523 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1524 {
1525 	u32 src;
1526 	struct fib_result res;
1527 
1528 	if (rt->fl.iif == 0)
1529 		src = rt->rt_src;
1530 	else if (fib_lookup(&rt->fl, &res) == 0) {
1531 		src = FIB_RES_PREFSRC(res);
1532 		fib_res_put(&res);
1533 	} else
1534 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1535 					RT_SCOPE_UNIVERSE);
1536 	memcpy(addr, &src, 4);
1537 }
1538 
1539 #ifdef CONFIG_NET_CLS_ROUTE
1540 static void set_class_tag(struct rtable *rt, u32 tag)
1541 {
1542 	if (!(rt->u.dst.tclassid & 0xFFFF))
1543 		rt->u.dst.tclassid |= tag & 0xFFFF;
1544 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1545 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1546 }
1547 #endif
1548 
1549 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1550 {
1551 	struct fib_info *fi = res->fi;
1552 
1553 	if (fi) {
1554 		if (FIB_RES_GW(*res) &&
1555 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1556 			rt->rt_gateway = FIB_RES_GW(*res);
1557 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1558 		       sizeof(rt->u.dst.metrics));
1559 		if (fi->fib_mtu == 0) {
1560 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1561 			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1562 			    rt->rt_gateway != rt->rt_dst &&
1563 			    rt->u.dst.dev->mtu > 576)
1564 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1565 		}
1566 #ifdef CONFIG_NET_CLS_ROUTE
1567 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1568 #endif
1569 	} else
1570 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1571 
1572 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1573 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1574 	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1575 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1576 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1577 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1578 				       ip_rt_min_advmss);
1579 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1580 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1581 
1582 #ifdef CONFIG_NET_CLS_ROUTE
1583 #ifdef CONFIG_IP_MULTIPLE_TABLES
1584 	set_class_tag(rt, fib_rules_tclass(res));
1585 #endif
1586 	set_class_tag(rt, itag);
1587 #endif
1588         rt->rt_type = res->type;
1589 }
1590 
1591 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1592 				u8 tos, struct net_device *dev, int our)
1593 {
1594 	unsigned hash;
1595 	struct rtable *rth;
1596 	u32 spec_dst;
1597 	struct in_device *in_dev = in_dev_get(dev);
1598 	u32 itag = 0;
1599 
1600 	/* Primary sanity checks. */
1601 
1602 	if (in_dev == NULL)
1603 		return -EINVAL;
1604 
1605 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1606 	    skb->protocol != htons(ETH_P_IP))
1607 		goto e_inval;
1608 
1609 	if (ZERONET(saddr)) {
1610 		if (!LOCAL_MCAST(daddr))
1611 			goto e_inval;
1612 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1613 	} else if (fib_validate_source(saddr, 0, tos, 0,
1614 					dev, &spec_dst, &itag) < 0)
1615 		goto e_inval;
1616 
1617 	rth = dst_alloc(&ipv4_dst_ops);
1618 	if (!rth)
1619 		goto e_nobufs;
1620 
1621 	rth->u.dst.output= ip_rt_bug;
1622 
1623 	atomic_set(&rth->u.dst.__refcnt, 1);
1624 	rth->u.dst.flags= DST_HOST;
1625 	if (in_dev->cnf.no_policy)
1626 		rth->u.dst.flags |= DST_NOPOLICY;
1627 	rth->fl.fl4_dst	= daddr;
1628 	rth->rt_dst	= daddr;
1629 	rth->fl.fl4_tos	= tos;
1630 #ifdef CONFIG_IP_ROUTE_FWMARK
1631 	rth->fl.fl4_fwmark= skb->nfmark;
1632 #endif
1633 	rth->fl.fl4_src	= saddr;
1634 	rth->rt_src	= saddr;
1635 #ifdef CONFIG_NET_CLS_ROUTE
1636 	rth->u.dst.tclassid = itag;
1637 #endif
1638 	rth->rt_iif	=
1639 	rth->fl.iif	= dev->ifindex;
1640 	rth->u.dst.dev	= &loopback_dev;
1641 	dev_hold(rth->u.dst.dev);
1642 	rth->idev	= in_dev_get(rth->u.dst.dev);
1643 	rth->fl.oif	= 0;
1644 	rth->rt_gateway	= daddr;
1645 	rth->rt_spec_dst= spec_dst;
1646 	rth->rt_type	= RTN_MULTICAST;
1647 	rth->rt_flags	= RTCF_MULTICAST;
1648 	if (our) {
1649 		rth->u.dst.input= ip_local_deliver;
1650 		rth->rt_flags |= RTCF_LOCAL;
1651 	}
1652 
1653 #ifdef CONFIG_IP_MROUTE
1654 	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1655 		rth->u.dst.input = ip_mr_input;
1656 #endif
1657 	RT_CACHE_STAT_INC(in_slow_mc);
1658 
1659 	in_dev_put(in_dev);
1660 	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5));
1661 	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1662 
1663 e_nobufs:
1664 	in_dev_put(in_dev);
1665 	return -ENOBUFS;
1666 
1667 e_inval:
1668 	in_dev_put(in_dev);
1669 	return -EINVAL;
1670 }
1671 
1672 
1673 static void ip_handle_martian_source(struct net_device *dev,
1674 				     struct in_device *in_dev,
1675 				     struct sk_buff *skb,
1676 				     u32 daddr,
1677 				     u32 saddr)
1678 {
1679 	RT_CACHE_STAT_INC(in_martian_src);
1680 #ifdef CONFIG_IP_ROUTE_VERBOSE
1681 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1682 		/*
1683 		 *	RFC1812 recommendation: if the source is martian,
1684 		 *	the only hint is the MAC header.
1685 		 */
1686 		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1687 			"%u.%u.%u.%u, on dev %s\n",
1688 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1689 		if (dev->hard_header_len && skb->mac.raw) {
1690 			int i;
1691 			unsigned char *p = skb->mac.raw;
1692 			printk(KERN_WARNING "ll header: ");
1693 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1694 				printk("%02x", *p);
1695 				if (i < (dev->hard_header_len - 1))
1696 					printk(":");
1697 			}
1698 			printk("\n");
1699 		}
1700 	}
1701 #endif
1702 }
1703 
1704 static inline int __mkroute_input(struct sk_buff *skb,
1705 				  struct fib_result* res,
1706 				  struct in_device *in_dev,
1707 				  u32 daddr, u32 saddr, u32 tos,
1708 				  struct rtable **result)
1709 {
1710 
1711 	struct rtable *rth;
1712 	int err;
1713 	struct in_device *out_dev;
1714 	unsigned flags = 0;
1715 	u32 spec_dst, itag;
1716 
1717 	/* get a working reference to the output device */
1718 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1719 	if (out_dev == NULL) {
1720 		if (net_ratelimit())
1721 			printk(KERN_CRIT "Bug in ip_route_input" \
1722 			       "_slow(). Please, report\n");
1723 		return -EINVAL;
1724 	}
1725 
1726 
1727 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1728 				  in_dev->dev, &spec_dst, &itag);
1729 	if (err < 0) {
1730 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1731 					 saddr);
1732 
1733 		err = -EINVAL;
1734 		goto cleanup;
1735 	}
1736 
1737 	if (err)
1738 		flags |= RTCF_DIRECTSRC;
1739 
1740 	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1741 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1742 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1743 		flags |= RTCF_DOREDIRECT;
1744 
1745 	if (skb->protocol != htons(ETH_P_IP)) {
1746 		/* Not IP (i.e. ARP). Do not create a route if it is
1747 		 * invalid for proxy ARP. DNAT routes are always valid.
1748 		 */
1749 		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1750 			err = -EINVAL;
1751 			goto cleanup;
1752 		}
1753 	}
1754 
1755 
1756 	rth = dst_alloc(&ipv4_dst_ops);
1757 	if (!rth) {
1758 		err = -ENOBUFS;
1759 		goto cleanup;
1760 	}
1761 
1762 	atomic_set(&rth->u.dst.__refcnt, 1);
1763 	rth->u.dst.flags= DST_HOST;
1764 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1765 	if (res->fi->fib_nhs > 1)
1766 		rth->u.dst.flags |= DST_BALANCED;
1767 #endif
1768 	if (in_dev->cnf.no_policy)
1769 		rth->u.dst.flags |= DST_NOPOLICY;
1770 	if (in_dev->cnf.no_xfrm)
1771 		rth->u.dst.flags |= DST_NOXFRM;
1772 	rth->fl.fl4_dst	= daddr;
1773 	rth->rt_dst	= daddr;
1774 	rth->fl.fl4_tos	= tos;
1775 #ifdef CONFIG_IP_ROUTE_FWMARK
1776 	rth->fl.fl4_fwmark= skb->nfmark;
1777 #endif
1778 	rth->fl.fl4_src	= saddr;
1779 	rth->rt_src	= saddr;
1780 	rth->rt_gateway	= daddr;
1781 	rth->rt_iif 	=
1782 		rth->fl.iif	= in_dev->dev->ifindex;
1783 	rth->u.dst.dev	= (out_dev)->dev;
1784 	dev_hold(rth->u.dst.dev);
1785 	rth->idev	= in_dev_get(rth->u.dst.dev);
1786 	rth->fl.oif 	= 0;
1787 	rth->rt_spec_dst= spec_dst;
1788 
1789 	rth->u.dst.input = ip_forward;
1790 	rth->u.dst.output = ip_output;
1791 
1792 	rt_set_nexthop(rth, res, itag);
1793 
1794 	rth->rt_flags = flags;
1795 
1796 	*result = rth;
1797 	err = 0;
1798  cleanup:
1799 	/* release the working reference to the output device */
1800 	in_dev_put(out_dev);
1801 	return err;
1802 }
1803 
1804 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1805 				       struct fib_result* res,
1806 				       const struct flowi *fl,
1807 				       struct in_device *in_dev,
1808 				       u32 daddr, u32 saddr, u32 tos)
1809 {
1810 	struct rtable* rth = NULL;
1811 	int err;
1812 	unsigned hash;
1813 
1814 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1815 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1816 		fib_select_multipath(fl, res);
1817 #endif
1818 
1819 	/* create a routing cache entry */
1820 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1821 	if (err)
1822 		return err;
1823 
1824 	/* put it into the cache */
1825 	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
1826 	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1827 }
1828 
1829 static inline int ip_mkroute_input(struct sk_buff *skb,
1830 				   struct fib_result* res,
1831 				   const struct flowi *fl,
1832 				   struct in_device *in_dev,
1833 				   u32 daddr, u32 saddr, u32 tos)
1834 {
1835 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1836 	struct rtable* rth = NULL, *rtres;
1837 	unsigned char hop, hopcount;
1838 	int err = -EINVAL;
1839 	unsigned int hash;
1840 
1841 	if (res->fi)
1842 		hopcount = res->fi->fib_nhs;
1843 	else
1844 		hopcount = 1;
1845 
1846 	/* distinguish between multipath and singlepath */
1847 	if (hopcount < 2)
1848 		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1849 					    saddr, tos);
1850 
1851 	/* add all alternatives to the routing cache */
1852 	for (hop = 0; hop < hopcount; hop++) {
1853 		res->nh_sel = hop;
1854 
1855 		/* put reference to previous result */
1856 		if (hop)
1857 			ip_rt_put(rtres);
1858 
1859 		/* create a routing cache entry */
1860 		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1861 				      &rth);
1862 		if (err)
1863 			return err;
1864 
1865 		/* put it into the cache */
1866 		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
1867 		err = rt_intern_hash(hash, rth, &rtres);
1868 		if (err)
1869 			return err;
1870 
1871 		/* forward hop information to multipath impl. */
1872 		multipath_set_nhinfo(rth,
1873 				     FIB_RES_NETWORK(*res),
1874 				     FIB_RES_NETMASK(*res),
1875 				     res->prefixlen,
1876 				     &FIB_RES_NH(*res));
1877 	}
1878 	skb->dst = &rtres->u.dst;
1879 	return err;
1880 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1881 	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1882 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1883 }
1884 
1885 
1886 /*
1887  *	NOTE. We drop all packets that have a local source
1888  *	address, because every properly looped back packet
1889  *	must already have the correct destination attached by the output routine.
1890  *
1891  *	Such an approach solves two big problems:
1892  *	1. Non-simplex devices are handled properly.
1893  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1894  */
1895 
1896 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1897 			       u8 tos, struct net_device *dev)
1898 {
1899 	struct fib_result res;
1900 	struct in_device *in_dev = in_dev_get(dev);
1901 	struct flowi fl = { .nl_u = { .ip4_u =
1902 				      { .daddr = daddr,
1903 					.saddr = saddr,
1904 					.tos = tos,
1905 					.scope = RT_SCOPE_UNIVERSE,
1906 #ifdef CONFIG_IP_ROUTE_FWMARK
1907 					.fwmark = skb->nfmark
1908 #endif
1909 				      } },
1910 			    .iif = dev->ifindex };
1911 	unsigned	flags = 0;
1912 	u32		itag = 0;
1913 	struct rtable * rth;
1914 	unsigned	hash;
1915 	u32		spec_dst;
1916 	int		err = -EINVAL;
1917 	int		free_res = 0;
1918 
1919 	/* IP on this device is disabled. */
1920 
1921 	if (!in_dev)
1922 		goto out;
1923 
1924 	/* Check for the most unusual martian addresses, which cannot be
1925 	   detected by fib_lookup.
1926 	 */
1927 
1928 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1929 		goto martian_source;
1930 
1931 	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1932 		goto brd_input;
1933 
1934 	/* Accept zero source addresses only for the limited broadcast destination;
1935 	 * it is not clear whether this should be fixed. Waiting for complaints :-)
1936 	 */
1937 	if (ZERONET(saddr))
1938 		goto martian_source;
1939 
1940 	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1941 		goto martian_destination;
1942 
1943 	/*
1944 	 *	Now we are ready to route the packet.
1945 	 */
1946 	if ((err = fib_lookup(&fl, &res)) != 0) {
1947 		if (!IN_DEV_FORWARD(in_dev))
1948 			goto e_hostunreach;
1949 		goto no_route;
1950 	}
1951 	free_res = 1;
1952 
1953 	RT_CACHE_STAT_INC(in_slow_tot);
1954 
1955 	if (res.type == RTN_BROADCAST)
1956 		goto brd_input;
1957 
1958 	if (res.type == RTN_LOCAL) {
1959 		int result;
1960 		result = fib_validate_source(saddr, daddr, tos,
1961 					     loopback_dev.ifindex,
1962 					     dev, &spec_dst, &itag);
1963 		if (result < 0)
1964 			goto martian_source;
1965 		if (result)
1966 			flags |= RTCF_DIRECTSRC;
1967 		spec_dst = daddr;
1968 		goto local_input;
1969 	}
1970 
1971 	if (!IN_DEV_FORWARD(in_dev))
1972 		goto e_hostunreach;
1973 	if (res.type != RTN_UNICAST)
1974 		goto martian_destination;
1975 
1976 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1977 	if (err == -ENOBUFS)
1978 		goto e_nobufs;
1979 	if (err == -EINVAL)
1980 		goto e_inval;
1981 
1982 done:
1983 	in_dev_put(in_dev);
1984 	if (free_res)
1985 		fib_res_put(&res);
1986 out:	return err;
1987 
1988 brd_input:
1989 	if (skb->protocol != htons(ETH_P_IP))
1990 		goto e_inval;
1991 
1992 	if (ZERONET(saddr))
1993 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1994 	else {
1995 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1996 					  &itag);
1997 		if (err < 0)
1998 			goto martian_source;
1999 		if (err)
2000 			flags |= RTCF_DIRECTSRC;
2001 	}
2002 	flags |= RTCF_BROADCAST;
2003 	res.type = RTN_BROADCAST;
2004 	RT_CACHE_STAT_INC(in_brd);
2005 
2006 local_input:
2007 	rth = dst_alloc(&ipv4_dst_ops);
2008 	if (!rth)
2009 		goto e_nobufs;
2010 
2011 	rth->u.dst.output= ip_rt_bug;
2012 
2013 	atomic_set(&rth->u.dst.__refcnt, 1);
2014 	rth->u.dst.flags= DST_HOST;
2015 	if (in_dev->cnf.no_policy)
2016 		rth->u.dst.flags |= DST_NOPOLICY;
2017 	rth->fl.fl4_dst	= daddr;
2018 	rth->rt_dst	= daddr;
2019 	rth->fl.fl4_tos	= tos;
2020 #ifdef CONFIG_IP_ROUTE_FWMARK
2021 	rth->fl.fl4_fwmark= skb->nfmark;
2022 #endif
2023 	rth->fl.fl4_src	= saddr;
2024 	rth->rt_src	= saddr;
2025 #ifdef CONFIG_NET_CLS_ROUTE
2026 	rth->u.dst.tclassid = itag;
2027 #endif
2028 	rth->rt_iif	=
2029 	rth->fl.iif	= dev->ifindex;
2030 	rth->u.dst.dev	= &loopback_dev;
2031 	dev_hold(rth->u.dst.dev);
2032 	rth->idev	= in_dev_get(rth->u.dst.dev);
2033 	rth->rt_gateway	= daddr;
2034 	rth->rt_spec_dst= spec_dst;
2035 	rth->u.dst.input= ip_local_deliver;
2036 	rth->rt_flags 	= flags|RTCF_LOCAL;
2037 	if (res.type == RTN_UNREACHABLE) {
2038 		rth->u.dst.input= ip_error;
2039 		rth->u.dst.error= -err;
2040 		rth->rt_flags 	&= ~RTCF_LOCAL;
2041 	}
2042 	rth->rt_type	= res.type;
2043 	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5));
2044 	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2045 	goto done;
2046 
2047 no_route:
2048 	RT_CACHE_STAT_INC(in_no_route);
2049 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2050 	res.type = RTN_UNREACHABLE;
2051 	goto local_input;
2052 
2053 	/*
2054 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2055 	 */
2056 martian_destination:
2057 	RT_CACHE_STAT_INC(in_martian_dst);
2058 #ifdef CONFIG_IP_ROUTE_VERBOSE
2059 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2060 		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2061 			"%u.%u.%u.%u, dev %s\n",
2062 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2063 #endif
2064 
2065 e_hostunreach:
2066 	err = -EHOSTUNREACH;
2067 	goto done;
2068 
2069 e_inval:
2070 	err = -EINVAL;
2071 	goto done;
2072 
2073 e_nobufs:
2074 	err = -ENOBUFS;
2075 	goto done;
2076 
2077 martian_source:
2078 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2079 	goto e_inval;
2080 }
2081 
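/* Input routing entry point, called for every received IPv4 packet
 * (e.g. from ip_rcv_finish()).  First look for a matching entry in the
 * route cache under rcu_read_lock(); the hash key is derived from daddr,
 * saddr and the incoming ifindex.  On a miss, multicast destinations are
 * handled here directly and everything else falls through to
 * ip_route_input_slow().
 */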
2082 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2083 		   u8 tos, struct net_device *dev)
2084 {
2085 	struct rtable * rth;
2086 	unsigned	hash;
2087 	int iif = dev->ifindex;
2088 
2089 	tos &= IPTOS_RT_MASK;
2090 	hash = rt_hash_code(daddr, saddr ^ (iif << 5));
2091 
2092 	rcu_read_lock();
2093 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2094 	     rth = rcu_dereference(rth->u.rt_next)) {
2095 		if (rth->fl.fl4_dst == daddr &&
2096 		    rth->fl.fl4_src == saddr &&
2097 		    rth->fl.iif == iif &&
2098 		    rth->fl.oif == 0 &&
2099 #ifdef CONFIG_IP_ROUTE_FWMARK
2100 		    rth->fl.fl4_fwmark == skb->nfmark &&
2101 #endif
2102 		    rth->fl.fl4_tos == tos) {
2103 			rth->u.dst.lastuse = jiffies;
2104 			dst_hold(&rth->u.dst);
2105 			rth->u.dst.__use++;
2106 			RT_CACHE_STAT_INC(in_hit);
2107 			rcu_read_unlock();
2108 			skb->dst = (struct dst_entry*)rth;
2109 			return 0;
2110 		}
2111 		RT_CACHE_STAT_INC(in_hlist_search);
2112 	}
2113 	rcu_read_unlock();
2114 
2115 	/* Multicast recognition logic was moved from the route cache to here.
2116 	   The problem was that too many Ethernet cards have broken/missing
2117 	   hardware multicast filters :-( As a result, a host on a multicast
2118 	   network acquires a lot of useless route cache entries, e.g. from
2119 	   SDR messages arriving from all over the world. Now we try to get rid
2120 	   of them. Provided the software IP multicast filter is organized
2121 	   reasonably (at least hashed), this does not cause a slowdown
2122 	   compared with route cache reject entries.
2123 	   Note that multicast routers are not affected, because a
2124 	   route cache entry is created for them eventually.
2125 	 */
2126 	if (MULTICAST(daddr)) {
2127 		struct in_device *in_dev;
2128 
2129 		rcu_read_lock();
2130 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2131 			int our = ip_check_mc(in_dev, daddr, saddr,
2132 				skb->nh.iph->protocol);
2133 			if (our
2134 #ifdef CONFIG_IP_MROUTE
2135 			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2136 #endif
2137 			    ) {
2138 				rcu_read_unlock();
2139 				return ip_route_input_mc(skb, daddr, saddr,
2140 							 tos, dev, our);
2141 			}
2142 		}
2143 		rcu_read_unlock();
2144 		return -EINVAL;
2145 	}
2146 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2147 }
2148 
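/* Allocate and fill in a route cache entry for the output route described
 * by *res and *fl.  The new entry is returned through *result; the caller
 * is responsible for interning it into the cache.  Returns 0 on success,
 * -EINVAL for invalid source/destination combinations or -ENOBUFS if the
 * dst allocation fails.
 */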
2149 static inline int __mkroute_output(struct rtable **result,
2150 				   struct fib_result* res,
2151 				   const struct flowi *fl,
2152 				   const struct flowi *oldflp,
2153 				   struct net_device *dev_out,
2154 				   unsigned flags)
2155 {
2156 	struct rtable *rth;
2157 	struct in_device *in_dev;
2158 	u32 tos = RT_FL_TOS(oldflp);
2159 	int err = 0;
2160 
2161 	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2162 		return -EINVAL;
2163 
2164 	if (fl->fl4_dst == 0xFFFFFFFF)
2165 		res->type = RTN_BROADCAST;
2166 	else if (MULTICAST(fl->fl4_dst))
2167 		res->type = RTN_MULTICAST;
2168 	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2169 		return -EINVAL;
2170 
2171 	if (dev_out->flags & IFF_LOOPBACK)
2172 		flags |= RTCF_LOCAL;
2173 
2174 	/* get work reference to inet device */
2175 	in_dev = in_dev_get(dev_out);
2176 	if (!in_dev)
2177 		return -EINVAL;
2178 
2179 	if (res->type == RTN_BROADCAST) {
2180 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2181 		if (res->fi) {
2182 			fib_info_put(res->fi);
2183 			res->fi = NULL;
2184 		}
2185 	} else if (res->type == RTN_MULTICAST) {
2186 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2187 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2188 				 oldflp->proto))
2189 			flags &= ~RTCF_LOCAL;
2190 		/* If a multicast route does not exist, use the
2191 		   default one, but do not route via a gateway in this case.
2192 		   Yes, it is a hack.
2193 		 */
2194 		if (res->fi && res->prefixlen < 4) {
2195 			fib_info_put(res->fi);
2196 			res->fi = NULL;
2197 		}
2198 	}
2199 
2200 
2201 	rth = dst_alloc(&ipv4_dst_ops);
2202 	if (!rth) {
2203 		err = -ENOBUFS;
2204 		goto cleanup;
2205 	}
2206 
2207 	atomic_set(&rth->u.dst.__refcnt, 1);
2208 	rth->u.dst.flags= DST_HOST;
2209 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2210 	if (res->fi) {
2211 		rth->rt_multipath_alg = res->fi->fib_mp_alg;
2212 		if (res->fi->fib_nhs > 1)
2213 			rth->u.dst.flags |= DST_BALANCED;
2214 	}
2215 #endif
2216 	if (in_dev->cnf.no_xfrm)
2217 		rth->u.dst.flags |= DST_NOXFRM;
2218 	if (in_dev->cnf.no_policy)
2219 		rth->u.dst.flags |= DST_NOPOLICY;
2220 
2221 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2222 	rth->fl.fl4_tos	= tos;
2223 	rth->fl.fl4_src	= oldflp->fl4_src;
2224 	rth->fl.oif	= oldflp->oif;
2225 #ifdef CONFIG_IP_ROUTE_FWMARK
2226 	rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2227 #endif
2228 	rth->rt_dst	= fl->fl4_dst;
2229 	rth->rt_src	= fl->fl4_src;
2230 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2231 	/* get references to the devices that are to be held by the routing
2232 	   cache entry */
2233 	rth->u.dst.dev	= dev_out;
2234 	dev_hold(dev_out);
2235 	rth->idev	= in_dev_get(dev_out);
2236 	rth->rt_gateway = fl->fl4_dst;
2237 	rth->rt_spec_dst= fl->fl4_src;
2238 
2239 	rth->u.dst.output=ip_output;
2240 
2241 	RT_CACHE_STAT_INC(out_slow_tot);
2242 
2243 	if (flags & RTCF_LOCAL) {
2244 		rth->u.dst.input = ip_local_deliver;
2245 		rth->rt_spec_dst = fl->fl4_dst;
2246 	}
2247 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2248 		rth->rt_spec_dst = fl->fl4_src;
2249 		if (flags & RTCF_LOCAL &&
2250 		    !(dev_out->flags & IFF_LOOPBACK)) {
2251 			rth->u.dst.output = ip_mc_output;
2252 			RT_CACHE_STAT_INC(out_slow_mc);
2253 		}
2254 #ifdef CONFIG_IP_MROUTE
2255 		if (res->type == RTN_MULTICAST) {
2256 			if (IN_DEV_MFORWARD(in_dev) &&
2257 			    !LOCAL_MCAST(oldflp->fl4_dst)) {
2258 				rth->u.dst.input = ip_mr_input;
2259 				rth->u.dst.output = ip_mc_output;
2260 			}
2261 		}
2262 #endif
2263 	}
2264 
2265 	rt_set_nexthop(rth, res, 0);
2266 
2267 	rth->rt_flags = flags;
2268 
2269 	*result = rth;
2270  cleanup:
2271 	/* release work reference to inet device */
2272 	in_dev_put(in_dev);
2273 
2274 	return err;
2275 }
2276 
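/* Create a single output route cache entry via __mkroute_output() and
 * intern it into the cache; on success *rp points at the cached entry.
 */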
2277 static inline int ip_mkroute_output_def(struct rtable **rp,
2278 					struct fib_result* res,
2279 					const struct flowi *fl,
2280 					const struct flowi *oldflp,
2281 					struct net_device *dev_out,
2282 					unsigned flags)
2283 {
2284 	struct rtable *rth = NULL;
2285 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2286 	unsigned hash;
2287 	if (err == 0) {
2288 		hash = rt_hash_code(oldflp->fl4_dst,
2289 				    oldflp->fl4_src ^ (oldflp->oif << 5));
2290 		err = rt_intern_hash(hash, rth, rp);
2291 	}
2292 
2293 	return err;
2294 }
2295 
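/* As ip_mkroute_input(): with CONFIG_IP_ROUTE_MULTIPATH_CACHED every
 * nexthop of a multipath route gets its own cache entry (a temporary
 * reference to each nexthop device is held while its entry is built);
 * without that option this is just ip_mkroute_output_def().
 */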
2296 static inline int ip_mkroute_output(struct rtable** rp,
2297 				    struct fib_result* res,
2298 				    const struct flowi *fl,
2299 				    const struct flowi *oldflp,
2300 				    struct net_device *dev_out,
2301 				    unsigned flags)
2302 {
2303 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2304 	unsigned char hop;
2305 	unsigned hash;
2306 	int err = -EINVAL;
2307 	struct rtable *rth = NULL;
2308 
2309 	if (res->fi && res->fi->fib_nhs > 1) {
2310 		unsigned char hopcount = res->fi->fib_nhs;
2311 
2312 		for (hop = 0; hop < hopcount; hop++) {
2313 			struct net_device *dev2nexthop;
2314 
2315 			res->nh_sel = hop;
2316 
2317 			/* hold a work reference to the output device */
2318 			dev2nexthop = FIB_RES_DEV(*res);
2319 			dev_hold(dev2nexthop);
2320 
2321 			/* put reference to previous result */
2322 			if (hop)
2323 				ip_rt_put(*rp);
2324 
2325 			err = __mkroute_output(&rth, res, fl, oldflp,
2326 					       dev2nexthop, flags);
2327 
2328 			if (err != 0)
2329 				goto cleanup;
2330 
2331 			hash = rt_hash_code(oldflp->fl4_dst,
2332 					    oldflp->fl4_src ^
2333 					    (oldflp->oif << 5));
2334 			err = rt_intern_hash(hash, rth, rp);
2335 
2336 			/* forward hop information to multipath impl. */
2337 			multipath_set_nhinfo(rth,
2338 					     FIB_RES_NETWORK(*res),
2339 					     FIB_RES_NETMASK(*res),
2340 					     res->prefixlen,
2341 					     &FIB_RES_NH(*res));
2342 		cleanup:
2343 			/* release work reference to output device */
2344 			dev_put(dev2nexthop);
2345 
2346 			if (err != 0)
2347 				return err;
2348 		}
2349 		return err;
2350 	} else {
2351 		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2352 					     flags);
2353 	}
2354 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2355 	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2356 #endif
2357 }
2358 
2359 /*
2360  * Major route resolver routine.
2361  */
2362 
2363 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2364 {
2365 	u32 tos	= RT_FL_TOS(oldflp);
2366 	struct flowi fl = { .nl_u = { .ip4_u =
2367 				      { .daddr = oldflp->fl4_dst,
2368 					.saddr = oldflp->fl4_src,
2369 					.tos = tos & IPTOS_RT_MASK,
2370 					.scope = ((tos & RTO_ONLINK) ?
2371 						  RT_SCOPE_LINK :
2372 						  RT_SCOPE_UNIVERSE),
2373 #ifdef CONFIG_IP_ROUTE_FWMARK
2374 					.fwmark = oldflp->fl4_fwmark
2375 #endif
2376 				      } },
2377 			    .iif = loopback_dev.ifindex,
2378 			    .oif = oldflp->oif };
2379 	struct fib_result res;
2380 	unsigned flags = 0;
2381 	struct net_device *dev_out = NULL;
2382 	int free_res = 0;
2383 	int err;
2384 
2385 
2386 	res.fi		= NULL;
2387 #ifdef CONFIG_IP_MULTIPLE_TABLES
2388 	res.r		= NULL;
2389 #endif
2390 
2391 	if (oldflp->fl4_src) {
2392 		err = -EINVAL;
2393 		if (MULTICAST(oldflp->fl4_src) ||
2394 		    BADCLASS(oldflp->fl4_src) ||
2395 		    ZERONET(oldflp->fl4_src))
2396 			goto out;
2397 
2398 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2399 		dev_out = ip_dev_find(oldflp->fl4_src);
2400 		if (dev_out == NULL)
2401 			goto out;
2402 
2403 		/* I removed the check for oif == dev_out->oif here.
2404 		   It was wrong for two reasons:
2405 		   1. ip_dev_find(saddr) can return the wrong iface, if saddr
2406 		      is assigned to multiple interfaces.
2407 		   2. Moreover, we are allowed to send packets with a saddr
2408 		      belonging to another iface. --ANK
2409 		 */
2410 
2411 		if (oldflp->oif == 0
2412 		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2413 			/* Special hack: the user can direct multicasts
2414 			   and limited broadcast via the desired interface
2415 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2416 			   This hack is not just for fun, it allows
2417 			   vic, vat and friends to work.
2418 			   They bind a socket to loopback, set the ttl to zero
2419 			   and expect that it will work.
2420 			   From the viewpoint of the routing cache they are broken,
2421 			   because we are not allowed to build a multicast path
2422 			   with a loopback source address (the routing cache
2423 			   cannot know that the ttl is zero, so the packet
2424 			   would not leave this host and the route would be valid).
2425 			   Luckily, this hack is a good workaround.
2426 			 */
2427 
2428 			fl.oif = dev_out->ifindex;
2429 			goto make_route;
2430 		}
2431 		if (dev_out)
2432 			dev_put(dev_out);
2433 		dev_out = NULL;
2434 	}
2435 
2436 
2437 	if (oldflp->oif) {
2438 		dev_out = dev_get_by_index(oldflp->oif);
2439 		err = -ENODEV;
2440 		if (dev_out == NULL)
2441 			goto out;
2442 
2443 		/* RACE: Check return value of inet_select_addr instead. */
2444 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2445 			dev_put(dev_out);
2446 			goto out;	/* Wrong error code */
2447 		}
2448 
2449 		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2450 			if (!fl.fl4_src)
2451 				fl.fl4_src = inet_select_addr(dev_out, 0,
2452 							      RT_SCOPE_LINK);
2453 			goto make_route;
2454 		}
2455 		if (!fl.fl4_src) {
2456 			if (MULTICAST(oldflp->fl4_dst))
2457 				fl.fl4_src = inet_select_addr(dev_out, 0,
2458 							      fl.fl4_scope);
2459 			else if (!oldflp->fl4_dst)
2460 				fl.fl4_src = inet_select_addr(dev_out, 0,
2461 							      RT_SCOPE_HOST);
2462 		}
2463 	}
2464 
2465 	if (!fl.fl4_dst) {
2466 		fl.fl4_dst = fl.fl4_src;
2467 		if (!fl.fl4_dst)
2468 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2469 		if (dev_out)
2470 			dev_put(dev_out);
2471 		dev_out = &loopback_dev;
2472 		dev_hold(dev_out);
2473 		fl.oif = loopback_dev.ifindex;
2474 		res.type = RTN_LOCAL;
2475 		flags |= RTCF_LOCAL;
2476 		goto make_route;
2477 	}
2478 
2479 	if (fib_lookup(&fl, &res)) {
2480 		res.fi = NULL;
2481 		if (oldflp->oif) {
2482 			/* Apparently, the routing tables are wrong. Assume
2483 			   that the destination is on-link.
2484 
2485 			   WHY? DW.
2486 			   Because we are allowed to send to an iface
2487 			   even if it has NO routes and NO assigned
2488 			   addresses. When oif is specified, the routing
2489 			   tables are looked up with only one purpose:
2490 			   to catch whether the destination is gatewayed rather
2491 			   than direct. Moreover, if MSG_DONTROUTE is set,
2492 			   we send the packet, ignoring both the routing tables
2493 			   and the ifaddr state. --ANK
2494 
2495 
2496 			   We could do this even if oif is unknown
2497 			   (as IPv6 likely does), but we do not.
2498 			 */
2499 
2500 			if (fl.fl4_src == 0)
2501 				fl.fl4_src = inet_select_addr(dev_out, 0,
2502 							      RT_SCOPE_LINK);
2503 			res.type = RTN_UNICAST;
2504 			goto make_route;
2505 		}
2506 		if (dev_out)
2507 			dev_put(dev_out);
2508 		err = -ENETUNREACH;
2509 		goto out;
2510 	}
2511 	free_res = 1;
2512 
2513 	if (res.type == RTN_LOCAL) {
2514 		if (!fl.fl4_src)
2515 			fl.fl4_src = fl.fl4_dst;
2516 		if (dev_out)
2517 			dev_put(dev_out);
2518 		dev_out = &loopback_dev;
2519 		dev_hold(dev_out);
2520 		fl.oif = dev_out->ifindex;
2521 		if (res.fi)
2522 			fib_info_put(res.fi);
2523 		res.fi = NULL;
2524 		flags |= RTCF_LOCAL;
2525 		goto make_route;
2526 	}
2527 
2528 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2529 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2530 		fib_select_multipath(&fl, &res);
2531 	else
2532 #endif
2533 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2534 		fib_select_default(&fl, &res);
2535 
2536 	if (!fl.fl4_src)
2537 		fl.fl4_src = FIB_RES_PREFSRC(res);
2538 
2539 	if (dev_out)
2540 		dev_put(dev_out);
2541 	dev_out = FIB_RES_DEV(res);
2542 	dev_hold(dev_out);
2543 	fl.oif = dev_out->ifindex;
2544 
2545 
2546 make_route:
2547 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2548 
2549 
2550 	if (free_res)
2551 		fib_res_put(&res);
2552 	if (dev_out)
2553 		dev_put(dev_out);
2554 out:	return err;
2555 }
2556 
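/* Output route lookup: search the routing cache first (under
 * rcu_read_lock_bh()) and fall back to ip_route_output_slow() on a miss.
 * TOS bits outside IPTOS_RT_MASK | RTO_ONLINK are ignored when comparing
 * flows, and on a hit multipath_select_route() may pick one of several
 * cached nexthop routes.
 */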
2557 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2558 {
2559 	unsigned hash;
2560 	struct rtable *rth;
2561 
2562 	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5));
2563 
2564 	rcu_read_lock_bh();
2565 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2566 		rth = rcu_dereference(rth->u.rt_next)) {
2567 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2568 		    rth->fl.fl4_src == flp->fl4_src &&
2569 		    rth->fl.iif == 0 &&
2570 		    rth->fl.oif == flp->oif &&
2571 #ifdef CONFIG_IP_ROUTE_FWMARK
2572 		    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2573 #endif
2574 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2575 			    (IPTOS_RT_MASK | RTO_ONLINK))) {
2576 
2577 			/* check for multipath routes and choose one if
2578 			 * necessary
2579 			 */
2580 			if (multipath_select_route(flp, rth, rp)) {
2581 				dst_hold(&(*rp)->u.dst);
2582 				RT_CACHE_STAT_INC(out_hit);
2583 				rcu_read_unlock_bh();
2584 				return 0;
2585 			}
2586 
2587 			rth->u.dst.lastuse = jiffies;
2588 			dst_hold(&rth->u.dst);
2589 			rth->u.dst.__use++;
2590 			RT_CACHE_STAT_INC(out_hit);
2591 			rcu_read_unlock_bh();
2592 			*rp = rth;
2593 			return 0;
2594 		}
2595 		RT_CACHE_STAT_INC(out_hlist_search);
2596 	}
2597 	rcu_read_unlock_bh();
2598 
2599 	return ip_route_output_slow(rp, flp);
2600 }
2601 
2602 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2603 
2604 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2605 {
2606 	int err;
2607 
2608 	if ((err = __ip_route_output_key(rp, flp)) != 0)
2609 		return err;
2610 
2611 	if (flp->proto) {
2612 		if (!flp->fl4_src)
2613 			flp->fl4_src = (*rp)->rt_src;
2614 		if (!flp->fl4_dst)
2615 			flp->fl4_dst = (*rp)->rt_dst;
2616 		return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2617 	}
2618 
2619 	return 0;
2620 }
2621 
2622 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2623 
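/* Convenience wrapper around ip_route_output_flow() for callers without a
 * socket.  Rough usage sketch (illustrative only, not taken from a real
 * caller; dst_ip and tos are caller-supplied values):
 *
 *	struct flowi fl = { .oif = 0,
 *			    .nl_u = { .ip4_u = { .daddr = dst_ip,
 *						 .saddr = 0,
 *						 .tos   = RT_TOS(tos) } } };
 *	struct rtable *rt;
 *	int err = ip_route_output_key(&rt, &fl);
 *
 *	if (err)
 *		return err;
 *	... use rt->u.dst, rt->rt_gateway etc. ...
 *	ip_rt_put(rt);
 */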
2624 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2625 {
2626 	return ip_route_output_flow(rp, flp, NULL, 0);
2627 }
2628 
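/* Fill an RTM_NEWROUTE netlink message describing the cached route attached
 * to skb->dst.  Used both for RTM_GETROUTE replies and for route cache
 * dumps.  Returns skb->len on success and -1 if the message did not fit.
 */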
2629 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2630 			int nowait, unsigned int flags)
2631 {
2632 	struct rtable *rt = (struct rtable*)skb->dst;
2633 	struct rtmsg *r;
2634 	struct nlmsghdr  *nlh;
2635 	unsigned char	 *b = skb->tail;
2636 	struct rta_cacheinfo ci;
2637 #ifdef CONFIG_IP_MROUTE
2638 	struct rtattr *eptr;
2639 #endif
2640 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2641 	r = NLMSG_DATA(nlh);
2642 	r->rtm_family	 = AF_INET;
2643 	r->rtm_dst_len	= 32;
2644 	r->rtm_src_len	= 0;
2645 	r->rtm_tos	= rt->fl.fl4_tos;
2646 	r->rtm_table	= RT_TABLE_MAIN;
2647 	r->rtm_type	= rt->rt_type;
2648 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2649 	r->rtm_protocol = RTPROT_UNSPEC;
2650 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2651 	if (rt->rt_flags & RTCF_NOTIFY)
2652 		r->rtm_flags |= RTM_F_NOTIFY;
2653 	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2654 	if (rt->fl.fl4_src) {
2655 		r->rtm_src_len = 32;
2656 		RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2657 	}
2658 	if (rt->u.dst.dev)
2659 		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2660 #ifdef CONFIG_NET_CLS_ROUTE
2661 	if (rt->u.dst.tclassid)
2662 		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2663 #endif
2664 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2665 	if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2666 		__u32 alg = rt->rt_multipath_alg;
2667 
2668 		RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2669 	}
2670 #endif
2671 	if (rt->fl.iif)
2672 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2673 	else if (rt->rt_src != rt->fl.fl4_src)
2674 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2675 	if (rt->rt_dst != rt->rt_gateway)
2676 		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2677 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2678 		goto rtattr_failure;
2679 	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2680 	ci.rta_used	= rt->u.dst.__use;
2681 	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
2682 	if (rt->u.dst.expires)
2683 		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2684 	else
2685 		ci.rta_expires = 0;
2686 	ci.rta_error	= rt->u.dst.error;
2687 	ci.rta_id	= ci.rta_ts = ci.rta_tsage = 0;
2688 	if (rt->peer) {
2689 		ci.rta_id = rt->peer->ip_id_count;
2690 		if (rt->peer->tcp_ts_stamp) {
2691 			ci.rta_ts = rt->peer->tcp_ts;
2692 			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2693 		}
2694 	}
2695 #ifdef CONFIG_IP_MROUTE
2696 	eptr = (struct rtattr*)skb->tail;
2697 #endif
2698 	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2699 	if (rt->fl.iif) {
2700 #ifdef CONFIG_IP_MROUTE
2701 		u32 dst = rt->rt_dst;
2702 
2703 		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2704 		    ipv4_devconf.mc_forwarding) {
2705 			int err = ipmr_get_route(skb, r, nowait);
2706 			if (err <= 0) {
2707 				if (!nowait) {
2708 					if (err == 0)
2709 						return 0;
2710 					goto nlmsg_failure;
2711 				} else {
2712 					if (err == -EMSGSIZE)
2713 						goto nlmsg_failure;
2714 					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2715 				}
2716 			}
2717 		} else
2718 #endif
2719 			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2720 	}
2721 
2722 	nlh->nlmsg_len = skb->tail - b;
2723 	return skb->len;
2724 
2725 nlmsg_failure:
2726 rtattr_failure:
2727 	skb_trim(skb, b - skb->data);
2728 	return -1;
2729 }
2730 
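/* Handle an RTM_GETROUTE request (e.g. "ip route get").  A dummy skb is
 * built and routed exactly as a real packet would be: via ip_route_input()
 * when an input interface is given, via ip_route_output_key() otherwise.
 * The resulting cache entry is then reported back to the requester with
 * rt_fill_info()/netlink_unicast().
 */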
2731 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2732 {
2733 	struct rtattr **rta = arg;
2734 	struct rtmsg *rtm = NLMSG_DATA(nlh);
2735 	struct rtable *rt = NULL;
2736 	u32 dst = 0;
2737 	u32 src = 0;
2738 	int iif = 0;
2739 	int err = -ENOBUFS;
2740 	struct sk_buff *skb;
2741 
2742 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2743 	if (!skb)
2744 		goto out;
2745 
2746 	/* Reserve room for dummy headers; this skb can pass
2747 	   through a good chunk of the routing engine.
2748 	 */
2749 	skb->mac.raw = skb->nh.raw = skb->data;
2750 
2751 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2752 	skb->nh.iph->protocol = IPPROTO_ICMP;
2753 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2754 
2755 	if (rta[RTA_SRC - 1])
2756 		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2757 	if (rta[RTA_DST - 1])
2758 		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2759 	if (rta[RTA_IIF - 1])
2760 		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2761 
2762 	if (iif) {
2763 		struct net_device *dev = __dev_get_by_index(iif);
2764 		err = -ENODEV;
2765 		if (!dev)
2766 			goto out_free;
2767 		skb->protocol	= htons(ETH_P_IP);
2768 		skb->dev	= dev;
2769 		local_bh_disable();
2770 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2771 		local_bh_enable();
2772 		rt = (struct rtable*)skb->dst;
2773 		if (!err && rt->u.dst.error)
2774 			err = -rt->u.dst.error;
2775 	} else {
2776 		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2777 							 .saddr = src,
2778 							 .tos = rtm->rtm_tos } } };
2779 		int oif = 0;
2780 		if (rta[RTA_OIF - 1])
2781 			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2782 		fl.oif = oif;
2783 		err = ip_route_output_key(&rt, &fl);
2784 	}
2785 	if (err)
2786 		goto out_free;
2787 
2788 	skb->dst = &rt->u.dst;
2789 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2790 		rt->rt_flags |= RTCF_NOTIFY;
2791 
2792 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2793 
2794 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2795 				RTM_NEWROUTE, 0, 0);
2796 	if (!err)
2797 		goto out_free;
2798 	if (err < 0) {
2799 		err = -EMSGSIZE;
2800 		goto out_free;
2801 	}
2802 
2803 	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2804 	if (err > 0)
2805 		err = 0;
2806 out:	return err;
2807 
2808 out_free:
2809 	kfree_skb(skb);
2810 	goto out;
2811 }
2812 
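/* Dump the whole route cache over netlink (RTM_GETROUTE dump requests,
 * e.g. "ip route show cache").  cb->args[0] and cb->args[1] record the
 * hash bucket and chain position so that an interrupted dump can resume.
 */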
2813 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2814 {
2815 	struct rtable *rt;
2816 	int h, s_h;
2817 	int idx, s_idx;
2818 
2819 	s_h = cb->args[0];
2820 	s_idx = idx = cb->args[1];
2821 	for (h = 0; h <= rt_hash_mask; h++) {
2822 		if (h < s_h) continue;
2823 		if (h > s_h)
2824 			s_idx = 0;
2825 		rcu_read_lock_bh();
2826 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2827 		     rt = rcu_dereference(rt->u.rt_next), idx++) {
2828 			if (idx < s_idx)
2829 				continue;
2830 			skb->dst = dst_clone(&rt->u.dst);
2831 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2832 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2833 					 1, NLM_F_MULTI) <= 0) {
2834 				dst_release(xchg(&skb->dst, NULL));
2835 				rcu_read_unlock_bh();
2836 				goto done;
2837 			}
2838 			dst_release(xchg(&skb->dst, NULL));
2839 		}
2840 		rcu_read_unlock_bh();
2841 	}
2842 
2843 done:
2844 	cb->args[0] = h;
2845 	cb->args[1] = idx;
2846 	return skb->len;
2847 }
2848 
2849 void ip_rt_multicast_event(struct in_device *in_dev)
2850 {
2851 	rt_cache_flush(0);
2852 }
2853 
2854 #ifdef CONFIG_SYSCTL
2855 static int flush_delay;
2856 
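/* Writing an integer to /proc/sys/net/ipv4/route/flush flushes the routing
 * cache; the value written is passed to rt_cache_flush() as the flush
 * delay.  Illustrative shell usage:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * requests an immediate flush.
 */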
2857 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2858 					struct file *filp, void __user *buffer,
2859 					size_t *lenp, loff_t *ppos)
2860 {
2861 	if (write) {
2862 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2863 		rt_cache_flush(flush_delay);
2864 		return 0;
2865 	}
2866 
2867 	return -EINVAL;
2868 }
2869 
2870 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2871 						int __user *name,
2872 						int nlen,
2873 						void __user *oldval,
2874 						size_t __user *oldlenp,
2875 						void __user *newval,
2876 						size_t newlen,
2877 						void **context)
2878 {
2879 	int delay;
2880 	if (newlen != sizeof(int))
2881 		return -EINVAL;
2882 	if (get_user(delay, (int __user *)newval))
2883 		return -EFAULT;
2884 	rt_cache_flush(delay);
2885 	return 0;
2886 }
2887 
2888 ctl_table ipv4_route_table[] = {
2889 	{
2890 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2891 		.procname	= "flush",
2892 		.data		= &flush_delay,
2893 		.maxlen		= sizeof(int),
2894 		.mode		= 0200,
2895 		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2896 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2897 	},
2898 	{
2899 		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
2900 		.procname	= "min_delay",
2901 		.data		= &ip_rt_min_delay,
2902 		.maxlen		= sizeof(int),
2903 		.mode		= 0644,
2904 		.proc_handler	= &proc_dointvec_jiffies,
2905 		.strategy	= &sysctl_jiffies,
2906 	},
2907 	{
2908 		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
2909 		.procname	= "max_delay",
2910 		.data		= &ip_rt_max_delay,
2911 		.maxlen		= sizeof(int),
2912 		.mode		= 0644,
2913 		.proc_handler	= &proc_dointvec_jiffies,
2914 		.strategy	= &sysctl_jiffies,
2915 	},
2916 	{
2917 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2918 		.procname	= "gc_thresh",
2919 		.data		= &ipv4_dst_ops.gc_thresh,
2920 		.maxlen		= sizeof(int),
2921 		.mode		= 0644,
2922 		.proc_handler	= &proc_dointvec,
2923 	},
2924 	{
2925 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2926 		.procname	= "max_size",
2927 		.data		= &ip_rt_max_size,
2928 		.maxlen		= sizeof(int),
2929 		.mode		= 0644,
2930 		.proc_handler	= &proc_dointvec,
2931 	},
2932 	{
2933 		/*  Deprecated. Use gc_min_interval_ms */
2934 
2935 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2936 		.procname	= "gc_min_interval",
2937 		.data		= &ip_rt_gc_min_interval,
2938 		.maxlen		= sizeof(int),
2939 		.mode		= 0644,
2940 		.proc_handler	= &proc_dointvec_jiffies,
2941 		.strategy	= &sysctl_jiffies,
2942 	},
2943 	{
2944 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2945 		.procname	= "gc_min_interval_ms",
2946 		.data		= &ip_rt_gc_min_interval,
2947 		.maxlen		= sizeof(int),
2948 		.mode		= 0644,
2949 		.proc_handler	= &proc_dointvec_ms_jiffies,
2950 		.strategy	= &sysctl_ms_jiffies,
2951 	},
2952 	{
2953 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2954 		.procname	= "gc_timeout",
2955 		.data		= &ip_rt_gc_timeout,
2956 		.maxlen		= sizeof(int),
2957 		.mode		= 0644,
2958 		.proc_handler	= &proc_dointvec_jiffies,
2959 		.strategy	= &sysctl_jiffies,
2960 	},
2961 	{
2962 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2963 		.procname	= "gc_interval",
2964 		.data		= &ip_rt_gc_interval,
2965 		.maxlen		= sizeof(int),
2966 		.mode		= 0644,
2967 		.proc_handler	= &proc_dointvec_jiffies,
2968 		.strategy	= &sysctl_jiffies,
2969 	},
2970 	{
2971 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2972 		.procname	= "redirect_load",
2973 		.data		= &ip_rt_redirect_load,
2974 		.maxlen		= sizeof(int),
2975 		.mode		= 0644,
2976 		.proc_handler	= &proc_dointvec,
2977 	},
2978 	{
2979 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2980 		.procname	= "redirect_number",
2981 		.data		= &ip_rt_redirect_number,
2982 		.maxlen		= sizeof(int),
2983 		.mode		= 0644,
2984 		.proc_handler	= &proc_dointvec,
2985 	},
2986 	{
2987 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2988 		.procname	= "redirect_silence",
2989 		.data		= &ip_rt_redirect_silence,
2990 		.maxlen		= sizeof(int),
2991 		.mode		= 0644,
2992 		.proc_handler	= &proc_dointvec,
2993 	},
2994 	{
2995 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
2996 		.procname	= "error_cost",
2997 		.data		= &ip_rt_error_cost,
2998 		.maxlen		= sizeof(int),
2999 		.mode		= 0644,
3000 		.proc_handler	= &proc_dointvec,
3001 	},
3002 	{
3003 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
3004 		.procname	= "error_burst",
3005 		.data		= &ip_rt_error_burst,
3006 		.maxlen		= sizeof(int),
3007 		.mode		= 0644,
3008 		.proc_handler	= &proc_dointvec,
3009 	},
3010 	{
3011 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
3012 		.procname	= "gc_elasticity",
3013 		.data		= &ip_rt_gc_elasticity,
3014 		.maxlen		= sizeof(int),
3015 		.mode		= 0644,
3016 		.proc_handler	= &proc_dointvec,
3017 	},
3018 	{
3019 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
3020 		.procname	= "mtu_expires",
3021 		.data		= &ip_rt_mtu_expires,
3022 		.maxlen		= sizeof(int),
3023 		.mode		= 0644,
3024 		.proc_handler	= &proc_dointvec_jiffies,
3025 		.strategy	= &sysctl_jiffies,
3026 	},
3027 	{
3028 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
3029 		.procname	= "min_pmtu",
3030 		.data		= &ip_rt_min_pmtu,
3031 		.maxlen		= sizeof(int),
3032 		.mode		= 0644,
3033 		.proc_handler	= &proc_dointvec,
3034 	},
3035 	{
3036 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
3037 		.procname	= "min_adv_mss",
3038 		.data		= &ip_rt_min_advmss,
3039 		.maxlen		= sizeof(int),
3040 		.mode		= 0644,
3041 		.proc_handler	= &proc_dointvec,
3042 	},
3043 	{
3044 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3045 		.procname	= "secret_interval",
3046 		.data		= &ip_rt_secret_interval,
3047 		.maxlen		= sizeof(int),
3048 		.mode		= 0644,
3049 		.proc_handler	= &proc_dointvec_jiffies,
3050 		.strategy	= &sysctl_jiffies,
3051 	},
3052 	{ .ctl_name = 0 }
3053 };
3054 #endif
3055 
3056 #ifdef CONFIG_NET_CLS_ROUTE
3057 struct ip_rt_acct *ip_rt_acct;
3058 
3059 /* This code sucks.  But you should have seen it before! --RR */
3060 
3061 /* IP route accounting ptr for this logical cpu number. */
3062 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3063 
3064 #ifdef CONFIG_PROC_FS
3065 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3066 			   int length, int *eof, void *data)
3067 {
3068 	unsigned int i;
3069 
3070 	if ((offset & 3) || (length & 3))
3071 		return -EIO;
3072 
3073 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
3074 		*eof = 1;
3075 		return 0;
3076 	}
3077 
3078 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3079 		length = sizeof(struct ip_rt_acct) * 256 - offset;
3080 		*eof = 1;
3081 	}
3082 
3083 	offset /= sizeof(u32);
3084 
3085 	if (length > 0) {
3086 		u32 *src;
3087 		u32 *dst = (u32 *) buffer;
3088 
3089 		/* Start from a zeroed buffer; the loop below visits every
3090 		 * possible CPU (including 0), adding each in one int at a time. */
3091 		*start = buffer;
3092 		memset(dst, 0, length);
3093 
3094 		for_each_possible_cpu(i) {
3095 			unsigned int j;
3096 
3097 			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3098 
3099 			for (j = 0; j < length/4; j++)
3100 				dst[j] += src[j];
3101 		}
3102 	}
3103 	return length;
3104 }
3105 #endif /* CONFIG_PROC_FS */
3106 #endif /* CONFIG_NET_CLS_ROUTE */
3107 
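/* "rhash_entries=N" on the kernel command line overrides the automatically
 * chosen number of route cache hash buckets passed to
 * alloc_large_system_hash() in ip_rt_init() below.
 */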
3108 static __initdata unsigned long rhash_entries;
3109 static int __init set_rhash_entries(char *str)
3110 {
3111 	if (!str)
3112 		return 0;
3113 	rhash_entries = simple_strtoul(str, &str, 0);
3114 	return 1;
3115 }
3116 __setup("rhash_entries=", set_rhash_entries);
3117 
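/* One-time initialisation of the IPv4 routing subsystem: allocate the
 * per-CPU accounting area and the dst slab cache, size and allocate the
 * route cache hash table, initialise devinet and the FIB, start the
 * flush/GC/secret-rebuild timers, register the /proc entries and, when
 * configured, initialise xfrm.
 */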
3118 int __init ip_rt_init(void)
3119 {
3120 	int rc = 0;
3121 
3122 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3123 			     (jiffies ^ (jiffies >> 7)));
3124 
3125 #ifdef CONFIG_NET_CLS_ROUTE
3126 	{
3127 	int order;
3128 	for (order = 0;
3129 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3130 		/* NOTHING */;
3131 	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3132 	if (!ip_rt_acct)
3133 		panic("IP: failed to allocate ip_rt_acct\n");
3134 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
3135 	}
3136 #endif
3137 
3138 	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3139 						     sizeof(struct rtable),
3140 						     0, SLAB_HWCACHE_ALIGN,
3141 						     NULL, NULL);
3142 
3143 	if (!ipv4_dst_ops.kmem_cachep)
3144 		panic("IP: failed to allocate ip_dst_cache\n");
3145 
3146 	rt_hash_table = (struct rt_hash_bucket *)
3147 		alloc_large_system_hash("IP route cache",
3148 					sizeof(struct rt_hash_bucket),
3149 					rhash_entries,
3150 					(num_physpages >= 128 * 1024) ?
3151 					15 : 17,
3152 					HASH_HIGHMEM,
3153 					&rt_hash_log,
3154 					&rt_hash_mask,
3155 					0);
3156 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3157 	rt_hash_lock_init();
3158 
3159 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3160 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3161 
3162 	devinet_init();
3163 	ip_fib_init();
3164 
3165 	init_timer(&rt_flush_timer);
3166 	rt_flush_timer.function = rt_run_flush;
3167 	init_timer(&rt_periodic_timer);
3168 	rt_periodic_timer.function = rt_check_expire;
3169 	init_timer(&rt_secret_timer);
3170 	rt_secret_timer.function = rt_secret_rebuild;
3171 
3172 	/* All the timers started at system startup tend
3173 	   to synchronize. Perturb them a bit.
3174 	 */
3175 	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3176 					ip_rt_gc_interval;
3177 	add_timer(&rt_periodic_timer);
3178 
3179 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3180 		ip_rt_secret_interval;
3181 	add_timer(&rt_secret_timer);
3182 
3183 #ifdef CONFIG_PROC_FS
3184 	{
3185 	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3186 	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3187 	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3188 			    		     proc_net_stat))) {
3189 		return -ENOMEM;
3190 	}
3191 	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3192 	}
3193 #ifdef CONFIG_NET_CLS_ROUTE
3194 	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3195 #endif
3196 #endif
3197 #ifdef CONFIG_XFRM
3198 	xfrm_init();
3199 	xfrm4_init();
3200 #endif
3201 	return rc;
3202 }
3203 
3204 EXPORT_SYMBOL(__ip_select_ident);
3205 EXPORT_SYMBOL(ip_route_input);
3206 EXPORT_SYMBOL(ip_route_output_key);
3207