xref: /linux/net/ipv4/route.c (revision 273b281fa22c293963ee3e6eec418f5dda2dbc83)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD;
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 
111 #define RT_FL_TOS(oldflp) \
112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113 
114 #define IP_MAX_MTU	0xFFF0
115 
116 #define RT_GC_TIMEOUT (300*HZ)
117 
118 static int ip_rt_max_size;
119 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
120 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
121 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
122 static int ip_rt_redirect_number __read_mostly	= 9;
123 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly	= HZ;
126 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
127 static int ip_rt_gc_elasticity __read_mostly	= 8;
128 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly	= 256;
131 static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
132 static int rt_chain_length_max __read_mostly	= 20;
133 
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136 
137 /*
138  *	Interface to generic destination cache.
139  */
140 
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void		 ipv4_dst_destroy(struct dst_entry *dst);
143 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
144 					 struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void		 ipv4_link_failure(struct sk_buff *skb);
147 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149 static void rt_emergency_hash_rebuild(struct net *net);
150 
151 
152 static struct dst_ops ipv4_dst_ops = {
153 	.family =		AF_INET,
154 	.protocol =		cpu_to_be16(ETH_P_IP),
155 	.gc =			rt_garbage_collect,
156 	.check =		ipv4_dst_check,
157 	.destroy =		ipv4_dst_destroy,
158 	.ifdown =		ipv4_dst_ifdown,
159 	.negative_advice =	ipv4_negative_advice,
160 	.link_failure =		ipv4_link_failure,
161 	.update_pmtu =		ip_rt_update_pmtu,
162 	.local_out =		__ip_local_out,
163 	.entries =		ATOMIC_INIT(0),
164 };
165 
166 #define ECN_OR_COST(class)	TC_PRIO_##class
167 
168 const __u8 ip_tos2prio[16] = {
169 	TC_PRIO_BESTEFFORT,
170 	ECN_OR_COST(FILLER),
171 	TC_PRIO_BESTEFFORT,
172 	ECN_OR_COST(BESTEFFORT),
173 	TC_PRIO_BULK,
174 	ECN_OR_COST(BULK),
175 	TC_PRIO_BULK,
176 	ECN_OR_COST(BULK),
177 	TC_PRIO_INTERACTIVE,
178 	ECN_OR_COST(INTERACTIVE),
179 	TC_PRIO_INTERACTIVE,
180 	ECN_OR_COST(INTERACTIVE),
181 	TC_PRIO_INTERACTIVE_BULK,
182 	ECN_OR_COST(INTERACTIVE_BULK),
183 	TC_PRIO_INTERACTIVE_BULK,
184 	ECN_OR_COST(INTERACTIVE_BULK)
185 };
186 
187 
188 /*
189  * Route cache.
190  */
191 
192 /* The locking scheme is rather straightforward:
193  *
194  * 1) Read-Copy Update protects the buckets of the central route hash.
195  * 2) Only writers remove entries, and they hold the lock
196  *    as they look at rtable reference counts.
197  * 3) Only readers acquire references to rtable entries,
198  *    they do so with atomic increments and with the
199  *    lock held.
200  */
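/*
 * For illustration, a minimal sketch of both sides of this scheme, using
 * only helpers defined later in this file (the real lookup and insert
 * paths below add key, genid and netns checks on top of this):
 *
 *	// reader: lockless traversal of bucket i under RCU
 *	rcu_read_lock_bh();
 *	for (r = rcu_dereference(rt_hash_table[i].chain); r;
 *	     r = rcu_dereference(r->u.dst.rt_next))
 *		;	// inspect r; take a reference with dst_hold(&r->u.dst)
 *	rcu_read_unlock_bh();
 *
 *	// writer: unlink under the per-bucket lock, free via RCU
 *	spin_lock_bh(rt_hash_lock_addr(i));
 *	*rthp = rth->u.dst.rt_next;
 *	rt_free(rth);			// call_rcu_bh(..., dst_rcu_free)
 *	spin_unlock_bh(rt_hash_lock_addr(i));
 */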
201 
202 struct rt_hash_bucket {
203 	struct rtable	*chain;
204 };
205 
206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
207 	defined(CONFIG_PROVE_LOCKING)
208 /*
209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210  * The size of this table is a power of two and depends on the number of CPUs.
211  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
212  */
213 #ifdef CONFIG_LOCKDEP
214 # define RT_HASH_LOCK_SZ	256
215 #else
216 # if NR_CPUS >= 32
217 #  define RT_HASH_LOCK_SZ	4096
218 # elif NR_CPUS >= 16
219 #  define RT_HASH_LOCK_SZ	2048
220 # elif NR_CPUS >= 8
221 #  define RT_HASH_LOCK_SZ	1024
222 # elif NR_CPUS >= 4
223 #  define RT_HASH_LOCK_SZ	512
224 # else
225 #  define RT_HASH_LOCK_SZ	256
226 # endif
227 #endif
228 
229 static spinlock_t	*rt_hash_locks;
230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
231 
232 static __init void rt_hash_lock_init(void)
233 {
234 	int i;
235 
236 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
237 			GFP_KERNEL);
238 	if (!rt_hash_locks)
239 		panic("IP: failed to allocate rt_hash_locks\n");
240 
241 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
242 		spin_lock_init(&rt_hash_locks[i]);
243 }
244 #else
245 # define rt_hash_lock_addr(slot) NULL
246 
247 static inline void rt_hash_lock_init(void)
248 {
249 }
250 #endif
251 
252 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
253 static unsigned			rt_hash_mask __read_mostly;
254 static unsigned int		rt_hash_log  __read_mostly;
255 
256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257 #define RT_CACHE_STAT_INC(field) \
258 	(__raw_get_cpu_var(rt_cache_stat).field++)
259 
260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 		int genid)
262 {
263 	return jhash_3words((__force u32)(__be32)(daddr),
264 			    (__force u32)(__be32)(saddr),
265 			    idx, genid)
266 		& rt_hash_mask;
267 }
268 
269 static inline int rt_genid(struct net *net)
270 {
271 	return atomic_read(&net->ipv4.rt_genid);
272 }
273 
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276 	struct seq_net_private p;
277 	int bucket;
278 	int genid;
279 };
280 
281 static struct rtable *rt_cache_get_first(struct seq_file *seq)
282 {
283 	struct rt_cache_iter_state *st = seq->private;
284 	struct rtable *r = NULL;
285 
286 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
287 		if (!rt_hash_table[st->bucket].chain)
288 			continue;
289 		rcu_read_lock_bh();
290 		r = rcu_dereference(rt_hash_table[st->bucket].chain);
291 		while (r) {
292 			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
293 			    r->rt_genid == st->genid)
294 				return r;
295 			r = rcu_dereference(r->u.dst.rt_next);
296 		}
297 		rcu_read_unlock_bh();
298 	}
299 	return r;
300 }
301 
302 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
303 					  struct rtable *r)
304 {
305 	struct rt_cache_iter_state *st = seq->private;
306 
307 	r = r->u.dst.rt_next;
308 	while (!r) {
309 		rcu_read_unlock_bh();
310 		do {
311 			if (--st->bucket < 0)
312 				return NULL;
313 		} while (!rt_hash_table[st->bucket].chain);
314 		rcu_read_lock_bh();
315 		r = rt_hash_table[st->bucket].chain;
316 	}
317 	return rcu_dereference(r);
318 }
319 
320 static struct rtable *rt_cache_get_next(struct seq_file *seq,
321 					struct rtable *r)
322 {
323 	struct rt_cache_iter_state *st = seq->private;
324 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
325 		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
326 			continue;
327 		if (r->rt_genid == st->genid)
328 			break;
329 	}
330 	return r;
331 }
332 
333 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
334 {
335 	struct rtable *r = rt_cache_get_first(seq);
336 
337 	if (r)
338 		while (pos && (r = rt_cache_get_next(seq, r)))
339 			--pos;
340 	return pos ? NULL : r;
341 }
342 
343 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
344 {
345 	struct rt_cache_iter_state *st = seq->private;
346 	if (*pos)
347 		return rt_cache_get_idx(seq, *pos - 1);
348 	st->genid = rt_genid(seq_file_net(seq));
349 	return SEQ_START_TOKEN;
350 }
351 
352 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
353 {
354 	struct rtable *r;
355 
356 	if (v == SEQ_START_TOKEN)
357 		r = rt_cache_get_first(seq);
358 	else
359 		r = rt_cache_get_next(seq, v);
360 	++*pos;
361 	return r;
362 }
363 
364 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
365 {
366 	if (v && v != SEQ_START_TOKEN)
367 		rcu_read_unlock_bh();
368 }
369 
370 static int rt_cache_seq_show(struct seq_file *seq, void *v)
371 {
372 	if (v == SEQ_START_TOKEN)
373 		seq_printf(seq, "%-127s\n",
374 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
375 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
376 			   "HHUptod\tSpecDst");
377 	else {
378 		struct rtable *r = v;
379 		int len;
380 
381 		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
382 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383 			r->u.dst.dev ? r->u.dst.dev->name : "*",
384 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
385 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
386 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
387 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389 			dst_metric(&r->u.dst, RTAX_WINDOW),
390 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
392 			r->fl.fl4_tos,
393 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395 				       dev_queue_xmit) : 0,
396 			r->rt_spec_dst, &len);
397 
398 		seq_printf(seq, "%*s\n", 127 - len, "");
399 	}
400 	return 0;
401 }
402 
403 static const struct seq_operations rt_cache_seq_ops = {
404 	.start  = rt_cache_seq_start,
405 	.next   = rt_cache_seq_next,
406 	.stop   = rt_cache_seq_stop,
407 	.show   = rt_cache_seq_show,
408 };
409 
410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
411 {
412 	return seq_open_net(inode, file, &rt_cache_seq_ops,
413 			sizeof(struct rt_cache_iter_state));
414 }
415 
416 static const struct file_operations rt_cache_seq_fops = {
417 	.owner	 = THIS_MODULE,
418 	.open	 = rt_cache_seq_open,
419 	.read	 = seq_read,
420 	.llseek	 = seq_lseek,
421 	.release = seq_release_net,
422 };
423 
424 
425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
426 {
427 	int cpu;
428 
429 	if (*pos == 0)
430 		return SEQ_START_TOKEN;
431 
432 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
433 		if (!cpu_possible(cpu))
434 			continue;
435 		*pos = cpu+1;
436 		return &per_cpu(rt_cache_stat, cpu);
437 	}
438 	return NULL;
439 }
440 
441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
442 {
443 	int cpu;
444 
445 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
446 		if (!cpu_possible(cpu))
447 			continue;
448 		*pos = cpu+1;
449 		return &per_cpu(rt_cache_stat, cpu);
450 	}
451 	return NULL;
452 
453 }
454 
455 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
456 {
457 
458 }
459 
460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
461 {
462 	struct rt_cache_stat *st = v;
463 
464 	if (v == SEQ_START_TOKEN) {
465 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
466 		return 0;
467 	}
468 
469 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
470 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 		   atomic_read(&ipv4_dst_ops.entries),
472 		   st->in_hit,
473 		   st->in_slow_tot,
474 		   st->in_slow_mc,
475 		   st->in_no_route,
476 		   st->in_brd,
477 		   st->in_martian_dst,
478 		   st->in_martian_src,
479 
480 		   st->out_hit,
481 		   st->out_slow_tot,
482 		   st->out_slow_mc,
483 
484 		   st->gc_total,
485 		   st->gc_ignored,
486 		   st->gc_goal_miss,
487 		   st->gc_dst_overflow,
488 		   st->in_hlist_search,
489 		   st->out_hlist_search
490 		);
491 	return 0;
492 }
493 
494 static const struct seq_operations rt_cpu_seq_ops = {
495 	.start  = rt_cpu_seq_start,
496 	.next   = rt_cpu_seq_next,
497 	.stop   = rt_cpu_seq_stop,
498 	.show   = rt_cpu_seq_show,
499 };
500 
501 
502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
503 {
504 	return seq_open(file, &rt_cpu_seq_ops);
505 }
506 
507 static const struct file_operations rt_cpu_seq_fops = {
508 	.owner	 = THIS_MODULE,
509 	.open	 = rt_cpu_seq_open,
510 	.read	 = seq_read,
511 	.llseek	 = seq_lseek,
512 	.release = seq_release,
513 };
514 
515 #ifdef CONFIG_NET_CLS_ROUTE
516 static int rt_acct_proc_show(struct seq_file *m, void *v)
517 {
518 	struct ip_rt_acct *dst, *src;
519 	unsigned int i, j;
520 
521 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
522 	if (!dst)
523 		return -ENOMEM;
524 
525 	for_each_possible_cpu(i) {
526 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
527 		for (j = 0; j < 256; j++) {
528 			dst[j].o_bytes   += src[j].o_bytes;
529 			dst[j].o_packets += src[j].o_packets;
530 			dst[j].i_bytes   += src[j].i_bytes;
531 			dst[j].i_packets += src[j].i_packets;
532 		}
533 	}
534 
535 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
536 	kfree(dst);
537 	return 0;
538 }
539 
540 static int rt_acct_proc_open(struct inode *inode, struct file *file)
541 {
542 	return single_open(file, rt_acct_proc_show, NULL);
543 }
544 
545 static const struct file_operations rt_acct_proc_fops = {
546 	.owner		= THIS_MODULE,
547 	.open		= rt_acct_proc_open,
548 	.read		= seq_read,
549 	.llseek		= seq_lseek,
550 	.release	= single_release,
551 };
552 #endif
553 
554 static int __net_init ip_rt_do_proc_init(struct net *net)
555 {
556 	struct proc_dir_entry *pde;
557 
558 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
559 			&rt_cache_seq_fops);
560 	if (!pde)
561 		goto err1;
562 
563 	pde = proc_create("rt_cache", S_IRUGO,
564 			  net->proc_net_stat, &rt_cpu_seq_fops);
565 	if (!pde)
566 		goto err2;
567 
568 #ifdef CONFIG_NET_CLS_ROUTE
569 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
570 	if (!pde)
571 		goto err3;
572 #endif
573 	return 0;
574 
575 #ifdef CONFIG_NET_CLS_ROUTE
576 err3:
577 	remove_proc_entry("rt_cache", net->proc_net_stat);
578 #endif
579 err2:
580 	remove_proc_entry("rt_cache", net->proc_net);
581 err1:
582 	return -ENOMEM;
583 }
584 
585 static void __net_exit ip_rt_do_proc_exit(struct net *net)
586 {
587 	remove_proc_entry("rt_cache", net->proc_net_stat);
588 	remove_proc_entry("rt_cache", net->proc_net);
589 	remove_proc_entry("rt_acct", net->proc_net);
590 }
591 
592 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
593 	.init = ip_rt_do_proc_init,
594 	.exit = ip_rt_do_proc_exit,
595 };
596 
597 static int __init ip_rt_proc_init(void)
598 {
599 	return register_pernet_subsys(&ip_rt_proc_ops);
600 }
601 
602 #else
603 static inline int ip_rt_proc_init(void)
604 {
605 	return 0;
606 }
607 #endif /* CONFIG_PROC_FS */
608 
609 static inline void rt_free(struct rtable *rt)
610 {
611 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
612 }
613 
614 static inline void rt_drop(struct rtable *rt)
615 {
616 	ip_rt_put(rt);
617 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
618 }
619 
620 static inline int rt_fast_clean(struct rtable *rth)
621 {
622 	/* Kill broadcast/multicast entries very aggressively if they
623 	   collide in the hash table with more useful entries */
624 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
625 		rth->fl.iif && rth->u.dst.rt_next;
626 }
627 
628 static inline int rt_valuable(struct rtable *rth)
629 {
630 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
631 		rth->u.dst.expires;
632 }
633 
634 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
635 {
636 	unsigned long age;
637 	int ret = 0;
638 
639 	if (atomic_read(&rth->u.dst.__refcnt))
640 		goto out;
641 
642 	ret = 1;
643 	if (rth->u.dst.expires &&
644 	    time_after_eq(jiffies, rth->u.dst.expires))
645 		goto out;
646 
647 	age = jiffies - rth->u.dst.lastuse;
648 	ret = 0;
649 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
650 	    (age <= tmo2 && rt_valuable(rth)))
651 		goto out;
652 	ret = 1;
653 out:	return ret;
654 }
655 
656 /* Bits of score are:
657  * 31: very valuable
658  * 30: not quite useless
659  * 29..0: usage counter
660  */
661 static inline u32 rt_score(struct rtable *rt)
662 {
663 	u32 score = jiffies - rt->u.dst.lastuse;
664 
665 	score = ~score & ~(3<<30);
666 
667 	if (rt_valuable(rt))
668 		score |= (1<<31);
669 
670 	if (!rt->fl.iif ||
671 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
672 		score |= (1<<30);
673 
674 	return score;
675 }
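/*
 * For example (illustrative): an unreferenced output route last used five
 * seconds ago gets bit 30 set on top of its age-based bits, while an input
 * broadcast route of the same age leaves bits 31..30 clear.
 * rt_intern_hash() below uses the lowest-scoring entry of an overlong
 * chain as its eviction candidate, so the broadcast entry is reaped first,
 * and among entries of the same class the oldest one loses.
 */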
676 
677 static inline bool rt_caching(const struct net *net)
678 {
679 	return net->ipv4.current_rt_cache_rebuild_count <=
680 		net->ipv4.sysctl_rt_cache_rebuild_count;
681 }
682 
683 static inline bool compare_hash_inputs(const struct flowi *fl1,
684 					const struct flowi *fl2)
685 {
686 	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
687 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
688 		(fl1->iif ^ fl2->iif)) == 0);
689 }
690 
691 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
692 {
693 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
694 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
695 		(fl1->mark ^ fl2->mark) |
696 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
697 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
698 		(fl1->oif ^ fl2->oif) |
699 		(fl1->iif ^ fl2->iif)) == 0;
700 }
701 
702 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
703 {
704 	return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
705 }
706 
707 static inline int rt_is_expired(struct rtable *rth)
708 {
709 	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
710 }
711 
712 /*
713  * Perform a full scan of the hash table and free all entries.
714  * Can be called from a softirq or from process context.
715  * In the latter case, we want to be rescheduled if necessary.
716  */
717 static void rt_do_flush(int process_context)
718 {
719 	unsigned int i;
720 	struct rtable *rth, *next;
721 	struct rtable * tail;
722 
723 	for (i = 0; i <= rt_hash_mask; i++) {
724 		if (process_context && need_resched())
725 			cond_resched();
726 		rth = rt_hash_table[i].chain;
727 		if (!rth)
728 			continue;
729 
730 		spin_lock_bh(rt_hash_lock_addr(i));
731 #ifdef CONFIG_NET_NS
732 		{
733 		struct rtable ** prev, * p;
734 
735 		rth = rt_hash_table[i].chain;
736 
737 		/* defer releasing the head of the list until after spin_unlock */
738 		for (tail = rth; tail; tail = tail->u.dst.rt_next)
739 			if (!rt_is_expired(tail))
740 				break;
741 		if (rth != tail)
742 			rt_hash_table[i].chain = tail;
743 
744 		/* call rt_free on entries after the tail requiring flush */
745 		prev = &rt_hash_table[i].chain;
746 		for (p = *prev; p; p = next) {
747 			next = p->u.dst.rt_next;
748 			if (!rt_is_expired(p)) {
749 				prev = &p->u.dst.rt_next;
750 			} else {
751 				*prev = next;
752 				rt_free(p);
753 			}
754 		}
755 		}
756 #else
757 		rth = rt_hash_table[i].chain;
758 		rt_hash_table[i].chain = NULL;
759 		tail = NULL;
760 #endif
761 		spin_unlock_bh(rt_hash_lock_addr(i));
762 
763 		for (; rth != tail; rth = next) {
764 			next = rth->u.dst.rt_next;
765 			rt_free(rth);
766 		}
767 	}
768 }
769 
770 /*
771  * While freeing expired entries, we compute average chain length
772  * and standard deviation, using fixed-point arithmetic.
773  * This gives us an estimate of rt_chain_length_max:
774  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
775  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
776  */
777 
778 #define FRACT_BITS 3
779 #define ONE (1UL << FRACT_BITS)
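/*
 * For example, with FRACT_BITS = 3 (ONE = 8): a bucket on which two
 * entries are counted contributes length = 16.  If a scan ends up with
 * an average of 20 (2.5 entries) and a standard deviation of 8 (1 entry),
 * (avg + 4*sd) >> FRACT_BITS = 52 >> 3 = 6, so rt_chain_length_max
 * becomes max(ip_rt_gc_elasticity, 6) = 8 with the default elasticity.
 */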
780 
781 static void rt_check_expire(void)
782 {
783 	static unsigned int rover;
784 	unsigned int i = rover, goal;
785 	struct rtable *rth, *aux, **rthp;
786 	unsigned long samples = 0;
787 	unsigned long sum = 0, sum2 = 0;
788 	unsigned long delta;
789 	u64 mult;
790 
791 	delta = jiffies - expires_ljiffies;
792 	expires_ljiffies = jiffies;
793 	mult = ((u64)delta) << rt_hash_log;
794 	if (ip_rt_gc_timeout > 1)
795 		do_div(mult, ip_rt_gc_timeout);
796 	goal = (unsigned int)mult;
797 	if (goal > rt_hash_mask)
798 		goal = rt_hash_mask + 1;
799 	for (; goal > 0; goal--) {
800 		unsigned long tmo = ip_rt_gc_timeout;
801 		unsigned long length;
802 
803 		i = (i + 1) & rt_hash_mask;
804 		rthp = &rt_hash_table[i].chain;
805 
806 		if (need_resched())
807 			cond_resched();
808 
809 		samples++;
810 
811 		if (*rthp == NULL)
812 			continue;
813 		length = 0;
814 		spin_lock_bh(rt_hash_lock_addr(i));
815 		while ((rth = *rthp) != NULL) {
816 			prefetch(rth->u.dst.rt_next);
817 			if (rt_is_expired(rth)) {
818 				*rthp = rth->u.dst.rt_next;
819 				rt_free(rth);
820 				continue;
821 			}
822 			if (rth->u.dst.expires) {
823 				/* Entry is expired even if it is in use */
824 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
825 nofree:
826 					tmo >>= 1;
827 					rthp = &rth->u.dst.rt_next;
828 					/*
829 					 * We only count entries on
830 					 * a chain with equal hash inputs once,
831 					 * so that entries for different QoS
832 					 * levels and other non-hash-input
833 					 * attributes don't unfairly skew
834 					 * the length computation.
835 					 */
836 					for (aux = rt_hash_table[i].chain;;) {
837 						if (aux == rth) {
838 							length += ONE;
839 							break;
840 						}
841 						if (compare_hash_inputs(&aux->fl, &rth->fl))
842 							break;
843 						aux = aux->u.dst.rt_next;
844 					}
845 					continue;
846 				}
847 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
848 				goto nofree;
849 
850 			/* Cleanup aged off entries. */
851 			*rthp = rth->u.dst.rt_next;
852 			rt_free(rth);
853 		}
854 		spin_unlock_bh(rt_hash_lock_addr(i));
855 		sum += length;
856 		sum2 += length*length;
857 	}
858 	if (samples) {
859 		unsigned long avg = sum / samples;
860 		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
861 		rt_chain_length_max = max_t(unsigned long,
862 					ip_rt_gc_elasticity,
863 					(avg + 4*sd) >> FRACT_BITS);
864 	}
865 	rover = i;
866 }
867 
868 /*
869  * rt_worker_func() is run in process context.
870  * We call rt_check_expire() to scan part of the hash table.
871  */
872 static void rt_worker_func(struct work_struct *work)
873 {
874 	rt_check_expire();
875 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
876 }
877 
878 /*
879  * Perturbation of rt_genid by a small quantity [1..256].
880  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
881  * many times (2^24) without reusing a recent rt_genid.
882  * Jenkins hash is strong enough that little changes of rt_genid are OK.
883  */
884 static void rt_cache_invalidate(struct net *net)
885 {
886 	unsigned char shuffle;
887 
888 	get_random_bytes(&shuffle, sizeof(shuffle));
889 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
890 }
891 
892 /*
893  * delay < 0  : invalidate cache (fast : entries will be deleted later)
894  * delay >= 0 : invalidate & flush cache (can be long)
895  */
896 void rt_cache_flush(struct net *net, int delay)
897 {
898 	rt_cache_invalidate(net);
899 	if (delay >= 0)
900 		rt_do_flush(!in_softirq());
901 }
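/*
 * E.g. rt_cache_flush(net, -1) only bumps rt_genid, leaving stale entries
 * to be reaped lazily by lookups and gc, while rt_cache_flush(net, 0)
 * also walks the whole hash table synchronously via rt_do_flush().
 */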
902 
903 /* Flush previously invalidated entries from the cache */
904 void rt_cache_flush_batch(void)
905 {
906 	rt_do_flush(!in_softirq());
907 }
908 
909 /*
910  * We change rt_genid and let gc do the cleanup
911  */
912 static void rt_secret_rebuild(unsigned long __net)
913 {
914 	struct net *net = (struct net *)__net;
915 	rt_cache_invalidate(net);
916 	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
917 }
918 
919 static void rt_secret_rebuild_oneshot(struct net *net)
920 {
921 	del_timer_sync(&net->ipv4.rt_secret_timer);
922 	rt_cache_invalidate(net);
923 	if (ip_rt_secret_interval) {
924 		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
925 		add_timer(&net->ipv4.rt_secret_timer);
926 	}
927 }
928 
929 static void rt_emergency_hash_rebuild(struct net *net)
930 {
931 	if (net_ratelimit()) {
932 		printk(KERN_WARNING "Route hash chain too long!\n");
933 		printk(KERN_WARNING "Adjust your secret_interval!\n");
934 	}
935 
936 	rt_secret_rebuild_oneshot(net);
937 }
938 
939 /*
940    Short description of GC goals.
941 
942    We want to build an algorithm which keeps the routing cache
943    at some equilibrium point, where the number of aged-off entries
944    is approximately equal to the number of newly generated ones.
945 
946    The current expiration strength is the variable "expire".
947    We try to adjust it dynamically, so that when the network
948    is idle, expire is large enough to keep enough warm entries,
949    and when load increases it is reduced to limit the cache size.
950  */
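/*
 * For example, with the defaults above (RT_GC_TIMEOUT = 300*HZ,
 * ip_rt_gc_min_interval = HZ/2): an idle cache keeps unreferenced entries
 * for up to ~5 minutes; each pass that misses its goal halves "expire",
 * so under sustained pressure almost any unreferenced entry quickly
 * becomes eligible, and each successful pass adds HZ/2 back, capped at
 * ip_rt_gc_timeout.
 */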
951 
952 static int rt_garbage_collect(struct dst_ops *ops)
953 {
954 	static unsigned long expire = RT_GC_TIMEOUT;
955 	static unsigned long last_gc;
956 	static int rover;
957 	static int equilibrium;
958 	struct rtable *rth, **rthp;
959 	unsigned long now = jiffies;
960 	int goal;
961 
962 	/*
963 	 * Garbage collection is pretty expensive,
964 	 * do not run it too frequently.
965 	 */
966 
967 	RT_CACHE_STAT_INC(gc_total);
968 
969 	if (now - last_gc < ip_rt_gc_min_interval &&
970 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
971 		RT_CACHE_STAT_INC(gc_ignored);
972 		goto out;
973 	}
974 
975 	/* Calculate the number of entries which we want to expire now. */
976 	goal = atomic_read(&ipv4_dst_ops.entries) -
977 		(ip_rt_gc_elasticity << rt_hash_log);
978 	if (goal <= 0) {
979 		if (equilibrium < ipv4_dst_ops.gc_thresh)
980 			equilibrium = ipv4_dst_ops.gc_thresh;
981 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
982 		if (goal > 0) {
983 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
984 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
985 		}
986 	} else {
987 		/* We are in a dangerous area. Try to reduce the cache really
988 		 * aggressively.
989 		 */
990 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
991 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
992 	}
993 
994 	if (now - last_gc >= ip_rt_gc_min_interval)
995 		last_gc = now;
996 
997 	if (goal <= 0) {
998 		equilibrium += goal;
999 		goto work_done;
1000 	}
1001 
1002 	do {
1003 		int i, k;
1004 
1005 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1006 			unsigned long tmo = expire;
1007 
1008 			k = (k + 1) & rt_hash_mask;
1009 			rthp = &rt_hash_table[k].chain;
1010 			spin_lock_bh(rt_hash_lock_addr(k));
1011 			while ((rth = *rthp) != NULL) {
1012 				if (!rt_is_expired(rth) &&
1013 					!rt_may_expire(rth, tmo, expire)) {
1014 					tmo >>= 1;
1015 					rthp = &rth->u.dst.rt_next;
1016 					continue;
1017 				}
1018 				*rthp = rth->u.dst.rt_next;
1019 				rt_free(rth);
1020 				goal--;
1021 			}
1022 			spin_unlock_bh(rt_hash_lock_addr(k));
1023 			if (goal <= 0)
1024 				break;
1025 		}
1026 		rover = k;
1027 
1028 		if (goal <= 0)
1029 			goto work_done;
1030 
1031 		/* Goal is not achieved. We stop the process if:
1032 
1033 		   - expire has been reduced to zero; otherwise, expire is halved.
1034 		   - the table is not full.
1035 		   - we are called from interrupt.
1036 		   - the jiffies check is just a fallback/debug loop breaker.
1037 		     We will not spin here for a long time in any case.
1038 		 */
1039 
1040 		RT_CACHE_STAT_INC(gc_goal_miss);
1041 
1042 		if (expire == 0)
1043 			break;
1044 
1045 		expire >>= 1;
1046 #if RT_CACHE_DEBUG >= 2
1047 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1048 				atomic_read(&ipv4_dst_ops.entries), goal, i);
1049 #endif
1050 
1051 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1052 			goto out;
1053 	} while (!in_softirq() && time_before_eq(jiffies, now));
1054 
1055 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1056 		goto out;
1057 	if (net_ratelimit())
1058 		printk(KERN_WARNING "dst cache overflow\n");
1059 	RT_CACHE_STAT_INC(gc_dst_overflow);
1060 	return 1;
1061 
1062 work_done:
1063 	expire += ip_rt_gc_min_interval;
1064 	if (expire > ip_rt_gc_timeout ||
1065 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1066 		expire = ip_rt_gc_timeout;
1067 #if RT_CACHE_DEBUG >= 2
1068 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1069 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
1070 #endif
1071 out:	return 0;
1072 }
1073 
1074 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1075 			  struct rtable **rp, struct sk_buff *skb)
1076 {
1077 	struct rtable	*rth, **rthp;
1078 	unsigned long	now;
1079 	struct rtable *cand, **candp;
1080 	u32 		min_score;
1081 	int		chain_length;
1082 	int attempts = !in_softirq();
1083 
1084 restart:
1085 	chain_length = 0;
1086 	min_score = ~(u32)0;
1087 	cand = NULL;
1088 	candp = NULL;
1089 	now = jiffies;
1090 
1091 	if (!rt_caching(dev_net(rt->u.dst.dev))) {
1092 		/*
1093 		 * If we're not caching, just tell the caller we
1094 		 * were successful and don't touch the route.  The
1095 		 * caller holds the sole reference to the cache entry, and
1096 		 * it will be released when the caller is done with it.
1097 		 * If we drop it here, the callers have no way to resolve routes
1098 		 * when we're not caching.  Instead, just point *rp at rt, so
1099 		 * the caller gets a single use out of the route.
1100 		 * Note that we do rt_free on this new route entry, so that
1101 		 * once its refcount hits zero, we are still able to reap it
1102 		 * (Thanks Alexey)
1103 		 * Note also that rt_free uses call_rcu.  We don't actually
1104 		 * need RCU protection here; this is just our path to get
1105 		 * on the route gc list.
1106 		 */
1107 
1108 		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1109 			int err = arp_bind_neighbour(&rt->u.dst);
1110 			if (err) {
1111 				if (net_ratelimit())
1112 					printk(KERN_WARNING
1113 					    "Neighbour table failure & not caching routes.\n");
1114 				rt_drop(rt);
1115 				return err;
1116 			}
1117 		}
1118 
1119 		rt_free(rt);
1120 		goto skip_hashing;
1121 	}
1122 
1123 	rthp = &rt_hash_table[hash].chain;
1124 
1125 	spin_lock_bh(rt_hash_lock_addr(hash));
1126 	while ((rth = *rthp) != NULL) {
1127 		if (rt_is_expired(rth)) {
1128 			*rthp = rth->u.dst.rt_next;
1129 			rt_free(rth);
1130 			continue;
1131 		}
1132 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1133 			/* Put it first */
1134 			*rthp = rth->u.dst.rt_next;
1135 			/*
1136 			 * Since lookup is lockfree, the deletion
1137 			 * must be visible to another weakly ordered CPU before
1138 			 * the insertion at the start of the hash chain.
1139 			 */
1140 			rcu_assign_pointer(rth->u.dst.rt_next,
1141 					   rt_hash_table[hash].chain);
1142 			/*
1143 			 * Since lookup is lockfree, the update writes
1144 			 * must be ordered for consistency on SMP.
1145 			 */
1146 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1147 
1148 			dst_use(&rth->u.dst, now);
1149 			spin_unlock_bh(rt_hash_lock_addr(hash));
1150 
1151 			rt_drop(rt);
1152 			if (rp)
1153 				*rp = rth;
1154 			else
1155 				skb_dst_set(skb, &rth->u.dst);
1156 			return 0;
1157 		}
1158 
1159 		if (!atomic_read(&rth->u.dst.__refcnt)) {
1160 			u32 score = rt_score(rth);
1161 
1162 			if (score <= min_score) {
1163 				cand = rth;
1164 				candp = rthp;
1165 				min_score = score;
1166 			}
1167 		}
1168 
1169 		chain_length++;
1170 
1171 		rthp = &rth->u.dst.rt_next;
1172 	}
1173 
1174 	if (cand) {
1175 		/* ip_rt_gc_elasticity used to be the average chain length;
1176 		 * when exceeded, gc becomes really aggressive.
1177 		 *
1178 		 * The second limit is less certain. At the moment it allows
1179 		 * only 2 entries per bucket. We will see.
1180 		 */
1181 		if (chain_length > ip_rt_gc_elasticity) {
1182 			*candp = cand->u.dst.rt_next;
1183 			rt_free(cand);
1184 		}
1185 	} else {
1186 		if (chain_length > rt_chain_length_max) {
1187 			struct net *net = dev_net(rt->u.dst.dev);
1188 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1189 			if (!rt_caching(dev_net(rt->u.dst.dev))) {
1190 				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1191 					rt->u.dst.dev->name, num);
1192 			}
1193 			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1194 		}
1195 	}
1196 
1197 	/* Try to bind the route to ARP only if it is an output
1198 	   route or a unicast forwarding path.
1199 	 */
1200 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1201 		int err = arp_bind_neighbour(&rt->u.dst);
1202 		if (err) {
1203 			spin_unlock_bh(rt_hash_lock_addr(hash));
1204 
1205 			if (err != -ENOBUFS) {
1206 				rt_drop(rt);
1207 				return err;
1208 			}
1209 
1210 			/* Neighbour tables are full and nothing
1211 			   can be released. Try to shrink the route cache;
1212 			   it most likely holds some neighbour records.
1213 			 */
1214 			if (attempts-- > 0) {
1215 				int saved_elasticity = ip_rt_gc_elasticity;
1216 				int saved_int = ip_rt_gc_min_interval;
1217 				ip_rt_gc_elasticity	= 1;
1218 				ip_rt_gc_min_interval	= 0;
1219 				rt_garbage_collect(&ipv4_dst_ops);
1220 				ip_rt_gc_min_interval	= saved_int;
1221 				ip_rt_gc_elasticity	= saved_elasticity;
1222 				goto restart;
1223 			}
1224 
1225 			if (net_ratelimit())
1226 				printk(KERN_WARNING "Neighbour table overflow.\n");
1227 			rt_drop(rt);
1228 			return -ENOBUFS;
1229 		}
1230 	}
1231 
1232 	rt->u.dst.rt_next = rt_hash_table[hash].chain;
1233 
1234 #if RT_CACHE_DEBUG >= 2
1235 	if (rt->u.dst.rt_next) {
1236 		struct rtable *trt;
1237 		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1238 		       hash, &rt->rt_dst);
1239 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1240 			printk(" . %pI4", &trt->rt_dst);
1241 		printk("\n");
1242 	}
1243 #endif
1244 	/*
1245 	 * Since lookup is lockfree, we must make sure
1246 	 * previous writes to rt are committed to memory
1247 	 * before making rt visible to other CPUs.
1248 	 */
1249 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1250 
1251 	spin_unlock_bh(rt_hash_lock_addr(hash));
1252 
1253 skip_hashing:
1254 	if (rp)
1255 		*rp = rt;
1256 	else
1257 		skb_dst_set(skb, &rt->u.dst);
1258 	return 0;
1259 }
1260 
1261 void rt_bind_peer(struct rtable *rt, int create)
1262 {
1263 	static DEFINE_SPINLOCK(rt_peer_lock);
1264 	struct inet_peer *peer;
1265 
1266 	peer = inet_getpeer(rt->rt_dst, create);
1267 
1268 	spin_lock_bh(&rt_peer_lock);
1269 	if (rt->peer == NULL) {
1270 		rt->peer = peer;
1271 		peer = NULL;
1272 	}
1273 	spin_unlock_bh(&rt_peer_lock);
1274 	if (peer)
1275 		inet_putpeer(peer);
1276 }
1277 
1278 /*
1279  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1280  * we can still generate some output.
1281  * Random ID selection looks a bit dangerous because we have no chance of
1282  * selecting an ID that is unique within a reasonable period of time.
1283  * But a broken packet identifier may be better than no packet at all.
1284  */
1285 static void ip_select_fb_ident(struct iphdr *iph)
1286 {
1287 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1288 	static u32 ip_fallback_id;
1289 	u32 salt;
1290 
1291 	spin_lock_bh(&ip_fb_id_lock);
1292 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1293 	iph->id = htons(salt & 0xFFFF);
1294 	ip_fallback_id = salt;
1295 	spin_unlock_bh(&ip_fb_id_lock);
1296 }
1297 
1298 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1299 {
1300 	struct rtable *rt = (struct rtable *) dst;
1301 
1302 	if (rt) {
1303 		if (rt->peer == NULL)
1304 			rt_bind_peer(rt, 1);
1305 
1306 		/* If peer is attached to destination, it is never detached,
1307 		   so we need not grab a lock to dereference it.
1308 		 */
1309 		if (rt->peer) {
1310 			iph->id = htons(inet_getid(rt->peer, more));
1311 			return;
1312 		}
1313 	} else
1314 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1315 		       __builtin_return_address(0));
1316 
1317 	ip_select_fb_ident(iph);
1318 }
1319 
1320 static void rt_del(unsigned hash, struct rtable *rt)
1321 {
1322 	struct rtable **rthp, *aux;
1323 
1324 	rthp = &rt_hash_table[hash].chain;
1325 	spin_lock_bh(rt_hash_lock_addr(hash));
1326 	ip_rt_put(rt);
1327 	while ((aux = *rthp) != NULL) {
1328 		if (aux == rt || rt_is_expired(aux)) {
1329 			*rthp = aux->u.dst.rt_next;
1330 			rt_free(aux);
1331 			continue;
1332 		}
1333 		rthp = &aux->u.dst.rt_next;
1334 	}
1335 	spin_unlock_bh(rt_hash_lock_addr(hash));
1336 }
1337 
1338 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1339 		    __be32 saddr, struct net_device *dev)
1340 {
1341 	int i, k;
1342 	struct in_device *in_dev = in_dev_get(dev);
1343 	struct rtable *rth, **rthp;
1344 	__be32  skeys[2] = { saddr, 0 };
1345 	int  ikeys[2] = { dev->ifindex, 0 };
1346 	struct netevent_redirect netevent;
1347 	struct net *net;
1348 
1349 	if (!in_dev)
1350 		return;
1351 
1352 	net = dev_net(dev);
1353 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1354 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1355 	    ipv4_is_zeronet(new_gw))
1356 		goto reject_redirect;
1357 
1358 	if (!rt_caching(net))
1359 		goto reject_redirect;
1360 
1361 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1362 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1363 			goto reject_redirect;
1364 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1365 			goto reject_redirect;
1366 	} else {
1367 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1368 			goto reject_redirect;
1369 	}
1370 
1371 	for (i = 0; i < 2; i++) {
1372 		for (k = 0; k < 2; k++) {
1373 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1374 						rt_genid(net));
1375 
1376 			rthp=&rt_hash_table[hash].chain;
1377 
1378 			rcu_read_lock();
1379 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1380 				struct rtable *rt;
1381 
1382 				if (rth->fl.fl4_dst != daddr ||
1383 				    rth->fl.fl4_src != skeys[i] ||
1384 				    rth->fl.oif != ikeys[k] ||
1385 				    rth->fl.iif != 0 ||
1386 				    rt_is_expired(rth) ||
1387 				    !net_eq(dev_net(rth->u.dst.dev), net)) {
1388 					rthp = &rth->u.dst.rt_next;
1389 					continue;
1390 				}
1391 
1392 				if (rth->rt_dst != daddr ||
1393 				    rth->rt_src != saddr ||
1394 				    rth->u.dst.error ||
1395 				    rth->rt_gateway != old_gw ||
1396 				    rth->u.dst.dev != dev)
1397 					break;
1398 
1399 				dst_hold(&rth->u.dst);
1400 				rcu_read_unlock();
1401 
1402 				rt = dst_alloc(&ipv4_dst_ops);
1403 				if (rt == NULL) {
1404 					ip_rt_put(rth);
1405 					in_dev_put(in_dev);
1406 					return;
1407 				}
1408 
1409 				/* Copy all the information. */
1410 				*rt = *rth;
1411 				rt->u.dst.__use		= 1;
1412 				atomic_set(&rt->u.dst.__refcnt, 1);
1413 				rt->u.dst.child		= NULL;
1414 				if (rt->u.dst.dev)
1415 					dev_hold(rt->u.dst.dev);
1416 				if (rt->idev)
1417 					in_dev_hold(rt->idev);
1418 				rt->u.dst.obsolete	= 0;
1419 				rt->u.dst.lastuse	= jiffies;
1420 				rt->u.dst.path		= &rt->u.dst;
1421 				rt->u.dst.neighbour	= NULL;
1422 				rt->u.dst.hh		= NULL;
1423 #ifdef CONFIG_XFRM
1424 				rt->u.dst.xfrm		= NULL;
1425 #endif
1426 				rt->rt_genid		= rt_genid(net);
1427 				rt->rt_flags		|= RTCF_REDIRECTED;
1428 
1429 				/* Gateway is different ... */
1430 				rt->rt_gateway		= new_gw;
1431 
1432 				/* Redirect received -> path was valid */
1433 				dst_confirm(&rth->u.dst);
1434 
1435 				if (rt->peer)
1436 					atomic_inc(&rt->peer->refcnt);
1437 
1438 				if (arp_bind_neighbour(&rt->u.dst) ||
1439 				    !(rt->u.dst.neighbour->nud_state &
1440 					    NUD_VALID)) {
1441 					if (rt->u.dst.neighbour)
1442 						neigh_event_send(rt->u.dst.neighbour, NULL);
1443 					ip_rt_put(rth);
1444 					rt_drop(rt);
1445 					goto do_next;
1446 				}
1447 
1448 				netevent.old = &rth->u.dst;
1449 				netevent.new = &rt->u.dst;
1450 				call_netevent_notifiers(NETEVENT_REDIRECT,
1451 							&netevent);
1452 
1453 				rt_del(hash, rth);
1454 				if (!rt_intern_hash(hash, rt, &rt, NULL))
1455 					ip_rt_put(rt);
1456 				goto do_next;
1457 			}
1458 			rcu_read_unlock();
1459 		do_next:
1460 			;
1461 		}
1462 	}
1463 	in_dev_put(in_dev);
1464 	return;
1465 
1466 reject_redirect:
1467 #ifdef CONFIG_IP_ROUTE_VERBOSE
1468 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1469 		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1470 			"  Advised path = %pI4 -> %pI4\n",
1471 		       &old_gw, dev->name, &new_gw,
1472 		       &saddr, &daddr);
1473 #endif
1474 	in_dev_put(in_dev);
1475 }
1476 
1477 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1478 {
1479 	struct rtable *rt = (struct rtable *)dst;
1480 	struct dst_entry *ret = dst;
1481 
1482 	if (rt) {
1483 		if (dst->obsolete) {
1484 			ip_rt_put(rt);
1485 			ret = NULL;
1486 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1487 			   rt->u.dst.expires) {
1488 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1489 						rt->fl.oif,
1490 						rt_genid(dev_net(dst->dev)));
1491 #if RT_CACHE_DEBUG >= 1
1492 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1493 				&rt->rt_dst, rt->fl.fl4_tos);
1494 #endif
1495 			rt_del(hash, rt);
1496 			ret = NULL;
1497 		}
1498 	}
1499 	return ret;
1500 }
1501 
1502 /*
1503  * Algorithm:
1504  *	1. The first ip_rt_redirect_number redirects are sent
1505  *	   with exponential backoff, then we stop sending them at all,
1506  *	   assuming that the host ignores our redirects.
1507  *	2. If we did not see packets requiring redirects
1508  *	   during ip_rt_redirect_silence, we assume that the host
1509  *	   has forgotten the redirected route and we start sending redirects again.
1510  *
1511  * This algorithm is much cheaper and more intelligent than dumb load limiting
1512  * in icmp.c.
1513  *
1514  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1515  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1516  */
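/*
 * With the defaults above (ip_rt_redirect_load = HZ/50,
 * ip_rt_redirect_number = 9, ip_rt_redirect_silence = (HZ/50) << 10),
 * successive redirects are spaced exponentially (roughly 40ms, 80ms, ...
 * up to ~5s at HZ = 1000), at most 9 are sent, and the counter is only
 * reset after ~20s pass without packets that would require a redirect.
 */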
1517 
1518 void ip_rt_send_redirect(struct sk_buff *skb)
1519 {
1520 	struct rtable *rt = skb_rtable(skb);
1521 	struct in_device *in_dev;
1522 	int log_martians;
1523 
1524 	rcu_read_lock();
1525 	in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1526 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1527 		rcu_read_unlock();
1528 		return;
1529 	}
1530 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1531 	rcu_read_unlock();
1532 
1533 	/* No redirected packets during ip_rt_redirect_silence;
1534 	 * reset the algorithm.
1535 	 */
1536 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1537 		rt->u.dst.rate_tokens = 0;
1538 
1539 	/* Too many ignored redirects; do not send anything.
1540 	 * Set u.dst.rate_last to the last seen redirected packet.
1541 	 */
1542 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1543 		rt->u.dst.rate_last = jiffies;
1544 		return;
1545 	}
1546 
1547 	/* Check for load limit; set rate_last to the latest sent
1548 	 * redirect.
1549 	 */
1550 	if (rt->u.dst.rate_tokens == 0 ||
1551 	    time_after(jiffies,
1552 		       (rt->u.dst.rate_last +
1553 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1554 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1555 		rt->u.dst.rate_last = jiffies;
1556 		++rt->u.dst.rate_tokens;
1557 #ifdef CONFIG_IP_ROUTE_VERBOSE
1558 		if (log_martians &&
1559 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1560 		    net_ratelimit())
1561 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1562 				&rt->rt_src, rt->rt_iif,
1563 				&rt->rt_dst, &rt->rt_gateway);
1564 #endif
1565 	}
1566 }
1567 
1568 static int ip_error(struct sk_buff *skb)
1569 {
1570 	struct rtable *rt = skb_rtable(skb);
1571 	unsigned long now;
1572 	int code;
1573 
1574 	switch (rt->u.dst.error) {
1575 		case EINVAL:
1576 		default:
1577 			goto out;
1578 		case EHOSTUNREACH:
1579 			code = ICMP_HOST_UNREACH;
1580 			break;
1581 		case ENETUNREACH:
1582 			code = ICMP_NET_UNREACH;
1583 			IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1584 					IPSTATS_MIB_INNOROUTES);
1585 			break;
1586 		case EACCES:
1587 			code = ICMP_PKT_FILTERED;
1588 			break;
1589 	}
1590 
1591 	now = jiffies;
1592 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1593 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1594 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1595 	rt->u.dst.rate_last = now;
1596 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1597 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1598 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1599 	}
1600 
1601 out:	kfree_skb(skb);
1602 	return 0;
1603 }
1604 
1605 /*
1606  *	The last two values are not from the RFC but
1607  *	are needed for AMPRnet AX.25 paths.
1608  */
1609 
1610 static const unsigned short mtu_plateau[] =
1611 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1612 
1613 static inline unsigned short guess_mtu(unsigned short old_mtu)
1614 {
1615 	int i;
1616 
1617 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1618 		if (old_mtu > mtu_plateau[i])
1619 			return mtu_plateau[i];
1620 	return 68;
1621 }
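/*
 * For example, guess_mtu(1500) returns 1492 and guess_mtu(296) returns
 * 216; anything at or below 128 falls through to the IPv4 minimum of 68.
 */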
1622 
1623 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1624 				 unsigned short new_mtu,
1625 				 struct net_device *dev)
1626 {
1627 	int i, k;
1628 	unsigned short old_mtu = ntohs(iph->tot_len);
1629 	struct rtable *rth;
1630 	int  ikeys[2] = { dev->ifindex, 0 };
1631 	__be32  skeys[2] = { iph->saddr, 0, };
1632 	__be32  daddr = iph->daddr;
1633 	unsigned short est_mtu = 0;
1634 
1635 	for (k = 0; k < 2; k++) {
1636 		for (i = 0; i < 2; i++) {
1637 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1638 						rt_genid(net));
1639 
1640 			rcu_read_lock();
1641 			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1642 			     rth = rcu_dereference(rth->u.dst.rt_next)) {
1643 				unsigned short mtu = new_mtu;
1644 
1645 				if (rth->fl.fl4_dst != daddr ||
1646 				    rth->fl.fl4_src != skeys[i] ||
1647 				    rth->rt_dst != daddr ||
1648 				    rth->rt_src != iph->saddr ||
1649 				    rth->fl.oif != ikeys[k] ||
1650 				    rth->fl.iif != 0 ||
1651 				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1652 				    !net_eq(dev_net(rth->u.dst.dev), net) ||
1653 				    rt_is_expired(rth))
1654 					continue;
1655 
1656 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1657 
1658 					/* BSD 4.2 compatibility hack :-( */
1659 					if (mtu == 0 &&
1660 					    old_mtu >= dst_mtu(&rth->u.dst) &&
1661 					    old_mtu >= 68 + (iph->ihl << 2))
1662 						old_mtu -= iph->ihl << 2;
1663 
1664 					mtu = guess_mtu(old_mtu);
1665 				}
1666 				if (mtu <= dst_mtu(&rth->u.dst)) {
1667 					if (mtu < dst_mtu(&rth->u.dst)) {
1668 						dst_confirm(&rth->u.dst);
1669 						if (mtu < ip_rt_min_pmtu) {
1670 							mtu = ip_rt_min_pmtu;
1671 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1672 								(1 << RTAX_MTU);
1673 						}
1674 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1675 						dst_set_expires(&rth->u.dst,
1676 							ip_rt_mtu_expires);
1677 					}
1678 					est_mtu = mtu;
1679 				}
1680 			}
1681 			rcu_read_unlock();
1682 		}
1683 	}
1684 	return est_mtu ? : new_mtu;
1685 }
1686 
1687 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1688 {
1689 	if (dst_mtu(dst) > mtu && mtu >= 68 &&
1690 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1691 		if (mtu < ip_rt_min_pmtu) {
1692 			mtu = ip_rt_min_pmtu;
1693 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1694 		}
1695 		dst->metrics[RTAX_MTU-1] = mtu;
1696 		dst_set_expires(dst, ip_rt_mtu_expires);
1697 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1698 	}
1699 }
1700 
1701 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1702 {
1703 	return NULL;
1704 }
1705 
1706 static void ipv4_dst_destroy(struct dst_entry *dst)
1707 {
1708 	struct rtable *rt = (struct rtable *) dst;
1709 	struct inet_peer *peer = rt->peer;
1710 	struct in_device *idev = rt->idev;
1711 
1712 	if (peer) {
1713 		rt->peer = NULL;
1714 		inet_putpeer(peer);
1715 	}
1716 
1717 	if (idev) {
1718 		rt->idev = NULL;
1719 		in_dev_put(idev);
1720 	}
1721 }
1722 
1723 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1724 			    int how)
1725 {
1726 	struct rtable *rt = (struct rtable *) dst;
1727 	struct in_device *idev = rt->idev;
1728 	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1729 		struct in_device *loopback_idev =
1730 			in_dev_get(dev_net(dev)->loopback_dev);
1731 		if (loopback_idev) {
1732 			rt->idev = loopback_idev;
1733 			in_dev_put(idev);
1734 		}
1735 	}
1736 }
1737 
1738 static void ipv4_link_failure(struct sk_buff *skb)
1739 {
1740 	struct rtable *rt;
1741 
1742 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1743 
1744 	rt = skb_rtable(skb);
1745 	if (rt)
1746 		dst_set_expires(&rt->u.dst, 0);
1747 }
1748 
1749 static int ip_rt_bug(struct sk_buff *skb)
1750 {
1751 	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1752 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1753 		skb->dev ? skb->dev->name : "?");
1754 	kfree_skb(skb);
1755 	return 0;
1756 }
1757 
1758 /*
1759    We do not cache the source address of the outgoing interface,
1760    because it is used only by the IP RR, TS and SRR options,
1761    so it is out of the fast path.
1762 
1763    BTW remember: "addr" is allowed to be unaligned
1764    in IP options!
1765  */
1766 
1767 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1768 {
1769 	__be32 src;
1770 	struct fib_result res;
1771 
1772 	if (rt->fl.iif == 0)
1773 		src = rt->rt_src;
1774 	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1775 		src = FIB_RES_PREFSRC(res);
1776 		fib_res_put(&res);
1777 	} else
1778 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1779 					RT_SCOPE_UNIVERSE);
1780 	memcpy(addr, &src, 4);
1781 }
1782 
1783 #ifdef CONFIG_NET_CLS_ROUTE
1784 static void set_class_tag(struct rtable *rt, u32 tag)
1785 {
1786 	if (!(rt->u.dst.tclassid & 0xFFFF))
1787 		rt->u.dst.tclassid |= tag & 0xFFFF;
1788 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1789 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1790 }
1791 #endif
1792 
1793 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1794 {
1795 	struct fib_info *fi = res->fi;
1796 
1797 	if (fi) {
1798 		if (FIB_RES_GW(*res) &&
1799 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1800 			rt->rt_gateway = FIB_RES_GW(*res);
1801 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1802 		       sizeof(rt->u.dst.metrics));
1803 		if (fi->fib_mtu == 0) {
1804 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1805 			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1806 			    rt->rt_gateway != rt->rt_dst &&
1807 			    rt->u.dst.dev->mtu > 576)
1808 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1809 		}
1810 #ifdef CONFIG_NET_CLS_ROUTE
1811 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1812 #endif
1813 	} else
1814 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1815 
1816 	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1817 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1818 	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1819 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1820 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1821 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1822 				       ip_rt_min_advmss);
1823 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1824 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1825 
1826 #ifdef CONFIG_NET_CLS_ROUTE
1827 #ifdef CONFIG_IP_MULTIPLE_TABLES
1828 	set_class_tag(rt, fib_rules_tclass(res));
1829 #endif
1830 	set_class_tag(rt, itag);
1831 #endif
1832 	rt->rt_type = res->type;
1833 }
1834 
1835 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1836 				u8 tos, struct net_device *dev, int our)
1837 {
1838 	unsigned hash;
1839 	struct rtable *rth;
1840 	__be32 spec_dst;
1841 	struct in_device *in_dev = in_dev_get(dev);
1842 	u32 itag = 0;
1843 
1844 	/* Primary sanity checks. */
1845 
1846 	if (in_dev == NULL)
1847 		return -EINVAL;
1848 
1849 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1850 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1851 		goto e_inval;
1852 
1853 	if (ipv4_is_zeronet(saddr)) {
1854 		if (!ipv4_is_local_multicast(daddr))
1855 			goto e_inval;
1856 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1857 	} else if (fib_validate_source(saddr, 0, tos, 0,
1858 					dev, &spec_dst, &itag, 0) < 0)
1859 		goto e_inval;
1860 
1861 	rth = dst_alloc(&ipv4_dst_ops);
1862 	if (!rth)
1863 		goto e_nobufs;
1864 
1865 	rth->u.dst.output= ip_rt_bug;
1866 
1867 	atomic_set(&rth->u.dst.__refcnt, 1);
1868 	rth->u.dst.flags= DST_HOST;
1869 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1870 		rth->u.dst.flags |= DST_NOPOLICY;
1871 	rth->fl.fl4_dst	= daddr;
1872 	rth->rt_dst	= daddr;
1873 	rth->fl.fl4_tos	= tos;
1874 	rth->fl.mark    = skb->mark;
1875 	rth->fl.fl4_src	= saddr;
1876 	rth->rt_src	= saddr;
1877 #ifdef CONFIG_NET_CLS_ROUTE
1878 	rth->u.dst.tclassid = itag;
1879 #endif
1880 	rth->rt_iif	=
1881 	rth->fl.iif	= dev->ifindex;
1882 	rth->u.dst.dev	= init_net.loopback_dev;
1883 	dev_hold(rth->u.dst.dev);
1884 	rth->idev	= in_dev_get(rth->u.dst.dev);
1885 	rth->fl.oif	= 0;
1886 	rth->rt_gateway	= daddr;
1887 	rth->rt_spec_dst= spec_dst;
1888 	rth->rt_genid	= rt_genid(dev_net(dev));
1889 	rth->rt_flags	= RTCF_MULTICAST;
1890 	rth->rt_type	= RTN_MULTICAST;
1891 	if (our) {
1892 		rth->u.dst.input= ip_local_deliver;
1893 		rth->rt_flags |= RTCF_LOCAL;
1894 	}
1895 
1896 #ifdef CONFIG_IP_MROUTE
1897 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1898 		rth->u.dst.input = ip_mr_input;
1899 #endif
1900 	RT_CACHE_STAT_INC(in_slow_mc);
1901 
1902 	in_dev_put(in_dev);
1903 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1904 	return rt_intern_hash(hash, rth, NULL, skb);
1905 
1906 e_nobufs:
1907 	in_dev_put(in_dev);
1908 	return -ENOBUFS;
1909 
1910 e_inval:
1911 	in_dev_put(in_dev);
1912 	return -EINVAL;
1913 }
1914 
1915 
1916 static void ip_handle_martian_source(struct net_device *dev,
1917 				     struct in_device *in_dev,
1918 				     struct sk_buff *skb,
1919 				     __be32 daddr,
1920 				     __be32 saddr)
1921 {
1922 	RT_CACHE_STAT_INC(in_martian_src);
1923 #ifdef CONFIG_IP_ROUTE_VERBOSE
1924 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1925 		/*
1926 		 *	RFC 1812 recommendation: if the source is martian,
1927 		 *	the only hint we can give is the MAC header.
1928 		 */
1929 		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1930 			&daddr, &saddr, dev->name);
1931 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1932 			int i;
1933 			const unsigned char *p = skb_mac_header(skb);
1934 			printk(KERN_WARNING "ll header: ");
1935 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1936 				printk("%02x", *p);
1937 				if (i < (dev->hard_header_len - 1))
1938 					printk(":");
1939 			}
1940 			printk("\n");
1941 		}
1942 	}
1943 #endif
1944 }
1945 
1946 static int __mkroute_input(struct sk_buff *skb,
1947 			   struct fib_result *res,
1948 			   struct in_device *in_dev,
1949 			   __be32 daddr, __be32 saddr, u32 tos,
1950 			   struct rtable **result)
1951 {
1952 
1953 	struct rtable *rth;
1954 	int err;
1955 	struct in_device *out_dev;
1956 	unsigned flags = 0;
1957 	__be32 spec_dst;
1958 	u32 itag;
1959 
1960 	/* get a working reference to the output device */
1961 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1962 	if (out_dev == NULL) {
1963 		if (net_ratelimit())
1964 			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1965 			       "Please report.\n");
1966 		return -EINVAL;
1967 	}
1968 
1969 
1970 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1971 				  in_dev->dev, &spec_dst, &itag, skb->mark);
1972 	if (err < 0) {
1973 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1974 					 saddr);
1975 
1976 		err = -EINVAL;
1977 		goto cleanup;
1978 	}
1979 
1980 	if (err)
1981 		flags |= RTCF_DIRECTSRC;
1982 
1983 	if (out_dev == in_dev && err &&
1984 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1985 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1986 		flags |= RTCF_DOREDIRECT;
1987 
1988 	if (skb->protocol != htons(ETH_P_IP)) {
1989 		/* Not IP (i.e. ARP). Do not create a route if it is
1990 		 * invalid for proxy ARP. DNAT routes are always valid.
1991 		 */
1992 		if (out_dev == in_dev) {
1993 			err = -EINVAL;
1994 			goto cleanup;
1995 		}
1996 	}
1997 
1998 
1999 	rth = dst_alloc(&ipv4_dst_ops);
2000 	if (!rth) {
2001 		err = -ENOBUFS;
2002 		goto cleanup;
2003 	}
2004 
2005 	atomic_set(&rth->u.dst.__refcnt, 1);
2006 	rth->u.dst.flags= DST_HOST;
2007 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2008 		rth->u.dst.flags |= DST_NOPOLICY;
2009 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2010 		rth->u.dst.flags |= DST_NOXFRM;
2011 	rth->fl.fl4_dst	= daddr;
2012 	rth->rt_dst	= daddr;
2013 	rth->fl.fl4_tos	= tos;
2014 	rth->fl.mark    = skb->mark;
2015 	rth->fl.fl4_src	= saddr;
2016 	rth->rt_src	= saddr;
2017 	rth->rt_gateway	= daddr;
2018 	rth->rt_iif 	=
2019 		rth->fl.iif	= in_dev->dev->ifindex;
2020 	rth->u.dst.dev	= (out_dev)->dev;
2021 	dev_hold(rth->u.dst.dev);
2022 	rth->idev	= in_dev_get(rth->u.dst.dev);
2023 	rth->fl.oif 	= 0;
2024 	rth->rt_spec_dst= spec_dst;
2025 
2026 	rth->u.dst.input = ip_forward;
2027 	rth->u.dst.output = ip_output;
2028 	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2029 
2030 	rt_set_nexthop(rth, res, itag);
2031 
2032 	rth->rt_flags = flags;
2033 
2034 	*result = rth;
2035 	err = 0;
2036  cleanup:
2037 	/* release the working reference to the output device */
2038 	in_dev_put(out_dev);
2039 	return err;
2040 }
2041 
2042 static int ip_mkroute_input(struct sk_buff *skb,
2043 			    struct fib_result *res,
2044 			    const struct flowi *fl,
2045 			    struct in_device *in_dev,
2046 			    __be32 daddr, __be32 saddr, u32 tos)
2047 {
2048 	struct rtable* rth = NULL;
2049 	int err;
2050 	unsigned hash;
2051 
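	/*
	 * If the route has several next hops and the flow does not force an
	 * output interface, let the FIB code pick one of them.
	 */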
2052 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2053 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2054 		fib_select_multipath(fl, res);
2055 #endif
2056 
2057 	/* create a routing cache entry */
2058 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2059 	if (err)
2060 		return err;
2061 
2062 	/* put it into the cache */
2063 	hash = rt_hash(daddr, saddr, fl->iif,
2064 		       rt_genid(dev_net(rth->u.dst.dev)));
2065 	return rt_intern_hash(hash, rth, NULL, skb);
2066 }
2067 
2068 /*
2069  *	NOTE. We drop all packets that have a local source
2070  *	address, because every properly looped-back packet must already
2071  *	have the correct destination attached by the output routine.
2072  *
2073  *	Such an approach solves two big problems:
2074  *	1. Non-simplex devices are handled properly.
2075  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2076  */
2077 
2078 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2079 			       u8 tos, struct net_device *dev)
2080 {
2081 	struct fib_result res;
2082 	struct in_device *in_dev = in_dev_get(dev);
2083 	struct flowi fl = { .nl_u = { .ip4_u =
2084 				      { .daddr = daddr,
2085 					.saddr = saddr,
2086 					.tos = tos,
2087 					.scope = RT_SCOPE_UNIVERSE,
2088 				      } },
2089 			    .mark = skb->mark,
2090 			    .iif = dev->ifindex };
2091 	unsigned	flags = 0;
2092 	u32		itag = 0;
2093 	struct rtable * rth;
2094 	unsigned	hash;
2095 	__be32		spec_dst;
2096 	int		err = -EINVAL;
2097 	int		free_res = 0;
2098 	struct net    * net = dev_net(dev);
2099 
2100 	/* IP on this device is disabled. */
2101 
2102 	if (!in_dev)
2103 		goto out;
2104 
2105 	/* Check for the weirdest martians, which cannot be detected
2106 	   by fib_lookup.
2107 	 */
2108 
2109 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2110 	    ipv4_is_loopback(saddr))
2111 		goto martian_source;
2112 
2113 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2114 		goto brd_input;
2115 
2116 	/* Accept zero addresses only for limited broadcast;
2117 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2118 	 */
2119 	if (ipv4_is_zeronet(saddr))
2120 		goto martian_source;
2121 
2122 	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2123 	    ipv4_is_loopback(daddr))
2124 		goto martian_destination;
2125 
2126 	/*
2127 	 *	Now we are ready to route the packet.
2128 	 */
2129 	if ((err = fib_lookup(net, &fl, &res)) != 0) {
2130 		if (!IN_DEV_FORWARD(in_dev))
2131 			goto e_hostunreach;
2132 		goto no_route;
2133 	}
2134 	free_res = 1;
2135 
2136 	RT_CACHE_STAT_INC(in_slow_tot);
2137 
2138 	if (res.type == RTN_BROADCAST)
2139 		goto brd_input;
2140 
2141 	if (res.type == RTN_LOCAL) {
2142 		int result;
2143 		result = fib_validate_source(saddr, daddr, tos,
2144 					     net->loopback_dev->ifindex,
2145 					     dev, &spec_dst, &itag, skb->mark);
2146 		if (result < 0)
2147 			goto martian_source;
2148 		if (result)
2149 			flags |= RTCF_DIRECTSRC;
2150 		spec_dst = daddr;
2151 		goto local_input;
2152 	}
2153 
2154 	if (!IN_DEV_FORWARD(in_dev))
2155 		goto e_hostunreach;
2156 	if (res.type != RTN_UNICAST)
2157 		goto martian_destination;
2158 
2159 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2160 done:
2161 	in_dev_put(in_dev);
2162 	if (free_res)
2163 		fib_res_put(&res);
2164 out:	return err;
2165 
2166 brd_input:
2167 	if (skb->protocol != htons(ETH_P_IP))
2168 		goto e_inval;
2169 
2170 	if (ipv4_is_zeronet(saddr))
2171 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2172 	else {
2173 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2174 					  &itag, skb->mark);
2175 		if (err < 0)
2176 			goto martian_source;
2177 		if (err)
2178 			flags |= RTCF_DIRECTSRC;
2179 	}
2180 	flags |= RTCF_BROADCAST;
2181 	res.type = RTN_BROADCAST;
2182 	RT_CACHE_STAT_INC(in_brd);
2183 
2184 local_input:
2185 	rth = dst_alloc(&ipv4_dst_ops);
2186 	if (!rth)
2187 		goto e_nobufs;
2188 
2189 	rth->u.dst.output= ip_rt_bug;
2190 	rth->rt_genid = rt_genid(net);
2191 
2192 	atomic_set(&rth->u.dst.__refcnt, 1);
2193 	rth->u.dst.flags= DST_HOST;
2194 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2195 		rth->u.dst.flags |= DST_NOPOLICY;
2196 	rth->fl.fl4_dst	= daddr;
2197 	rth->rt_dst	= daddr;
2198 	rth->fl.fl4_tos	= tos;
2199 	rth->fl.mark    = skb->mark;
2200 	rth->fl.fl4_src	= saddr;
2201 	rth->rt_src	= saddr;
2202 #ifdef CONFIG_NET_CLS_ROUTE
2203 	rth->u.dst.tclassid = itag;
2204 #endif
2205 	rth->rt_iif	=
2206 	rth->fl.iif	= dev->ifindex;
2207 	rth->u.dst.dev	= net->loopback_dev;
2208 	dev_hold(rth->u.dst.dev);
2209 	rth->idev	= in_dev_get(rth->u.dst.dev);
2210 	rth->rt_gateway	= daddr;
2211 	rth->rt_spec_dst= spec_dst;
2212 	rth->u.dst.input= ip_local_deliver;
2213 	rth->rt_flags 	= flags|RTCF_LOCAL;
2214 	if (res.type == RTN_UNREACHABLE) {
2215 		rth->u.dst.input= ip_error;
2216 		rth->u.dst.error= -err;
2217 		rth->rt_flags 	&= ~RTCF_LOCAL;
2218 	}
2219 	rth->rt_type	= res.type;
2220 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2221 	err = rt_intern_hash(hash, rth, NULL, skb);
2222 	goto done;
2223 
2224 no_route:
2225 	RT_CACHE_STAT_INC(in_no_route);
2226 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2227 	res.type = RTN_UNREACHABLE;
2228 	if (err == -ESRCH)
2229 		err = -ENETUNREACH;
2230 	goto local_input;
2231 
2232 	/*
2233 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2234 	 */
2235 martian_destination:
2236 	RT_CACHE_STAT_INC(in_martian_dst);
2237 #ifdef CONFIG_IP_ROUTE_VERBOSE
2238 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2239 		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2240 			&daddr, &saddr, dev->name);
2241 #endif
2242 
2243 e_hostunreach:
2244 	err = -EHOSTUNREACH;
2245 	goto done;
2246 
2247 e_inval:
2248 	err = -EINVAL;
2249 	goto done;
2250 
2251 e_nobufs:
2252 	err = -ENOBUFS;
2253 	goto done;
2254 
2255 martian_source:
2256 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2257 	goto e_inval;
2258 }
2259 
2260 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2261 		   u8 tos, struct net_device *dev)
2262 {
2263 	struct rtable * rth;
2264 	unsigned	hash;
2265 	int iif = dev->ifindex;
2266 	struct net *net;
2267 
2268 	net = dev_net(dev);
2269 
2270 	if (!rt_caching(net))
2271 		goto skip_cache;
2272 
2273 	tos &= IPTOS_RT_MASK;
2274 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2275 
2276 	rcu_read_lock();
2277 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2278 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
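		/*
		 * Compare all lookup keys at once: OR together the XOR
		 * differences and test against zero, so a cache hit costs a
		 * single conditional branch.  rth->fl.oif must itself be
		 * zero, i.e. the entry must be an input route.
		 */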
2279 		if (((rth->fl.fl4_dst ^ daddr) |
2280 		     (rth->fl.fl4_src ^ saddr) |
2281 		     (rth->fl.iif ^ iif) |
2282 		     rth->fl.oif |
2283 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
2284 		    rth->fl.mark == skb->mark &&
2285 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2286 		    !rt_is_expired(rth)) {
2287 			dst_use(&rth->u.dst, jiffies);
2288 			RT_CACHE_STAT_INC(in_hit);
2289 			rcu_read_unlock();
2290 			skb_dst_set(skb, &rth->u.dst);
2291 			return 0;
2292 		}
2293 		RT_CACHE_STAT_INC(in_hlist_search);
2294 	}
2295 	rcu_read_unlock();
2296 
2297 skip_cache:
2298 	/* Multicast recognition logic has been moved from the route cache
2299 	   to here.  The problem was that too many Ethernet cards have
2300 	   broken/missing hardware multicast filters :-( As a result, a host
2301 	   on a multicast network acquires a lot of useless route cache
2302 	   entries, e.g. from SDR messages from all over the world.  Now we
2303 	   try to get rid of them.  Really, provided the software IP multicast
2304 	   filter is organized reasonably (at least, hashed), it does not
2305 	   result in a slowdown compared with route cache reject entries.
2306 	   Note that multicast routers are not affected, because a route
2307 	   cache entry is created eventually.
2308 	 */
2309 	if (ipv4_is_multicast(daddr)) {
2310 		struct in_device *in_dev;
2311 
2312 		rcu_read_lock();
2313 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2314 			int our = ip_check_mc(in_dev, daddr, saddr,
2315 				ip_hdr(skb)->protocol);
2316 			if (our
2317 #ifdef CONFIG_IP_MROUTE
2318 				||
2319 			    (!ipv4_is_local_multicast(daddr) &&
2320 			     IN_DEV_MFORWARD(in_dev))
2321 #endif
2322 			   ) {
2323 				rcu_read_unlock();
2324 				return ip_route_input_mc(skb, daddr, saddr,
2325 							 tos, dev, our);
2326 			}
2327 		}
2328 		rcu_read_unlock();
2329 		return -EINVAL;
2330 	}
2331 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2332 }
2333 
2334 static int __mkroute_output(struct rtable **result,
2335 			    struct fib_result *res,
2336 			    const struct flowi *fl,
2337 			    const struct flowi *oldflp,
2338 			    struct net_device *dev_out,
2339 			    unsigned flags)
2340 {
2341 	struct rtable *rth;
2342 	struct in_device *in_dev;
2343 	u32 tos = RT_FL_TOS(oldflp);
2344 	int err = 0;
2345 
2346 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2347 		return -EINVAL;
2348 
2349 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2350 		res->type = RTN_BROADCAST;
2351 	else if (ipv4_is_multicast(fl->fl4_dst))
2352 		res->type = RTN_MULTICAST;
2353 	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2354 		return -EINVAL;
2355 
2356 	if (dev_out->flags & IFF_LOOPBACK)
2357 		flags |= RTCF_LOCAL;
2358 
2359 	/* get a working reference to the inet device */
2360 	in_dev = in_dev_get(dev_out);
2361 	if (!in_dev)
2362 		return -EINVAL;
2363 
2364 	if (res->type == RTN_BROADCAST) {
2365 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2366 		if (res->fi) {
2367 			fib_info_put(res->fi);
2368 			res->fi = NULL;
2369 		}
2370 	} else if (res->type == RTN_MULTICAST) {
2371 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2372 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2373 				 oldflp->proto))
2374 			flags &= ~RTCF_LOCAL;
2375 		/* If a multicast route does not exist, use the
2376 		   default one, but do not gateway in this case.
2377 		   Yes, it is a hack.
2378 		 */
2379 		if (res->fi && res->prefixlen < 4) {
2380 			fib_info_put(res->fi);
2381 			res->fi = NULL;
2382 		}
2383 	}
2384 
2385 
2386 	rth = dst_alloc(&ipv4_dst_ops);
2387 	if (!rth) {
2388 		err = -ENOBUFS;
2389 		goto cleanup;
2390 	}
2391 
2392 	atomic_set(&rth->u.dst.__refcnt, 1);
2393 	rth->u.dst.flags= DST_HOST;
2394 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2395 		rth->u.dst.flags |= DST_NOXFRM;
2396 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2397 		rth->u.dst.flags |= DST_NOPOLICY;
2398 
2399 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2400 	rth->fl.fl4_tos	= tos;
2401 	rth->fl.fl4_src	= oldflp->fl4_src;
2402 	rth->fl.oif	= oldflp->oif;
2403 	rth->fl.mark    = oldflp->mark;
2404 	rth->rt_dst	= fl->fl4_dst;
2405 	rth->rt_src	= fl->fl4_src;
2406 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2407 	/* get references to the devices that are to be held by the routing
2408 	   cache entry */
2409 	rth->u.dst.dev	= dev_out;
2410 	dev_hold(dev_out);
2411 	rth->idev	= in_dev_get(dev_out);
2412 	rth->rt_gateway = fl->fl4_dst;
2413 	rth->rt_spec_dst= fl->fl4_src;
2414 
2415 	rth->u.dst.output=ip_output;
2416 	rth->rt_genid = rt_genid(dev_net(dev_out));
2417 
2418 	RT_CACHE_STAT_INC(out_slow_tot);
2419 
2420 	if (flags & RTCF_LOCAL) {
2421 		rth->u.dst.input = ip_local_deliver;
2422 		rth->rt_spec_dst = fl->fl4_dst;
2423 	}
2424 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2425 		rth->rt_spec_dst = fl->fl4_src;
2426 		if (flags & RTCF_LOCAL &&
2427 		    !(dev_out->flags & IFF_LOOPBACK)) {
2428 			rth->u.dst.output = ip_mc_output;
2429 			RT_CACHE_STAT_INC(out_slow_mc);
2430 		}
2431 #ifdef CONFIG_IP_MROUTE
2432 		if (res->type == RTN_MULTICAST) {
2433 			if (IN_DEV_MFORWARD(in_dev) &&
2434 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2435 				rth->u.dst.input = ip_mr_input;
2436 				rth->u.dst.output = ip_mc_output;
2437 			}
2438 		}
2439 #endif
2440 	}
2441 
2442 	rt_set_nexthop(rth, res, 0);
2443 
2444 	rth->rt_flags = flags;
2445 
2446 	*result = rth;
2447  cleanup:
2448 	/* release the working reference to the inet device */
2449 	in_dev_put(in_dev);
2450 
2451 	return err;
2452 }
2453 
2454 static int ip_mkroute_output(struct rtable **rp,
2455 			     struct fib_result *res,
2456 			     const struct flowi *fl,
2457 			     const struct flowi *oldflp,
2458 			     struct net_device *dev_out,
2459 			     unsigned flags)
2460 {
2461 	struct rtable *rth = NULL;
2462 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2463 	unsigned hash;
2464 	if (err == 0) {
2465 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2466 			       rt_genid(dev_net(dev_out)));
2467 		err = rt_intern_hash(hash, rth, rp, NULL);
2468 	}
2469 
2470 	return err;
2471 }
2472 
2473 /*
2474  * Major route resolver routine.
2475  */
2476 
2477 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2478 				const struct flowi *oldflp)
2479 {
2480 	u32 tos	= RT_FL_TOS(oldflp);
2481 	struct flowi fl = { .nl_u = { .ip4_u =
2482 				      { .daddr = oldflp->fl4_dst,
2483 					.saddr = oldflp->fl4_src,
2484 					.tos = tos & IPTOS_RT_MASK,
2485 					.scope = ((tos & RTO_ONLINK) ?
2486 						  RT_SCOPE_LINK :
2487 						  RT_SCOPE_UNIVERSE),
2488 				      } },
2489 			    .mark = oldflp->mark,
2490 			    .iif = net->loopback_dev->ifindex,
2491 			    .oif = oldflp->oif };
2492 	struct fib_result res;
2493 	unsigned flags = 0;
2494 	struct net_device *dev_out = NULL;
2495 	int free_res = 0;
2496 	int err;
2497 
2498 
2499 	res.fi		= NULL;
2500 #ifdef CONFIG_IP_MULTIPLE_TABLES
2501 	res.r		= NULL;
2502 #endif
2503 
2504 	if (oldflp->fl4_src) {
2505 		err = -EINVAL;
2506 		if (ipv4_is_multicast(oldflp->fl4_src) ||
2507 		    ipv4_is_lbcast(oldflp->fl4_src) ||
2508 		    ipv4_is_zeronet(oldflp->fl4_src))
2509 			goto out;
2510 
2511 		/* I removed the check for oif == dev_out->oif here.
2512 		   It was wrong for two reasons:
2513 		   1. ip_dev_find(net, saddr) can return the wrong iface if
2514 		      saddr is assigned to multiple interfaces.
2515 		   2. Moreover, we are allowed to send packets with saddr
2516 		      of another iface. --ANK
2517 		 */
2518 
2519 		if (oldflp->oif == 0 &&
2520 		    (ipv4_is_multicast(oldflp->fl4_dst) ||
2521 		     oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2522 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2523 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2524 			if (dev_out == NULL)
2525 				goto out;
2526 
2527 			/* Special hack: the user can direct multicasts
2528 			   and limited broadcast via the necessary interface
2529 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2530 			   This hack is not just for fun, it allows
2531 			   vic, vat and friends to work.
2532 			   They bind the socket to loopback, set the ttl to zero
2533 			   and expect that it will work.
2534 			   From the viewpoint of the routing cache they are broken,
2535 			   because we are not allowed to build a multicast path
2536 			   with a loopback source addr (look, the routing cache
2537 			   cannot know that the ttl is zero, so the packet
2538 			   will not leave this host and the route is valid).
2539 			   Luckily, this hack is a good workaround.
2540 			 */
2541 
2542 			fl.oif = dev_out->ifindex;
2543 			goto make_route;
2544 		}
2545 
2546 		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2547 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2548 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2549 			if (dev_out == NULL)
2550 				goto out;
2551 			dev_put(dev_out);
2552 			dev_out = NULL;
2553 		}
2554 	}
2555 
2556 
2557 	if (oldflp->oif) {
2558 		dev_out = dev_get_by_index(net, oldflp->oif);
2559 		err = -ENODEV;
2560 		if (dev_out == NULL)
2561 			goto out;
2562 
2563 		/* RACE: Check return value of inet_select_addr instead. */
2564 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2565 			dev_put(dev_out);
2566 			goto out;	/* Wrong error code */
2567 		}
2568 
2569 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2570 		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2571 			if (!fl.fl4_src)
2572 				fl.fl4_src = inet_select_addr(dev_out, 0,
2573 							      RT_SCOPE_LINK);
2574 			goto make_route;
2575 		}
2576 		if (!fl.fl4_src) {
2577 			if (ipv4_is_multicast(oldflp->fl4_dst))
2578 				fl.fl4_src = inet_select_addr(dev_out, 0,
2579 							      fl.fl4_scope);
2580 			else if (!oldflp->fl4_dst)
2581 				fl.fl4_src = inet_select_addr(dev_out, 0,
2582 							      RT_SCOPE_HOST);
2583 		}
2584 	}
2585 
2586 	if (!fl.fl4_dst) {
2587 		fl.fl4_dst = fl.fl4_src;
2588 		if (!fl.fl4_dst)
2589 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2590 		if (dev_out)
2591 			dev_put(dev_out);
2592 		dev_out = net->loopback_dev;
2593 		dev_hold(dev_out);
2594 		fl.oif = net->loopback_dev->ifindex;
2595 		res.type = RTN_LOCAL;
2596 		flags |= RTCF_LOCAL;
2597 		goto make_route;
2598 	}
2599 
2600 	if (fib_lookup(net, &fl, &res)) {
2601 		res.fi = NULL;
2602 		if (oldflp->oif) {
2603 			/* Apparently, the routing tables are wrong. Assume
2604 			   that the destination is on-link.
2605 
2606 			   WHY? DW.
2607 			   Because we are allowed to send to an iface
2608 			   even if it has NO routes and NO assigned
2609 			   addresses. When oif is specified, the routing
2610 			   tables are looked up with only one purpose:
2611 			   to check whether the destination is gatewayed,
2612 			   rather than direct. Moreover, if MSG_DONTROUTE is
2613 			   set, we send the packet, ignoring both routing
2614 			   tables and ifaddr state. --ANK
2615 
2616 
2617 			   We could do this even if oif is unknown
2618 			   (likely as IPv6 does), but we do not.
2619 			 */
2620 
2621 			if (fl.fl4_src == 0)
2622 				fl.fl4_src = inet_select_addr(dev_out, 0,
2623 							      RT_SCOPE_LINK);
2624 			res.type = RTN_UNICAST;
2625 			goto make_route;
2626 		}
2627 		if (dev_out)
2628 			dev_put(dev_out);
2629 		err = -ENETUNREACH;
2630 		goto out;
2631 	}
2632 	free_res = 1;
2633 
2634 	if (res.type == RTN_LOCAL) {
2635 		if (!fl.fl4_src)
2636 			fl.fl4_src = fl.fl4_dst;
2637 		if (dev_out)
2638 			dev_put(dev_out);
2639 		dev_out = net->loopback_dev;
2640 		dev_hold(dev_out);
2641 		fl.oif = dev_out->ifindex;
2642 		if (res.fi)
2643 			fib_info_put(res.fi);
2644 		res.fi = NULL;
2645 		flags |= RTCF_LOCAL;
2646 		goto make_route;
2647 	}
2648 
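	/*
	 * With several next hops and no forced oif, let the FIB code pick
	 * one; otherwise, when the match is a zero-length-prefix unicast
	 * route, give fib_select_default() a chance to choose among the
	 * default routes.
	 */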
2649 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2650 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2651 		fib_select_multipath(&fl, &res);
2652 	else
2653 #endif
2654 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2655 		fib_select_default(net, &fl, &res);
2656 
2657 	if (!fl.fl4_src)
2658 		fl.fl4_src = FIB_RES_PREFSRC(res);
2659 
2660 	if (dev_out)
2661 		dev_put(dev_out);
2662 	dev_out = FIB_RES_DEV(res);
2663 	dev_hold(dev_out);
2664 	fl.oif = dev_out->ifindex;
2665 
2666 
2667 make_route:
2668 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2669 
2670 
2671 	if (free_res)
2672 		fib_res_put(&res);
2673 	if (dev_out)
2674 		dev_put(dev_out);
2675 out:	return err;
2676 }
2677 
2678 int __ip_route_output_key(struct net *net, struct rtable **rp,
2679 			  const struct flowi *flp)
2680 {
2681 	unsigned hash;
2682 	struct rtable *rth;
2683 
2684 	if (!rt_caching(net))
2685 		goto slow_output;
2686 
2687 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2688 
2689 	rcu_read_lock_bh();
2690 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2691 		rth = rcu_dereference(rth->u.dst.rt_next)) {
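		/*
		 * An output cache entry matches if dst, src, oif and mark are
		 * equal, iif is zero (it is an output route), and the tos
		 * agrees in the bits that matter for routing (IPTOS_RT_MASK)
		 * as well as the RTO_ONLINK flag.
		 */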
2692 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2693 		    rth->fl.fl4_src == flp->fl4_src &&
2694 		    rth->fl.iif == 0 &&
2695 		    rth->fl.oif == flp->oif &&
2696 		    rth->fl.mark == flp->mark &&
2697 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2698 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2699 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2700 		    !rt_is_expired(rth)) {
2701 			dst_use(&rth->u.dst, jiffies);
2702 			RT_CACHE_STAT_INC(out_hit);
2703 			rcu_read_unlock_bh();
2704 			*rp = rth;
2705 			return 0;
2706 		}
2707 		RT_CACHE_STAT_INC(out_hlist_search);
2708 	}
2709 	rcu_read_unlock_bh();
2710 
2711 slow_output:
2712 	return ip_route_output_slow(net, rp, flp);
2713 }
2714 
2715 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2716 
2717 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2718 {
2719 }
2720 
2721 static struct dst_ops ipv4_dst_blackhole_ops = {
2722 	.family			=	AF_INET,
2723 	.protocol		=	cpu_to_be16(ETH_P_IP),
2724 	.destroy		=	ipv4_dst_destroy,
2725 	.check			=	ipv4_dst_check,
2726 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2727 	.entries		=	ATOMIC_INIT(0),
2728 };
2729 
2730 
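/*
 * Replace *rp with a copy of the route whose input/output handlers simply
 * discard packets (dst_discard).  Used by ip_route_output_flow() below when
 * __xfrm_lookup() returns -EREMOTE, so the caller gets a "blackhole" route
 * instead of an error.
 */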
2731 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2732 {
2733 	struct rtable *ort = *rp;
2734 	struct rtable *rt = (struct rtable *)
2735 		dst_alloc(&ipv4_dst_blackhole_ops);
2736 
2737 	if (rt) {
2738 		struct dst_entry *new = &rt->u.dst;
2739 
2740 		atomic_set(&new->__refcnt, 1);
2741 		new->__use = 1;
2742 		new->input = dst_discard;
2743 		new->output = dst_discard;
2744 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2745 
2746 		new->dev = ort->u.dst.dev;
2747 		if (new->dev)
2748 			dev_hold(new->dev);
2749 
2750 		rt->fl = ort->fl;
2751 
2752 		rt->idev = ort->idev;
2753 		if (rt->idev)
2754 			in_dev_hold(rt->idev);
2755 		rt->rt_genid = rt_genid(net);
2756 		rt->rt_flags = ort->rt_flags;
2757 		rt->rt_type = ort->rt_type;
2758 		rt->rt_dst = ort->rt_dst;
2759 		rt->rt_src = ort->rt_src;
2760 		rt->rt_iif = ort->rt_iif;
2761 		rt->rt_gateway = ort->rt_gateway;
2762 		rt->rt_spec_dst = ort->rt_spec_dst;
2763 		rt->peer = ort->peer;
2764 		if (rt->peer)
2765 			atomic_inc(&rt->peer->refcnt);
2766 
2767 		dst_free(new);
2768 	}
2769 
2770 	dst_release(&(*rp)->u.dst);
2771 	*rp = rt;
2772 	return (rt ? 0 : -ENOMEM);
2773 }
2774 
2775 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2776 			 struct sock *sk, int flags)
2777 {
2778 	int err;
2779 
2780 	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2781 		return err;
2782 
2783 	if (flp->proto) {
2784 		if (!flp->fl4_src)
2785 			flp->fl4_src = (*rp)->rt_src;
2786 		if (!flp->fl4_dst)
2787 			flp->fl4_dst = (*rp)->rt_dst;
2788 		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2789 				    flags ? XFRM_LOOKUP_WAIT : 0);
2790 		if (err == -EREMOTE)
2791 			err = ipv4_dst_blackhole(net, rp, flp);
2792 
2793 		return err;
2794 	}
2795 
2796 	return 0;
2797 }
2798 
2799 EXPORT_SYMBOL_GPL(ip_route_output_flow);
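/*
 * A rough sketch of a typical ip_route_output_flow() caller; illustrative
 * only, the addresses and flag values below are made-up placeholders:
 *
 *	struct flowi fl = { .oif = 0,
 *			    .nl_u = { .ip4_u = { .daddr = dst_ip,
 *						 .saddr = 0,
 *						 .tos = RT_TOS(tos) } },
 *			    .proto = IPPROTO_UDP };
 *	struct rtable *rt;
 *	int err = ip_route_output_flow(net, &rt, &fl, sk, 1);
 *
 * With .proto set, a missing source or destination address is filled in
 * from the looked-up route before the xfrm lookup, and a non-zero flags
 * argument makes that lookup wait (XFRM_LOOKUP_WAIT).
 */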
2800 
2801 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2802 {
2803 	return ip_route_output_flow(net, rp, flp, NULL, 0);
2804 }
2805 
2806 static int rt_fill_info(struct net *net,
2807 			struct sk_buff *skb, u32 pid, u32 seq, int event,
2808 			int nowait, unsigned int flags)
2809 {
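	/*
	 * The NLA_PUT*() macros below jump to nla_put_failure when the skb
	 * runs out of room; the partially built message is then cancelled
	 * and -EMSGSIZE returned.
	 */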
2810 	struct rtable *rt = skb_rtable(skb);
2811 	struct rtmsg *r;
2812 	struct nlmsghdr *nlh;
2813 	long expires;
2814 	u32 id = 0, ts = 0, tsage = 0, error;
2815 
2816 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2817 	if (nlh == NULL)
2818 		return -EMSGSIZE;
2819 
2820 	r = nlmsg_data(nlh);
2821 	r->rtm_family	 = AF_INET;
2822 	r->rtm_dst_len	= 32;
2823 	r->rtm_src_len	= 0;
2824 	r->rtm_tos	= rt->fl.fl4_tos;
2825 	r->rtm_table	= RT_TABLE_MAIN;
2826 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2827 	r->rtm_type	= rt->rt_type;
2828 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2829 	r->rtm_protocol = RTPROT_UNSPEC;
2830 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2831 	if (rt->rt_flags & RTCF_NOTIFY)
2832 		r->rtm_flags |= RTM_F_NOTIFY;
2833 
2834 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2835 
2836 	if (rt->fl.fl4_src) {
2837 		r->rtm_src_len = 32;
2838 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2839 	}
2840 	if (rt->u.dst.dev)
2841 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2842 #ifdef CONFIG_NET_CLS_ROUTE
2843 	if (rt->u.dst.tclassid)
2844 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2845 #endif
2846 	if (rt->fl.iif)
2847 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2848 	else if (rt->rt_src != rt->fl.fl4_src)
2849 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2850 
2851 	if (rt->rt_dst != rt->rt_gateway)
2852 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2853 
2854 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2855 		goto nla_put_failure;
2856 
2857 	error = rt->u.dst.error;
2858 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2859 	if (rt->peer) {
2860 		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2861 		if (rt->peer->tcp_ts_stamp) {
2862 			ts = rt->peer->tcp_ts;
2863 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2864 		}
2865 	}
2866 
2867 	if (rt->fl.iif) {
2868 #ifdef CONFIG_IP_MROUTE
2869 		__be32 dst = rt->rt_dst;
2870 
2871 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2872 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2873 			int err = ipmr_get_route(net, skb, r, nowait);
2874 			if (err <= 0) {
2875 				if (!nowait) {
2876 					if (err == 0)
2877 						return 0;
2878 					goto nla_put_failure;
2879 				} else {
2880 					if (err == -EMSGSIZE)
2881 						goto nla_put_failure;
2882 					error = err;
2883 				}
2884 			}
2885 		} else
2886 #endif
2887 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2888 	}
2889 
2890 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2891 			       expires, error) < 0)
2892 		goto nla_put_failure;
2893 
2894 	return nlmsg_end(skb, nlh);
2895 
2896 nla_put_failure:
2897 	nlmsg_cancel(skb, nlh);
2898 	return -EMSGSIZE;
2899 }
2900 
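/*
 * Handle an RTM_GETROUTE request (this is what "ip route get" sends):
 * resolve the route for the given src/dst/tos and optional iif/oif, then
 * report it back to the requester via rt_fill_info() and rtnl_unicast().
 */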
2901 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2902 {
2903 	struct net *net = sock_net(in_skb->sk);
2904 	struct rtmsg *rtm;
2905 	struct nlattr *tb[RTA_MAX+1];
2906 	struct rtable *rt = NULL;
2907 	__be32 dst = 0;
2908 	__be32 src = 0;
2909 	u32 iif;
2910 	int err;
2911 	struct sk_buff *skb;
2912 
2913 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2914 	if (err < 0)
2915 		goto errout;
2916 
2917 	rtm = nlmsg_data(nlh);
2918 
2919 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2920 	if (skb == NULL) {
2921 		err = -ENOBUFS;
2922 		goto errout;
2923 	}
2924 
2925 	/* Reserve room for dummy headers; this skb can pass
2926 	   through a good chunk of the routing engine.
2927 	 */
2928 	skb_reset_mac_header(skb);
2929 	skb_reset_network_header(skb);
2930 
2931 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2932 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2933 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2934 
2935 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2936 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2937 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2938 
2939 	if (iif) {
2940 		struct net_device *dev;
2941 
2942 		dev = __dev_get_by_index(net, iif);
2943 		if (dev == NULL) {
2944 			err = -ENODEV;
2945 			goto errout_free;
2946 		}
2947 
2948 		skb->protocol	= htons(ETH_P_IP);
2949 		skb->dev	= dev;
2950 		local_bh_disable();
2951 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2952 		local_bh_enable();
2953 
2954 		rt = skb_rtable(skb);
2955 		if (err == 0 && rt->u.dst.error)
2956 			err = -rt->u.dst.error;
2957 	} else {
2958 		struct flowi fl = {
2959 			.nl_u = {
2960 				.ip4_u = {
2961 					.daddr = dst,
2962 					.saddr = src,
2963 					.tos = rtm->rtm_tos,
2964 				},
2965 			},
2966 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2967 		};
2968 		err = ip_route_output_key(net, &rt, &fl);
2969 	}
2970 
2971 	if (err)
2972 		goto errout_free;
2973 
2974 	skb_dst_set(skb, &rt->u.dst);
2975 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2976 		rt->rt_flags |= RTCF_NOTIFY;
2977 
2978 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2979 			   RTM_NEWROUTE, 0, 0);
2980 	if (err <= 0)
2981 		goto errout_free;
2982 
2983 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2984 errout:
2985 	return err;
2986 
2987 errout_free:
2988 	kfree_skb(skb);
2989 	goto errout;
2990 }
2991 
2992 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2993 {
2994 	struct rtable *rt;
2995 	int h, s_h;
2996 	int idx, s_idx;
2997 	struct net *net;
2998 
2999 	net = sock_net(skb->sk);
3000 
3001 	s_h = cb->args[0];
3002 	if (s_h < 0)
3003 		s_h = 0;
3004 	s_idx = idx = cb->args[1];
3005 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3006 		if (!rt_hash_table[h].chain)
3007 			continue;
3008 		rcu_read_lock_bh();
3009 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
3010 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
3011 			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3012 				continue;
3013 			if (rt_is_expired(rt))
3014 				continue;
3015 			skb_dst_set(skb, dst_clone(&rt->u.dst));
3016 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3017 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3018 					 1, NLM_F_MULTI) <= 0) {
3019 				skb_dst_drop(skb);
3020 				rcu_read_unlock_bh();
3021 				goto done;
3022 			}
3023 			skb_dst_drop(skb);
3024 		}
3025 		rcu_read_unlock_bh();
3026 	}
3027 
3028 done:
3029 	cb->args[0] = h;
3030 	cb->args[1] = idx;
3031 	return skb->len;
3032 }
3033 
3034 void ip_rt_multicast_event(struct in_device *in_dev)
3035 {
3036 	rt_cache_flush(dev_net(in_dev->dev), 0);
3037 }
3038 
3039 #ifdef CONFIG_SYSCTL
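/*
 * Handler for the write-only /proc/sys/net/ipv4/route/flush file (registered
 * per namespace through ipv4_route_flush_table below).  The value written is
 * handed to rt_cache_flush() as the delay, so for example
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * requests an immediate flush of the routing cache.
 */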
3040 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3041 					void __user *buffer,
3042 					size_t *lenp, loff_t *ppos)
3043 {
3044 	if (write) {
3045 		int flush_delay;
3046 		ctl_table ctl;
3047 		struct net *net;
3048 
3049 		memcpy(&ctl, __ctl, sizeof(ctl));
3050 		ctl.data = &flush_delay;
3051 		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3052 
3053 		net = (struct net *)__ctl->extra1;
3054 		rt_cache_flush(net, flush_delay);
3055 		return 0;
3056 	}
3057 
3058 	return -EINVAL;
3059 }
3060 
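/*
 * ip_rt_secret_interval was changed via sysctl: stop every namespace's
 * secret timer and, when the new interval is non-zero, re-arm it with the
 * remaining time adjusted by the difference between the old and new
 * intervals.
 */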
3061 static void rt_secret_reschedule(int old)
3062 {
3063 	struct net *net;
3064 	int new = ip_rt_secret_interval;
3065 	int diff = new - old;
3066 
3067 	if (!diff)
3068 		return;
3069 
3070 	rtnl_lock();
3071 	for_each_net(net) {
3072 		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3073 
3074 		if (!new)
3075 			continue;
3076 
3077 		if (deleted) {
3078 			long time = net->ipv4.rt_secret_timer.expires - jiffies;
3079 
3080 			if (time <= 0 || (time += diff) <= 0)
3081 				time = 0;
3082 
3083 			net->ipv4.rt_secret_timer.expires = time;
3084 		} else
3085 			net->ipv4.rt_secret_timer.expires = new;
3086 
3087 		net->ipv4.rt_secret_timer.expires += jiffies;
3088 		add_timer(&net->ipv4.rt_secret_timer);
3089 	}
3090 	rtnl_unlock();
3091 }
3092 
3093 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3094 					  void __user *buffer, size_t *lenp,
3095 					  loff_t *ppos)
3096 {
3097 	int old = ip_rt_secret_interval;
3098 	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
3099 
3100 	rt_secret_reschedule(old);
3101 
3102 	return ret;
3103 }
3104 
3105 static ctl_table ipv4_route_table[] = {
3106 	{
3107 		.procname	= "gc_thresh",
3108 		.data		= &ipv4_dst_ops.gc_thresh,
3109 		.maxlen		= sizeof(int),
3110 		.mode		= 0644,
3111 		.proc_handler	= proc_dointvec,
3112 	},
3113 	{
3114 		.procname	= "max_size",
3115 		.data		= &ip_rt_max_size,
3116 		.maxlen		= sizeof(int),
3117 		.mode		= 0644,
3118 		.proc_handler	= proc_dointvec,
3119 	},
3120 	{
3121 		/*  Deprecated. Use gc_min_interval_ms */
3122 
3123 		.procname	= "gc_min_interval",
3124 		.data		= &ip_rt_gc_min_interval,
3125 		.maxlen		= sizeof(int),
3126 		.mode		= 0644,
3127 		.proc_handler	= proc_dointvec_jiffies,
3128 	},
3129 	{
3130 		.procname	= "gc_min_interval_ms",
3131 		.data		= &ip_rt_gc_min_interval,
3132 		.maxlen		= sizeof(int),
3133 		.mode		= 0644,
3134 		.proc_handler	= proc_dointvec_ms_jiffies,
3135 	},
3136 	{
3137 		.procname	= "gc_timeout",
3138 		.data		= &ip_rt_gc_timeout,
3139 		.maxlen		= sizeof(int),
3140 		.mode		= 0644,
3141 		.proc_handler	= proc_dointvec_jiffies,
3142 	},
3143 	{
3144 		.procname	= "gc_interval",
3145 		.data		= &ip_rt_gc_interval,
3146 		.maxlen		= sizeof(int),
3147 		.mode		= 0644,
3148 		.proc_handler	= proc_dointvec_jiffies,
3149 	},
3150 	{
3151 		.procname	= "redirect_load",
3152 		.data		= &ip_rt_redirect_load,
3153 		.maxlen		= sizeof(int),
3154 		.mode		= 0644,
3155 		.proc_handler	= proc_dointvec,
3156 	},
3157 	{
3158 		.procname	= "redirect_number",
3159 		.data		= &ip_rt_redirect_number,
3160 		.maxlen		= sizeof(int),
3161 		.mode		= 0644,
3162 		.proc_handler	= proc_dointvec,
3163 	},
3164 	{
3165 		.procname	= "redirect_silence",
3166 		.data		= &ip_rt_redirect_silence,
3167 		.maxlen		= sizeof(int),
3168 		.mode		= 0644,
3169 		.proc_handler	= proc_dointvec,
3170 	},
3171 	{
3172 		.procname	= "error_cost",
3173 		.data		= &ip_rt_error_cost,
3174 		.maxlen		= sizeof(int),
3175 		.mode		= 0644,
3176 		.proc_handler	= proc_dointvec,
3177 	},
3178 	{
3179 		.procname	= "error_burst",
3180 		.data		= &ip_rt_error_burst,
3181 		.maxlen		= sizeof(int),
3182 		.mode		= 0644,
3183 		.proc_handler	= proc_dointvec,
3184 	},
3185 	{
3186 		.procname	= "gc_elasticity",
3187 		.data		= &ip_rt_gc_elasticity,
3188 		.maxlen		= sizeof(int),
3189 		.mode		= 0644,
3190 		.proc_handler	= proc_dointvec,
3191 	},
3192 	{
3193 		.procname	= "mtu_expires",
3194 		.data		= &ip_rt_mtu_expires,
3195 		.maxlen		= sizeof(int),
3196 		.mode		= 0644,
3197 		.proc_handler	= proc_dointvec_jiffies,
3198 	},
3199 	{
3200 		.procname	= "min_pmtu",
3201 		.data		= &ip_rt_min_pmtu,
3202 		.maxlen		= sizeof(int),
3203 		.mode		= 0644,
3204 		.proc_handler	= proc_dointvec,
3205 	},
3206 	{
3207 		.procname	= "min_adv_mss",
3208 		.data		= &ip_rt_min_advmss,
3209 		.maxlen		= sizeof(int),
3210 		.mode		= 0644,
3211 		.proc_handler	= proc_dointvec,
3212 	},
3213 	{
3214 		.procname	= "secret_interval",
3215 		.data		= &ip_rt_secret_interval,
3216 		.maxlen		= sizeof(int),
3217 		.mode		= 0644,
3218 		.proc_handler	= ipv4_sysctl_rt_secret_interval,
3219 	},
3220 	{ }
3221 };
3222 
3223 static struct ctl_table empty[1];
3224 
3225 static struct ctl_table ipv4_skeleton[] =
3226 {
3227 	{ .procname = "route",
3228 	  .mode = 0555, .child = ipv4_route_table},
3229 	{ .procname = "neigh",
3230 	  .mode = 0555, .child = empty},
3231 	{ }
3232 };
3233 
3234 static __net_initdata struct ctl_path ipv4_path[] = {
3235 	{ .procname = "net", },
3236 	{ .procname = "ipv4", },
3237 	{ },
3238 };
3239 
3240 static struct ctl_table ipv4_route_flush_table[] = {
3241 	{
3242 		.procname	= "flush",
3243 		.maxlen		= sizeof(int),
3244 		.mode		= 0200,
3245 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3246 	},
3247 	{ },
3248 };
3249 
3250 static __net_initdata struct ctl_path ipv4_route_path[] = {
3251 	{ .procname = "net", },
3252 	{ .procname = "ipv4", },
3253 	{ .procname = "route", },
3254 	{ },
3255 };
3256 
3257 static __net_init int sysctl_route_net_init(struct net *net)
3258 {
3259 	struct ctl_table *tbl;
3260 
3261 	tbl = ipv4_route_flush_table;
3262 	if (!net_eq(net, &init_net)) {
3263 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3264 		if (tbl == NULL)
3265 			goto err_dup;
3266 	}
3267 	tbl[0].extra1 = net;
3268 
3269 	net->ipv4.route_hdr =
3270 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3271 	if (net->ipv4.route_hdr == NULL)
3272 		goto err_reg;
3273 	return 0;
3274 
3275 err_reg:
3276 	if (tbl != ipv4_route_flush_table)
3277 		kfree(tbl);
3278 err_dup:
3279 	return -ENOMEM;
3280 }
3281 
3282 static __net_exit void sysctl_route_net_exit(struct net *net)
3283 {
3284 	struct ctl_table *tbl;
3285 
3286 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3287 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3288 	BUG_ON(tbl == ipv4_route_flush_table);
3289 	kfree(tbl);
3290 }
3291 
3292 static __net_initdata struct pernet_operations sysctl_route_ops = {
3293 	.init = sysctl_route_net_init,
3294 	.exit = sysctl_route_net_exit,
3295 };
3296 #endif
3297 
3298 
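/*
 * Per-namespace init: seed the route cache generation id and arm
 * rt_secret_timer.  The first expiry is spread randomly across one extra
 * ip_rt_secret_interval so the timers of different namespaces do not all
 * fire at once.
 */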
3299 static __net_init int rt_secret_timer_init(struct net *net)
3300 {
3301 	atomic_set(&net->ipv4.rt_genid,
3302 			(int) ((num_physpages ^ (num_physpages>>8)) ^
3303 			(jiffies ^ (jiffies >> 7))));
3304 
3305 	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3306 	net->ipv4.rt_secret_timer.data = (unsigned long)net;
3307 	init_timer_deferrable(&net->ipv4.rt_secret_timer);
3308 
3309 	if (ip_rt_secret_interval) {
3310 		net->ipv4.rt_secret_timer.expires =
3311 			jiffies + net_random() % ip_rt_secret_interval +
3312 			ip_rt_secret_interval;
3313 		add_timer(&net->ipv4.rt_secret_timer);
3314 	}
3315 	return 0;
3316 }
3317 
3318 static __net_exit void rt_secret_timer_exit(struct net *net)
3319 {
3320 	del_timer_sync(&net->ipv4.rt_secret_timer);
3321 }
3322 
3323 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3324 	.init = rt_secret_timer_init,
3325 	.exit = rt_secret_timer_exit,
3326 };
3327 
3328 
3329 #ifdef CONFIG_NET_CLS_ROUTE
3330 struct ip_rt_acct *ip_rt_acct __read_mostly;
3331 #endif /* CONFIG_NET_CLS_ROUTE */
3332 
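/*
 * "rhash_entries=N" on the kernel command line overrides the route cache
 * hash table size that alloc_large_system_hash() would otherwise choose in
 * ip_rt_init() below.
 */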
3333 static __initdata unsigned long rhash_entries;
3334 static int __init set_rhash_entries(char *str)
3335 {
3336 	if (!str)
3337 		return 0;
3338 	rhash_entries = simple_strtoul(str, &str, 0);
3339 	return 1;
3340 }
3341 __setup("rhash_entries=", set_rhash_entries);
3342 
3343 int __init ip_rt_init(void)
3344 {
3345 	int rc = 0;
3346 
3347 #ifdef CONFIG_NET_CLS_ROUTE
3348 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3349 	if (!ip_rt_acct)
3350 		panic("IP: failed to allocate ip_rt_acct\n");
3351 #endif
3352 
3353 	ipv4_dst_ops.kmem_cachep =
3354 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3355 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3356 
3357 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3358 
3359 	rt_hash_table = (struct rt_hash_bucket *)
3360 		alloc_large_system_hash("IP route cache",
3361 					sizeof(struct rt_hash_bucket),
3362 					rhash_entries,
3363 					(totalram_pages >= 128 * 1024) ?
3364 					15 : 17,
3365 					0,
3366 					&rt_hash_log,
3367 					&rt_hash_mask,
3368 					rhash_entries ? 0 : 512 * 1024);
3369 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3370 	rt_hash_lock_init();
3371 
3372 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3373 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3374 
3375 	devinet_init();
3376 	ip_fib_init();
3377 
3378 	/* All the timers started at system startup tend
3379 	   to synchronize. Perturb them a bit.
3380 	 */
3381 	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3382 	expires_ljiffies = jiffies;
3383 	schedule_delayed_work(&expires_work,
3384 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3385 
3386 	if (register_pernet_subsys(&rt_secret_timer_ops))
3387 		printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3388 
3389 	if (ip_rt_proc_init())
3390 		printk(KERN_ERR "Unable to create route proc files\n");
3391 #ifdef CONFIG_XFRM
3392 	xfrm_init();
3393 	xfrm4_init(ip_rt_max_size);
3394 #endif
3395 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3396 
3397 #ifdef CONFIG_SYSCTL
3398 	register_pernet_subsys(&sysctl_route_ops);
3399 #endif
3400 	return rc;
3401 }
3402 
3403 #ifdef CONFIG_SYSCTL
3404 /*
3405  * We really need to sanitize the damn ipv4 init order, then all
3406  * this nonsense will go away.
3407  */
3408 void __init ip_static_sysctl_init(void)
3409 {
3410 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3411 }
3412 #endif
3413 
3414 EXPORT_SYMBOL(__ip_select_ident);
3415 EXPORT_SYMBOL(ip_route_input);
3416 EXPORT_SYMBOL(ip_route_output_key);
3417