xref: /linux/net/ipv4/route.c (revision 9a58a80a701bdb2d220cdab4914218df5b48d781)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 
/*
 * Extract the routing-relevant TOS bits of a flow key: the fl4_tos field
 * masked with IPTOS_RT_MASK | RTO_ONLINK, as a u32.
 * The argument is parenthesised so that any pointer expression (not only
 * a bare identifier) can be passed safely.
 */
#define RT_FL_TOS(oldflp) \
    ((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113 
114 #define IP_MAX_MTU	0xFFF0
115 
116 #define RT_GC_TIMEOUT (300*HZ)
117 
118 static int ip_rt_max_size;
119 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
120 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
121 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
122 static int ip_rt_redirect_number __read_mostly	= 9;
123 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly	= HZ;
126 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
127 static int ip_rt_gc_elasticity __read_mostly	= 8;
128 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly	= 256;
131 static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
132 static int rt_chain_length_max __read_mostly	= 20;
133 
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136 
137 /*
138  *	Interface to generic destination cache.
139  */
140 
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void		 ipv4_dst_destroy(struct dst_entry *dst);
143 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
144 					 struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void		 ipv4_link_failure(struct sk_buff *skb);
147 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149 static void rt_emergency_hash_rebuild(struct net *net);
150 
151 
152 static struct dst_ops ipv4_dst_ops = {
153 	.family =		AF_INET,
154 	.protocol =		cpu_to_be16(ETH_P_IP),
155 	.gc =			rt_garbage_collect,
156 	.check =		ipv4_dst_check,
157 	.destroy =		ipv4_dst_destroy,
158 	.ifdown =		ipv4_dst_ifdown,
159 	.negative_advice =	ipv4_negative_advice,
160 	.link_failure =		ipv4_link_failure,
161 	.update_pmtu =		ip_rt_update_pmtu,
162 	.local_out =		__ip_local_out,
163 	.entries =		ATOMIC_INIT(0),
164 };
165 
166 #define ECN_OR_COST(class)	TC_PRIO_##class
167 
168 const __u8 ip_tos2prio[16] = {
169 	TC_PRIO_BESTEFFORT,
170 	ECN_OR_COST(FILLER),
171 	TC_PRIO_BESTEFFORT,
172 	ECN_OR_COST(BESTEFFORT),
173 	TC_PRIO_BULK,
174 	ECN_OR_COST(BULK),
175 	TC_PRIO_BULK,
176 	ECN_OR_COST(BULK),
177 	TC_PRIO_INTERACTIVE,
178 	ECN_OR_COST(INTERACTIVE),
179 	TC_PRIO_INTERACTIVE,
180 	ECN_OR_COST(INTERACTIVE),
181 	TC_PRIO_INTERACTIVE_BULK,
182 	ECN_OR_COST(INTERACTIVE_BULK),
183 	TC_PRIO_INTERACTIVE_BULK,
184 	ECN_OR_COST(INTERACTIVE_BULK)
185 };
186 
187 
188 /*
189  * Route cache.
190  */
191 
192 /* The locking scheme is rather straight forward:
193  *
194  * 1) Read-Copy Update protects the buckets of the central route hash.
195  * 2) Only writers remove entries, and they hold the lock
196  *    as they look at rtable reference counts.
197  * 3) Only readers acquire references to rtable entries,
198  *    they do so with atomic increments and with the
199  *    lock held.
200  */
201 
/* One slot of the route hash: head of a list linked via u.dst.rt_next. */
struct rt_hash_bucket {
	struct rtable *chain;
};
205 
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Rather than one spinlock per rt_hash_bucket, a fixed table of spinlocks
 * is shared by all buckets.  Its size is a power of two chosen from the
 * number of CPUs (kept small under lockdep, where spinlock_t is large).
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;

/*
 * Map a hash slot to the spinlock guarding it.  The expansion is fully
 * parenthesised so that it behaves as a single expression wherever used.
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/* Allocate and initialise the lock table; panics if allocation fails. */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(*rt_hash_locks) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock table in this configuration: every slot maps to NULL. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
251 
252 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
253 static unsigned			rt_hash_mask __read_mostly;
254 static unsigned int		rt_hash_log  __read_mostly;
255 
256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257 #define RT_CACHE_STAT_INC(field) \
258 	(__raw_get_cpu_var(rt_cache_stat).field++)
259 
260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 		int genid)
262 {
263 	return jhash_3words((__force u32)(__be32)(daddr),
264 			    (__force u32)(__be32)(saddr),
265 			    idx, genid)
266 		& rt_hash_mask;
267 }
268 
269 static inline int rt_genid(struct net *net)
270 {
271 	return atomic_read(&net->ipv4.rt_genid);
272 }
273 
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276 	struct seq_net_private p;
277 	int bucket;
278 	int genid;
279 };
280 
281 static struct rtable *rt_cache_get_first(struct seq_file *seq)
282 {
283 	struct rt_cache_iter_state *st = seq->private;
284 	struct rtable *r = NULL;
285 
286 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
287 		if (!rt_hash_table[st->bucket].chain)
288 			continue;
289 		rcu_read_lock_bh();
290 		r = rcu_dereference(rt_hash_table[st->bucket].chain);
291 		while (r) {
292 			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
293 			    r->rt_genid == st->genid)
294 				return r;
295 			r = rcu_dereference(r->u.dst.rt_next);
296 		}
297 		rcu_read_unlock_bh();
298 	}
299 	return r;
300 }
301 
302 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
303 					  struct rtable *r)
304 {
305 	struct rt_cache_iter_state *st = seq->private;
306 
307 	r = r->u.dst.rt_next;
308 	while (!r) {
309 		rcu_read_unlock_bh();
310 		do {
311 			if (--st->bucket < 0)
312 				return NULL;
313 		} while (!rt_hash_table[st->bucket].chain);
314 		rcu_read_lock_bh();
315 		r = rt_hash_table[st->bucket].chain;
316 	}
317 	return rcu_dereference(r);
318 }
319 
320 static struct rtable *rt_cache_get_next(struct seq_file *seq,
321 					struct rtable *r)
322 {
323 	struct rt_cache_iter_state *st = seq->private;
324 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
325 		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
326 			continue;
327 		if (r->rt_genid == st->genid)
328 			break;
329 	}
330 	return r;
331 }
332 
333 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
334 {
335 	struct rtable *r = rt_cache_get_first(seq);
336 
337 	if (r)
338 		while (pos && (r = rt_cache_get_next(seq, r)))
339 			--pos;
340 	return pos ? NULL : r;
341 }
342 
343 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
344 {
345 	struct rt_cache_iter_state *st = seq->private;
346 	if (*pos)
347 		return rt_cache_get_idx(seq, *pos - 1);
348 	st->genid = rt_genid(seq_file_net(seq));
349 	return SEQ_START_TOKEN;
350 }
351 
352 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
353 {
354 	struct rtable *r;
355 
356 	if (v == SEQ_START_TOKEN)
357 		r = rt_cache_get_first(seq);
358 	else
359 		r = rt_cache_get_next(seq, v);
360 	++*pos;
361 	return r;
362 }
363 
364 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
365 {
366 	if (v && v != SEQ_START_TOKEN)
367 		rcu_read_unlock_bh();
368 }
369 
370 static int rt_cache_seq_show(struct seq_file *seq, void *v)
371 {
372 	if (v == SEQ_START_TOKEN)
373 		seq_printf(seq, "%-127s\n",
374 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
375 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
376 			   "HHUptod\tSpecDst");
377 	else {
378 		struct rtable *r = v;
379 		int len;
380 
381 		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
382 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383 			r->u.dst.dev ? r->u.dst.dev->name : "*",
384 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
385 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
386 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
387 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389 			dst_metric(&r->u.dst, RTAX_WINDOW),
390 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
392 			r->fl.fl4_tos,
393 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395 				       dev_queue_xmit) : 0,
396 			r->rt_spec_dst, &len);
397 
398 		seq_printf(seq, "%*s\n", 127 - len, "");
399 	}
400 	return 0;
401 }
402 
403 static const struct seq_operations rt_cache_seq_ops = {
404 	.start  = rt_cache_seq_start,
405 	.next   = rt_cache_seq_next,
406 	.stop   = rt_cache_seq_stop,
407 	.show   = rt_cache_seq_show,
408 };
409 
410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
411 {
412 	return seq_open_net(inode, file, &rt_cache_seq_ops,
413 			sizeof(struct rt_cache_iter_state));
414 }
415 
416 static const struct file_operations rt_cache_seq_fops = {
417 	.owner	 = THIS_MODULE,
418 	.open	 = rt_cache_seq_open,
419 	.read	 = seq_read,
420 	.llseek	 = seq_lseek,
421 	.release = seq_release_net,
422 };
423 
424 
425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
426 {
427 	int cpu;
428 
429 	if (*pos == 0)
430 		return SEQ_START_TOKEN;
431 
432 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
433 		if (!cpu_possible(cpu))
434 			continue;
435 		*pos = cpu+1;
436 		return &per_cpu(rt_cache_stat, cpu);
437 	}
438 	return NULL;
439 }
440 
441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
442 {
443 	int cpu;
444 
445 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
446 		if (!cpu_possible(cpu))
447 			continue;
448 		*pos = cpu+1;
449 		return &per_cpu(rt_cache_stat, cpu);
450 	}
451 	return NULL;
452 
453 }
454 
/* seq_file ->stop: intentionally empty, nothing is held while iterating. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
459 
460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
461 {
462 	struct rt_cache_stat *st = v;
463 
464 	if (v == SEQ_START_TOKEN) {
465 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
466 		return 0;
467 	}
468 
469 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
470 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 		   atomic_read(&ipv4_dst_ops.entries),
472 		   st->in_hit,
473 		   st->in_slow_tot,
474 		   st->in_slow_mc,
475 		   st->in_no_route,
476 		   st->in_brd,
477 		   st->in_martian_dst,
478 		   st->in_martian_src,
479 
480 		   st->out_hit,
481 		   st->out_slow_tot,
482 		   st->out_slow_mc,
483 
484 		   st->gc_total,
485 		   st->gc_ignored,
486 		   st->gc_goal_miss,
487 		   st->gc_dst_overflow,
488 		   st->in_hlist_search,
489 		   st->out_hlist_search
490 		);
491 	return 0;
492 }
493 
494 static const struct seq_operations rt_cpu_seq_ops = {
495 	.start  = rt_cpu_seq_start,
496 	.next   = rt_cpu_seq_next,
497 	.stop   = rt_cpu_seq_stop,
498 	.show   = rt_cpu_seq_show,
499 };
500 
501 
502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
503 {
504 	return seq_open(file, &rt_cpu_seq_ops);
505 }
506 
507 static const struct file_operations rt_cpu_seq_fops = {
508 	.owner	 = THIS_MODULE,
509 	.open	 = rt_cpu_seq_open,
510 	.read	 = seq_read,
511 	.llseek	 = seq_lseek,
512 	.release = seq_release,
513 };
514 
515 #ifdef CONFIG_NET_CLS_ROUTE
516 static int rt_acct_proc_show(struct seq_file *m, void *v)
517 {
518 	struct ip_rt_acct *dst, *src;
519 	unsigned int i, j;
520 
521 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
522 	if (!dst)
523 		return -ENOMEM;
524 
525 	for_each_possible_cpu(i) {
526 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
527 		for (j = 0; j < 256; j++) {
528 			dst[j].o_bytes   += src[j].o_bytes;
529 			dst[j].o_packets += src[j].o_packets;
530 			dst[j].i_bytes   += src[j].i_bytes;
531 			dst[j].i_packets += src[j].i_packets;
532 		}
533 	}
534 
535 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
536 	kfree(dst);
537 	return 0;
538 }
539 
540 static int rt_acct_proc_open(struct inode *inode, struct file *file)
541 {
542 	return single_open(file, rt_acct_proc_show, NULL);
543 }
544 
545 static const struct file_operations rt_acct_proc_fops = {
546 	.owner		= THIS_MODULE,
547 	.open		= rt_acct_proc_open,
548 	.read		= seq_read,
549 	.llseek		= seq_lseek,
550 	.release	= single_release,
551 };
552 #endif
553 
554 static int __net_init ip_rt_do_proc_init(struct net *net)
555 {
556 	struct proc_dir_entry *pde;
557 
558 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
559 			&rt_cache_seq_fops);
560 	if (!pde)
561 		goto err1;
562 
563 	pde = proc_create("rt_cache", S_IRUGO,
564 			  net->proc_net_stat, &rt_cpu_seq_fops);
565 	if (!pde)
566 		goto err2;
567 
568 #ifdef CONFIG_NET_CLS_ROUTE
569 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
570 	if (!pde)
571 		goto err3;
572 #endif
573 	return 0;
574 
575 #ifdef CONFIG_NET_CLS_ROUTE
576 err3:
577 	remove_proc_entry("rt_cache", net->proc_net_stat);
578 #endif
579 err2:
580 	remove_proc_entry("rt_cache", net->proc_net);
581 err1:
582 	return -ENOMEM;
583 }
584 
585 static void __net_exit ip_rt_do_proc_exit(struct net *net)
586 {
587 	remove_proc_entry("rt_cache", net->proc_net_stat);
588 	remove_proc_entry("rt_cache", net->proc_net);
589 	remove_proc_entry("rt_acct", net->proc_net);
590 }
591 
592 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
593 	.init = ip_rt_do_proc_init,
594 	.exit = ip_rt_do_proc_exit,
595 };
596 
597 static int __init ip_rt_proc_init(void)
598 {
599 	return register_pernet_subsys(&ip_rt_proc_ops);
600 }
601 
602 #else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
607 #endif /* CONFIG_PROC_FS */
608 
609 static inline void rt_free(struct rtable *rt)
610 {
611 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
612 }
613 
/* Drop the caller's reference on @rt, then queue it for RCU freeing. */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
619 
620 static inline int rt_fast_clean(struct rtable *rth)
621 {
622 	/* Kill broadcast/multicast entries very aggresively, if they
623 	   collide in hash table with more useful entries */
624 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
625 		rth->fl.iif && rth->u.dst.rt_next;
626 }
627 
628 static inline int rt_valuable(struct rtable *rth)
629 {
630 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
631 		rth->u.dst.expires;
632 }
633 
634 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
635 {
636 	unsigned long age;
637 	int ret = 0;
638 
639 	if (atomic_read(&rth->u.dst.__refcnt))
640 		goto out;
641 
642 	ret = 1;
643 	if (rth->u.dst.expires &&
644 	    time_after_eq(jiffies, rth->u.dst.expires))
645 		goto out;
646 
647 	age = jiffies - rth->u.dst.lastuse;
648 	ret = 0;
649 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
650 	    (age <= tmo2 && rt_valuable(rth)))
651 		goto out;
652 	ret = 1;
653 out:	return ret;
654 }
655 
656 /* Bits of score are:
657  * 31: very valuable
658  * 30: not quite useless
659  * 29..0: usage counter
660  */
661 static inline u32 rt_score(struct rtable *rt)
662 {
663 	u32 score = jiffies - rt->u.dst.lastuse;
664 
665 	score = ~score & ~(3<<30);
666 
667 	if (rt_valuable(rt))
668 		score |= (1<<31);
669 
670 	if (!rt->fl.iif ||
671 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
672 		score |= (1<<30);
673 
674 	return score;
675 }
676 
677 static inline bool rt_caching(const struct net *net)
678 {
679 	return net->ipv4.current_rt_cache_rebuild_count <=
680 		net->ipv4.sysctl_rt_cache_rebuild_count;
681 }
682 
683 static inline bool compare_hash_inputs(const struct flowi *fl1,
684 					const struct flowi *fl2)
685 {
686 	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
687 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
688 		(fl1->iif ^ fl2->iif)) == 0);
689 }
690 
691 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
692 {
693 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
694 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
695 		(fl1->mark ^ fl2->mark) |
696 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
697 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
698 		(fl1->oif ^ fl2->oif) |
699 		(fl1->iif ^ fl2->iif)) == 0;
700 }
701 
702 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
703 {
704 	return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
705 }
706 
707 static inline int rt_is_expired(struct rtable *rth)
708 {
709 	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
710 }
711 
712 /*
713  * Perform a full scan of hash table and free all entries.
714  * Can be called by a softirq or a process.
715  * In the later case, we want to be reschedule if necessary
716  */
717 static void rt_do_flush(int process_context)
718 {
719 	unsigned int i;
720 	struct rtable *rth, *next;
721 	struct rtable * tail;
722 
723 	for (i = 0; i <= rt_hash_mask; i++) {
724 		if (process_context && need_resched())
725 			cond_resched();
726 		rth = rt_hash_table[i].chain;
727 		if (!rth)
728 			continue;
729 
730 		spin_lock_bh(rt_hash_lock_addr(i));
731 #ifdef CONFIG_NET_NS
732 		{
733 		struct rtable ** prev, * p;
734 
735 		rth = rt_hash_table[i].chain;
736 
737 		/* defer releasing the head of the list after spin_unlock */
738 		for (tail = rth; tail; tail = tail->u.dst.rt_next)
739 			if (!rt_is_expired(tail))
740 				break;
741 		if (rth != tail)
742 			rt_hash_table[i].chain = tail;
743 
744 		/* call rt_free on entries after the tail requiring flush */
745 		prev = &rt_hash_table[i].chain;
746 		for (p = *prev; p; p = next) {
747 			next = p->u.dst.rt_next;
748 			if (!rt_is_expired(p)) {
749 				prev = &p->u.dst.rt_next;
750 			} else {
751 				*prev = next;
752 				rt_free(p);
753 			}
754 		}
755 		}
756 #else
757 		rth = rt_hash_table[i].chain;
758 		rt_hash_table[i].chain = NULL;
759 		tail = NULL;
760 #endif
761 		spin_unlock_bh(rt_hash_lock_addr(i));
762 
763 		for (; rth != tail; rth = next) {
764 			next = rth->u.dst.rt_next;
765 			rt_free(rth);
766 		}
767 	}
768 }
769 
770 /*
771  * While freeing expired entries, we compute average chain length
772  * and standard deviation, using fixed-point arithmetic.
773  * This to have an estimation of rt_chain_length_max
774  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
775  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
776  */
777 
778 #define FRACT_BITS 3
779 #define ONE (1UL << FRACT_BITS)
780 
781 static void rt_check_expire(void)
782 {
783 	static unsigned int rover;
784 	unsigned int i = rover, goal;
785 	struct rtable *rth, *aux, **rthp;
786 	unsigned long samples = 0;
787 	unsigned long sum = 0, sum2 = 0;
788 	unsigned long delta;
789 	u64 mult;
790 
791 	delta = jiffies - expires_ljiffies;
792 	expires_ljiffies = jiffies;
793 	mult = ((u64)delta) << rt_hash_log;
794 	if (ip_rt_gc_timeout > 1)
795 		do_div(mult, ip_rt_gc_timeout);
796 	goal = (unsigned int)mult;
797 	if (goal > rt_hash_mask)
798 		goal = rt_hash_mask + 1;
799 	for (; goal > 0; goal--) {
800 		unsigned long tmo = ip_rt_gc_timeout;
801 		unsigned long length;
802 
803 		i = (i + 1) & rt_hash_mask;
804 		rthp = &rt_hash_table[i].chain;
805 
806 		if (need_resched())
807 			cond_resched();
808 
809 		samples++;
810 
811 		if (*rthp == NULL)
812 			continue;
813 		length = 0;
814 		spin_lock_bh(rt_hash_lock_addr(i));
815 		while ((rth = *rthp) != NULL) {
816 			prefetch(rth->u.dst.rt_next);
817 			if (rt_is_expired(rth)) {
818 				*rthp = rth->u.dst.rt_next;
819 				rt_free(rth);
820 				continue;
821 			}
822 			if (rth->u.dst.expires) {
823 				/* Entry is expired even if it is in use */
824 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
825 nofree:
826 					tmo >>= 1;
827 					rthp = &rth->u.dst.rt_next;
828 					/*
829 					 * We only count entries on
830 					 * a chain with equal hash inputs once
831 					 * so that entries for different QOS
832 					 * levels, and other non-hash input
833 					 * attributes don't unfairly skew
834 					 * the length computation
835 					 */
836 					for (aux = rt_hash_table[i].chain;;) {
837 						if (aux == rth) {
838 							length += ONE;
839 							break;
840 						}
841 						if (compare_hash_inputs(&aux->fl, &rth->fl))
842 							break;
843 						aux = aux->u.dst.rt_next;
844 					}
845 					continue;
846 				}
847 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
848 				goto nofree;
849 
850 			/* Cleanup aged off entries. */
851 			*rthp = rth->u.dst.rt_next;
852 			rt_free(rth);
853 		}
854 		spin_unlock_bh(rt_hash_lock_addr(i));
855 		sum += length;
856 		sum2 += length*length;
857 	}
858 	if (samples) {
859 		unsigned long avg = sum / samples;
860 		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
861 		rt_chain_length_max = max_t(unsigned long,
862 					ip_rt_gc_elasticity,
863 					(avg + 4*sd) >> FRACT_BITS);
864 	}
865 	rover = i;
866 }
867 
868 /*
869  * rt_worker_func() is run in process context.
870  * we call rt_check_expire() to scan part of the hash table
871  */
872 static void rt_worker_func(struct work_struct *work)
873 {
874 	rt_check_expire();
875 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
876 }
877 
878 /*
879  * Pertubation of rt_genid by a small quantity [1..256]
880  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
881  * many times (2^24) without giving recent rt_genid.
882  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
883  */
884 static void rt_cache_invalidate(struct net *net)
885 {
886 	unsigned char shuffle;
887 
888 	get_random_bytes(&shuffle, sizeof(shuffle));
889 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
890 }
891 
/*
 * delay < 0  : only invalidate the cache (fast; entries are deleted later)
 * delay >= 0 : invalidate and flush the cache now (can take long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);

	if (delay < 0)
		return;

	rt_do_flush(!in_softirq());
}
902 
/* Flush entries left behind by earlier cache invalidations. */
void rt_cache_flush_batch(void)
{
	int process_context = !in_softirq();

	rt_do_flush(process_context);
}
908 
909 /*
910  * We change rt_genid and let gc do the cleanup
911  */
912 static void rt_secret_rebuild(unsigned long __net)
913 {
914 	struct net *net = (struct net *)__net;
915 	rt_cache_invalidate(net);
916 	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
917 }
918 
919 static void rt_secret_rebuild_oneshot(struct net *net)
920 {
921 	del_timer_sync(&net->ipv4.rt_secret_timer);
922 	rt_cache_invalidate(net);
923 	if (ip_rt_secret_interval) {
924 		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
925 		add_timer(&net->ipv4.rt_secret_timer);
926 	}
927 }
928 
929 static void rt_emergency_hash_rebuild(struct net *net)
930 {
931 	if (net_ratelimit()) {
932 		printk(KERN_WARNING "Route hash chain too long!\n");
933 		printk(KERN_WARNING "Adjust your secret_interval!\n");
934 	}
935 
936 	rt_secret_rebuild_oneshot(net);
937 }
938 
939 /*
940    Short description of GC goals.
941 
942    We want to build algorithm, which will keep routing cache
943    at some equilibrium point, when number of aged off entries
944    is kept approximately equal to newly generated ones.
945 
946    Current expiration strength is variable "expire".
947    We try to adjust it dynamically, so that if networking
948    is idle expires is large enough to keep enough of warm entries,
949    and when load increases it reduces to limit cache size.
950  */
951 
952 static int rt_garbage_collect(struct dst_ops *ops)
953 {
954 	static unsigned long expire = RT_GC_TIMEOUT;
955 	static unsigned long last_gc;
956 	static int rover;
957 	static int equilibrium;
958 	struct rtable *rth, **rthp;
959 	unsigned long now = jiffies;
960 	int goal;
961 
962 	/*
963 	 * Garbage collection is pretty expensive,
964 	 * do not make it too frequently.
965 	 */
966 
967 	RT_CACHE_STAT_INC(gc_total);
968 
969 	if (now - last_gc < ip_rt_gc_min_interval &&
970 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
971 		RT_CACHE_STAT_INC(gc_ignored);
972 		goto out;
973 	}
974 
975 	/* Calculate number of entries, which we want to expire now. */
976 	goal = atomic_read(&ipv4_dst_ops.entries) -
977 		(ip_rt_gc_elasticity << rt_hash_log);
978 	if (goal <= 0) {
979 		if (equilibrium < ipv4_dst_ops.gc_thresh)
980 			equilibrium = ipv4_dst_ops.gc_thresh;
981 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
982 		if (goal > 0) {
983 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
984 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
985 		}
986 	} else {
987 		/* We are in dangerous area. Try to reduce cache really
988 		 * aggressively.
989 		 */
990 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
991 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
992 	}
993 
994 	if (now - last_gc >= ip_rt_gc_min_interval)
995 		last_gc = now;
996 
997 	if (goal <= 0) {
998 		equilibrium += goal;
999 		goto work_done;
1000 	}
1001 
1002 	do {
1003 		int i, k;
1004 
1005 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1006 			unsigned long tmo = expire;
1007 
1008 			k = (k + 1) & rt_hash_mask;
1009 			rthp = &rt_hash_table[k].chain;
1010 			spin_lock_bh(rt_hash_lock_addr(k));
1011 			while ((rth = *rthp) != NULL) {
1012 				if (!rt_is_expired(rth) &&
1013 					!rt_may_expire(rth, tmo, expire)) {
1014 					tmo >>= 1;
1015 					rthp = &rth->u.dst.rt_next;
1016 					continue;
1017 				}
1018 				*rthp = rth->u.dst.rt_next;
1019 				rt_free(rth);
1020 				goal--;
1021 			}
1022 			spin_unlock_bh(rt_hash_lock_addr(k));
1023 			if (goal <= 0)
1024 				break;
1025 		}
1026 		rover = k;
1027 
1028 		if (goal <= 0)
1029 			goto work_done;
1030 
1031 		/* Goal is not achieved. We stop process if:
1032 
1033 		   - if expire reduced to zero. Otherwise, expire is halfed.
1034 		   - if table is not full.
1035 		   - if we are called from interrupt.
1036 		   - jiffies check is just fallback/debug loop breaker.
1037 		     We will not spin here for long time in any case.
1038 		 */
1039 
1040 		RT_CACHE_STAT_INC(gc_goal_miss);
1041 
1042 		if (expire == 0)
1043 			break;
1044 
1045 		expire >>= 1;
1046 #if RT_CACHE_DEBUG >= 2
1047 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1048 				atomic_read(&ipv4_dst_ops.entries), goal, i);
1049 #endif
1050 
1051 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1052 			goto out;
1053 	} while (!in_softirq() && time_before_eq(jiffies, now));
1054 
1055 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1056 		goto out;
1057 	if (net_ratelimit())
1058 		printk(KERN_WARNING "dst cache overflow\n");
1059 	RT_CACHE_STAT_INC(gc_dst_overflow);
1060 	return 1;
1061 
1062 work_done:
1063 	expire += ip_rt_gc_min_interval;
1064 	if (expire > ip_rt_gc_timeout ||
1065 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1066 		expire = ip_rt_gc_timeout;
1067 #if RT_CACHE_DEBUG >= 2
1068 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1069 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
1070 #endif
1071 out:	return 0;
1072 }
1073 
/*
 * Insert route @rt into bucket @hash of the route cache, or reuse an
 * entry with identical keys that is already chained there.
 *
 * @hash: index into rt_hash_table[] selecting the chain to search/insert.
 * @rt:   newly built route handed in by the caller.
 * @rp:   if non-NULL, receives the pointer to the resulting route.
 * @skb:  used only when @rp is NULL: the resulting route is attached to
 *        it with skb_dst_set().
 *
 * Returns 0 on success.  On failure @rt is released with rt_drop() and a
 * non-zero error is returned: the arp_bind_neighbour() error, or -ENOBUFS
 * once the single garbage-collection retry has been used up.
 *
 * When an equivalent entry is found, @rt is dropped and the existing entry
 * (moved to the head of the chain) is handed back instead.
 */
static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	/* One GC retry is permitted, and only outside softirq context. */
	int attempts = !in_softirq();

restart:
	/* Per-pass state; reset because the GC retry path jumps back here. */
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->u.dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller hold the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note also the rt_free uses call_rcu.  We don't actually
		 * need rcu protection here, this is just our path to get
		 * on the route gc list.
		 */

		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
			int err = arp_bind_neighbour(&rt->u.dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				rt_drop(rt);
				return err;
			}
		}

		rt_free(rt);
		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	/*
	 * Walk the chain under the bucket lock: unlink expired entries, look
	 * for an entry with the same keys, and remember the lowest-scoring
	 * unreferenced entry as an eviction candidate.
	 */
	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* The new route is redundant; hand back the cached one. */
			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->u.dst);
			return 0;
		}

		/* Only entries with a zero refcount may become the candidate. */
		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			/* "<=": on equal scores the entry later in the chain wins. */
			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	} else {
		/*
		 * Nothing can be evicted.  If the chain is longer than
		 * rt_chain_length_max, count a rebuild for this namespace and
		 * request an emergency rebuild.
		 * NOTE(review): this runs with the bucket lock still held -
		 * confirm rt_emergency_hash_rebuild() is safe in that context.
		 */
		if (chain_length > rt_chain_length_max) {
			struct net *net = dev_net(rt->u.dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(dev_net(rt->u.dst.dev))) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->u.dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			/* Every exit below happens without the bucket lock. */
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				/*
				 * Override the GC tunables for one pass, then
				 * restore them and redo the whole insertion.
				 * NOTE(review): these are not locals, so the
				 * temporary values appear to be visible to
				 * other contexts while the GC runs - confirm.
				 */
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	/* Link the new entry in front of the current chain head. */
	rt->u.dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
		       hash, &rt->rt_dst);
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	/* Hand the route to the caller through whichever channel it chose. */
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->u.dst);
	return 0;
}
1260 
1261 void rt_bind_peer(struct rtable *rt, int create)
1262 {
1263 	static DEFINE_SPINLOCK(rt_peer_lock);
1264 	struct inet_peer *peer;
1265 
1266 	peer = inet_getpeer(rt->rt_dst, create);
1267 
1268 	spin_lock_bh(&rt_peer_lock);
1269 	if (rt->peer == NULL) {
1270 		rt->peer = peer;
1271 		peer = NULL;
1272 	}
1273 	spin_unlock_bh(&rt_peer_lock);
1274 	if (peer)
1275 		inet_putpeer(peer);
1276 }
1277 
1278 /*
1279  * Peer allocation may fail only in serious out-of-memory conditions.  However
1280  * we still can generate some output.
1281  * Random ID selection looks a bit dangerous because we have no chances to
1282  * select ID being unique in a reasonable period of time.
1283  * But broken packet identifier may be better than no packet at all.
1284  */
1285 static void ip_select_fb_ident(struct iphdr *iph)
1286 {
1287 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1288 	static u32 ip_fallback_id;
1289 	u32 salt;
1290 
1291 	spin_lock_bh(&ip_fb_id_lock);
1292 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1293 	iph->id = htons(salt & 0xFFFF);
1294 	ip_fallback_id = salt;
1295 	spin_unlock_bh(&ip_fb_id_lock);
1296 }
1297 
1298 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1299 {
1300 	struct rtable *rt = (struct rtable *) dst;
1301 
1302 	if (rt) {
1303 		if (rt->peer == NULL)
1304 			rt_bind_peer(rt, 1);
1305 
1306 		/* If peer is attached to destination, it is never detached,
1307 		   so that we need not to grab a lock to dereference it.
1308 		 */
1309 		if (rt->peer) {
1310 			iph->id = htons(inet_getid(rt->peer, more));
1311 			return;
1312 		}
1313 	} else
1314 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1315 		       __builtin_return_address(0));
1316 
1317 	ip_select_fb_ident(iph);
1318 }
1319 
1320 static void rt_del(unsigned hash, struct rtable *rt)
1321 {
1322 	struct rtable **rthp, *aux;
1323 
1324 	rthp = &rt_hash_table[hash].chain;
1325 	spin_lock_bh(rt_hash_lock_addr(hash));
1326 	ip_rt_put(rt);
1327 	while ((aux = *rthp) != NULL) {
1328 		if (aux == rt || rt_is_expired(aux)) {
1329 			*rthp = aux->u.dst.rt_next;
1330 			rt_free(aux);
1331 			continue;
1332 		}
1333 		rthp = &aux->u.dst.rt_next;
1334 	}
1335 	spin_unlock_bh(rt_hash_lock_addr(hash));
1336 }
1337 
/*
 * Handle a redirect: for cached output routes to @daddr that currently go
 * through @old_gw on @dev, install a copy that uses @new_gw instead and
 * remove the old entry.
 *
 * @old_gw: gateway a matching cache entry must currently use.
 * @daddr:  destination the redirect is about.
 * @new_gw: gateway advised by the redirect.
 * @saddr:  source address a matching entry must carry in rt_src.
 * @dev:    device a matching entry must be bound to; also supplies the
 *          in_device and the network namespace.
 *
 * The redirect is rejected (and logged under CONFIG_IP_ROUTE_VERBOSE when
 * martian logging is enabled) if the new gateway equals the old one, is a
 * multicast/limited-broadcast/zeronet address, redirects are not accepted on
 * the device, route caching is off, or the on-link / address-type checks
 * below fail.  Nothing happens at all if @dev has no in_device.
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	/* Key variants probed below: {saddr, 0} x {dev->ifindex, 0}. */
	__be32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!rt_caching(net))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				/* Skip entries whose lookup keys do not match. */
				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    rt_is_expired(rth) ||
				    !net_eq(dev_net(rth->u.dst.dev), net)) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				/*
				 * Keys match but the entry is not the route
				 * being redirected: stop scanning this chain.
				 */
				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				/* Pin rth before leaving the RCU read side. */
				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				/*
				 * The struct copy duplicated pointers and
				 * counters; reset the per-entry state and take
				 * the references the new entry needs itself.
				 */
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
#ifdef CONFIG_XFRM
				rt->u.dst.xfrm		= NULL;
#endif
				rt->rt_genid		= rt_genid(net);
				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				/*
				 * Binding failed, or the neighbour is not in a
				 * NUD_VALID state: kick resolution if a
				 * neighbour exists and discard the copy.
				 */
				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				/*
				 * rt_del() also releases the reference taken
				 * with dst_hold() above.
				 */
				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt, NULL))
					ip_rt_put(rt);
				goto do_next;
			}
			/* Reached only when the RCU read lock is still held. */
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	in_dev_put(in_dev);
}
1476 
1477 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1478 {
1479 	struct rtable *rt = (struct rtable *)dst;
1480 	struct dst_entry *ret = dst;
1481 
1482 	if (rt) {
1483 		if (dst->obsolete) {
1484 			ip_rt_put(rt);
1485 			ret = NULL;
1486 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1487 			   rt->u.dst.expires) {
1488 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1489 						rt->fl.oif,
1490 						rt_genid(dev_net(dst->dev)));
1491 #if RT_CACHE_DEBUG >= 1
1492 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1493 				&rt->rt_dst, rt->fl.fl4_tos);
1494 #endif
1495 			rt_del(hash, rt);
1496 			ret = NULL;
1497 		}
1498 	}
1499 	return ret;
1500 }
1501 
1502 /*
1503  * Algorithm:
1504  *	1. The first ip_rt_redirect_number redirects are sent
1505  *	   with exponential backoff, then we stop sending them at all,
1506  *	   assuming that the host ignores our redirects.
1507  *	2. If we did not see packets requiring redirects
1508  *	   during ip_rt_redirect_silence, we assume that the host
1509  *	   forgot redirected route and start to send redirects again.
1510  *
1511  * This algorithm is much cheaper and more intelligent than dumb load limiting
1512  * in icmp.c.
1513  *
1514  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1515  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1516  */
1517 
1518 void ip_rt_send_redirect(struct sk_buff *skb)
1519 {
1520 	struct rtable *rt = skb_rtable(skb);
1521 	struct in_device *in_dev;
1522 	int log_martians;
1523 
1524 	rcu_read_lock();
1525 	in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1526 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1527 		rcu_read_unlock();
1528 		return;
1529 	}
1530 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1531 	rcu_read_unlock();
1532 
1533 	/* No redirected packets during ip_rt_redirect_silence;
1534 	 * reset the algorithm.
1535 	 */
1536 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1537 		rt->u.dst.rate_tokens = 0;
1538 
1539 	/* Too many ignored redirects; do not send anything
1540 	 * set u.dst.rate_last to the last seen redirected packet.
1541 	 */
1542 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1543 		rt->u.dst.rate_last = jiffies;
1544 		return;
1545 	}
1546 
1547 	/* Check for load limit; set rate_last to the latest sent
1548 	 * redirect.
1549 	 */
1550 	if (rt->u.dst.rate_tokens == 0 ||
1551 	    time_after(jiffies,
1552 		       (rt->u.dst.rate_last +
1553 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1554 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1555 		rt->u.dst.rate_last = jiffies;
1556 		++rt->u.dst.rate_tokens;
1557 #ifdef CONFIG_IP_ROUTE_VERBOSE
1558 		if (log_martians &&
1559 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1560 		    net_ratelimit())
1561 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1562 				&rt->rt_src, rt->rt_iif,
1563 				&rt->rt_dst, &rt->rt_gateway);
1564 #endif
1565 	}
1566 }
1567 
1568 static int ip_error(struct sk_buff *skb)
1569 {
1570 	struct rtable *rt = skb_rtable(skb);
1571 	unsigned long now;
1572 	int code;
1573 
1574 	switch (rt->u.dst.error) {
1575 		case EINVAL:
1576 		default:
1577 			goto out;
1578 		case EHOSTUNREACH:
1579 			code = ICMP_HOST_UNREACH;
1580 			break;
1581 		case ENETUNREACH:
1582 			code = ICMP_NET_UNREACH;
1583 			IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1584 					IPSTATS_MIB_INNOROUTES);
1585 			break;
1586 		case EACCES:
1587 			code = ICMP_PKT_FILTERED;
1588 			break;
1589 	}
1590 
1591 	now = jiffies;
1592 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1593 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1594 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1595 	rt->u.dst.rate_last = now;
1596 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1597 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1598 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1599 	}
1600 
1601 out:	kfree_skb(skb);
1602 	return 0;
1603 }
1604 
/*
 *	MTU plateau table, in descending order.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau value strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *step = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	for (; step < end; step++) {
		if (*step < old_mtu)
			return *step;
	}
	return 68;
}
1622 
/*
 * Apply a "fragmentation needed" report to the matching cached output
 * routes and lower their MTU metric where the report calls for it.
 *
 * @net:     namespace the routes must belong to.
 * @iph:     IP header supplying the lookup keys (daddr/saddr) and, through
 *           tot_len and ihl, the basis for the MTU estimate.  Presumably
 *           the header quoted in the ICMP error - confirm against caller.
 * @new_mtu: reported MTU; a value below 68 or not smaller than tot_len is
 *           not trusted and replaced by guess_mtu().
 * @dev:     device whose ifindex is one of the two oif keys probed.
 *
 * Returns the last estimate that did not exceed a matching route's current
 * MTU, or @new_mtu if there was none.
 */
unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	int i, k;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	/* Key variants probed below: {ifindex, 0} x {saddr, 0}. */
	int  ikeys[2] = { dev->ifindex, 0 };
	__be32  skeys[2] = { iph->saddr, 0, };
	__be32  daddr = iph->daddr;
	unsigned short est_mtu = 0;

	for (k = 0; k < 2; k++) {
		for (i = 0; i < 2; i++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rcu_read_lock();
			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
			     rth = rcu_dereference(rth->u.dst.rt_next)) {
				unsigned short mtu = new_mtu;

				/*
				 * Only unexpired output routes with matching
				 * keys and an unlocked MTU metric qualify.
				 */
				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->rt_dst != daddr ||
				    rth->rt_src != iph->saddr ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
				    !net_eq(dev_net(rth->u.dst.dev), net) ||
				    rt_is_expired(rth))
					continue;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					/*
					 * NOTE(review): old_mtu is modified in
					 * place, so this adjustment can be
					 * applied again for later matching
					 * entries - confirm this is intended.
					 */
					if (mtu == 0 &&
					    old_mtu >= dst_mtu(&rth->u.dst) &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= dst_mtu(&rth->u.dst)) {
					if (mtu < dst_mtu(&rth->u.dst)) {
						dst_confirm(&rth->u.dst);
						/*
						 * Never go below ip_rt_min_pmtu;
						 * when clamping, also lock the
						 * MTU metric.
						 */
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
			rcu_read_unlock();
		}
	}
	/* est_mtu == 0 means no route accepted an estimate. */
	return est_mtu ? : new_mtu;
}
1686 
1687 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1688 {
1689 	if (dst_mtu(dst) > mtu && mtu >= 68 &&
1690 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1691 		if (mtu < ip_rt_min_pmtu) {
1692 			mtu = ip_rt_min_pmtu;
1693 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1694 		}
1695 		dst->metrics[RTAX_MTU-1] = mtu;
1696 		dst_set_expires(dst, ip_rt_mtu_expires);
1697 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1698 	}
1699 }
1700 
/*
 * dst check handler for IPv4 routes: unconditionally returns NULL, ignoring
 * both @dst and @cookie.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
1705 
1706 static void ipv4_dst_destroy(struct dst_entry *dst)
1707 {
1708 	struct rtable *rt = (struct rtable *) dst;
1709 	struct inet_peer *peer = rt->peer;
1710 	struct in_device *idev = rt->idev;
1711 
1712 	if (peer) {
1713 		rt->peer = NULL;
1714 		inet_putpeer(peer);
1715 	}
1716 
1717 	if (idev) {
1718 		rt->idev = NULL;
1719 		in_dev_put(idev);
1720 	}
1721 }
1722 
1723 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1724 			    int how)
1725 {
1726 	struct rtable *rt = (struct rtable *) dst;
1727 	struct in_device *idev = rt->idev;
1728 	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1729 		struct in_device *loopback_idev =
1730 			in_dev_get(dev_net(dev)->loopback_dev);
1731 		if (loopback_idev) {
1732 			rt->idev = loopback_idev;
1733 			in_dev_put(idev);
1734 		}
1735 	}
1736 }
1737 
1738 static void ipv4_link_failure(struct sk_buff *skb)
1739 {
1740 	struct rtable *rt;
1741 
1742 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1743 
1744 	rt = skb_rtable(skb);
1745 	if (rt)
1746 		dst_set_expires(&rt->u.dst, 0);
1747 }
1748 
1749 static int ip_rt_bug(struct sk_buff *skb)
1750 {
1751 	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1752 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1753 		skb->dev ? skb->dev->name : "?");
1754 	kfree_skb(skb);
1755 	return 0;
1756 }
1757 
1758 /*
1759    We do not cache source address of outgoing interface,
1760    because it is used only by IP RR, TS and SRR options,
1761    so that it out of fast path.
1762 
1763    BTW remember: "addr" is allowed to be not aligned
1764    in IP options!
1765  */
1766 
1767 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1768 {
1769 	__be32 src;
1770 	struct fib_result res;
1771 
1772 	if (rt->fl.iif == 0)
1773 		src = rt->rt_src;
1774 	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1775 		src = FIB_RES_PREFSRC(res);
1776 		fib_res_put(&res);
1777 	} else
1778 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1779 					RT_SCOPE_UNIVERSE);
1780 	memcpy(addr, &src, 4);
1781 }
1782 
1783 #ifdef CONFIG_NET_CLS_ROUTE
1784 static void set_class_tag(struct rtable *rt, u32 tag)
1785 {
1786 	if (!(rt->u.dst.tclassid & 0xFFFF))
1787 		rt->u.dst.tclassid |= tag & 0xFFFF;
1788 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1789 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1790 }
1791 #endif
1792 
/*
 * Fill in the next-hop dependent parts of @rt from FIB result @res:
 * gateway, metrics (with defaults and clamps applied), the routing class
 * tag (CONFIG_NET_CLS_ROUTE only) and the route type.
 *
 * @rt:   route being constructed; rt->u.dst.dev must already be set, as
 *        its mtu is read here.
 * @res:  FIB lookup result; res->fi may be NULL.
 * @itag: class tag merged in last via set_class_tag() (CONFIG_NET_CLS_ROUTE
 *        only).
 */
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		/* Take the gateway only from a link-scope next hop. */
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			/*
			 * No MTU configured: use the device MTU, but cap it
			 * at 576 for a gatewayed route whose MTU metric is
			 * locked.
			 */
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	/* Defaults and upper bounds for metrics that are still unset/too big. */
	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
1834 
1835 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1836 				u8 tos, struct net_device *dev, int our)
1837 {
1838 	unsigned hash;
1839 	struct rtable *rth;
1840 	__be32 spec_dst;
1841 	struct in_device *in_dev = in_dev_get(dev);
1842 	u32 itag = 0;
1843 
1844 	/* Primary sanity checks. */
1845 
1846 	if (in_dev == NULL)
1847 		return -EINVAL;
1848 
1849 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1850 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1851 		goto e_inval;
1852 
1853 	if (ipv4_is_zeronet(saddr)) {
1854 		if (!ipv4_is_local_multicast(daddr))
1855 			goto e_inval;
1856 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1857 	} else if (fib_validate_source(saddr, 0, tos, 0,
1858 					dev, &spec_dst, &itag, 0) < 0)
1859 		goto e_inval;
1860 
1861 	rth = dst_alloc(&ipv4_dst_ops);
1862 	if (!rth)
1863 		goto e_nobufs;
1864 
1865 	rth->u.dst.output= ip_rt_bug;
1866 
1867 	atomic_set(&rth->u.dst.__refcnt, 1);
1868 	rth->u.dst.flags= DST_HOST;
1869 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1870 		rth->u.dst.flags |= DST_NOPOLICY;
1871 	rth->fl.fl4_dst	= daddr;
1872 	rth->rt_dst	= daddr;
1873 	rth->fl.fl4_tos	= tos;
1874 	rth->fl.mark    = skb->mark;
1875 	rth->fl.fl4_src	= saddr;
1876 	rth->rt_src	= saddr;
1877 #ifdef CONFIG_NET_CLS_ROUTE
1878 	rth->u.dst.tclassid = itag;
1879 #endif
1880 	rth->rt_iif	=
1881 	rth->fl.iif	= dev->ifindex;
1882 	rth->u.dst.dev	= init_net.loopback_dev;
1883 	dev_hold(rth->u.dst.dev);
1884 	rth->idev	= in_dev_get(rth->u.dst.dev);
1885 	rth->fl.oif	= 0;
1886 	rth->rt_gateway	= daddr;
1887 	rth->rt_spec_dst= spec_dst;
1888 	rth->rt_genid	= rt_genid(dev_net(dev));
1889 	rth->rt_flags	= RTCF_MULTICAST;
1890 	rth->rt_type	= RTN_MULTICAST;
1891 	if (our) {
1892 		rth->u.dst.input= ip_local_deliver;
1893 		rth->rt_flags |= RTCF_LOCAL;
1894 	}
1895 
1896 #ifdef CONFIG_IP_MROUTE
1897 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1898 		rth->u.dst.input = ip_mr_input;
1899 #endif
1900 	RT_CACHE_STAT_INC(in_slow_mc);
1901 
1902 	in_dev_put(in_dev);
1903 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1904 	return rt_intern_hash(hash, rth, NULL, skb);
1905 
1906 e_nobufs:
1907 	in_dev_put(in_dev);
1908 	return -ENOBUFS;
1909 
1910 e_inval:
1911 	in_dev_put(in_dev);
1912 	return -EINVAL;
1913 }
1914 
1915 
1916 static void ip_handle_martian_source(struct net_device *dev,
1917 				     struct in_device *in_dev,
1918 				     struct sk_buff *skb,
1919 				     __be32 daddr,
1920 				     __be32 saddr)
1921 {
1922 	RT_CACHE_STAT_INC(in_martian_src);
1923 #ifdef CONFIG_IP_ROUTE_VERBOSE
1924 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1925 		/*
1926 		 *	RFC1812 recommendation, if source is martian,
1927 		 *	the only hint is MAC header.
1928 		 */
1929 		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1930 			&daddr, &saddr, dev->name);
1931 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1932 			int i;
1933 			const unsigned char *p = skb_mac_header(skb);
1934 			printk(KERN_WARNING "ll header: ");
1935 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1936 				printk("%02x", *p);
1937 				if (i < (dev->hard_header_len - 1))
1938 					printk(":");
1939 			}
1940 			printk("\n");
1941 		}
1942 	}
1943 #endif
1944 }
1945 
/*
 * Allocate and initialise a forwarding (input) route cache entry from a
 * FIB lookup result.  The entry is not inserted into the cache here.
 *
 * @skb:    packet being routed; supplies protocol and mark.
 * @res:    FIB lookup result selecting output device and next hop.
 * @in_dev: in_device of the receiving interface.
 * @daddr:  destination address.
 * @saddr:  source address; validated with fib_validate_source().
 * @tos:    TOS value stored in the flow key.
 * @result: receives the new route on success.
 *
 * Returns 0 on success, -EINVAL if the output device has no in_device, the
 * source fails validation or a non-IP packet would leave through the
 * interface it arrived on without proxy-ARP PVLAN enabled, and -ENOBUFS if
 * no entry could be allocated.
 */
static int __mkroute_input(struct sk_buff *skb,
			   struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{

	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag, skb->mark);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	/* A positive validation result marks the source with RTCF_DIRECTSRC. */
	if (err)
		flags |= RTCF_DIRECTSRC;

	/*
	 * Packet would leave through the interface it came in on: flag the
	 * route for redirects if the medium is shared or the source is
	 * on-link with respect to the gateway.
	 */
	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	/* Default gateway is the destination; rt_set_nexthop() may replace it. */
	rth->rt_gateway	= daddr;
	rth->rt_iif 	=
		rth->fl.iif	= in_dev->dev->ifindex;
	/* The route holds its own references on the output device and idev. */
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif 	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;
	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}
2046 
2047 static int ip_mkroute_input(struct sk_buff *skb,
2048 			    struct fib_result *res,
2049 			    const struct flowi *fl,
2050 			    struct in_device *in_dev,
2051 			    __be32 daddr, __be32 saddr, u32 tos)
2052 {
2053 	struct rtable* rth = NULL;
2054 	int err;
2055 	unsigned hash;
2056 
2057 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2058 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2059 		fib_select_multipath(fl, res);
2060 #endif
2061 
2062 	/* create a routing cache entry */
2063 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2064 	if (err)
2065 		return err;
2066 
2067 	/* put it into the cache */
2068 	hash = rt_hash(daddr, saddr, fl->iif,
2069 		       rt_genid(dev_net(rth->u.dst.dev)));
2070 	return rt_intern_hash(hash, rth, NULL, skb);
2071 }
2072 
/*
 *	NOTE. We drop all packets that have local source
 *	addresses, because every properly looped-back packet
 *	must already have the correct destination attached by the
 *	output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */
2082 
/*
 * ip_route_input_slow - resolve the input route for a received packet.
 * @skb:   packet being routed (skb->mark and skb->protocol are consulted)
 * @daddr: destination address
 * @saddr: source address
 * @tos:   TOS value used as part of the lookup key
 * @dev:   device the packet arrived on
 *
 * Called from ip_route_input() for non-multicast destinations when the
 * cache lookup there was skipped or found nothing.
 *
 * After rejecting obvious martian addresses it performs a fib_lookup()
 * keyed on the addresses, tos, skb->mark and the incoming ifindex, then:
 *   - RTN_UNICAST with forwarding enabled: delegates to ip_mkroute_input();
 *   - RTN_LOCAL or broadcast: allocates an rtable here, with
 *     ip_local_deliver as input handler, and inserts it with
 *     rt_intern_hash();
 *   - lookup failure with forwarding enabled: inserts an entry whose input
 *     handler is ip_error and whose dst.error is -err;
 *   - lookup failure or non-local result with forwarding disabled:
 *     returns -EHOSTUNREACH;
 *   - martian source / destination: nothing is cached, -EINVAL /
 *     -EHOSTUNREACH is returned.
 *
 * Returns 0 on success or a negative errno (-EINVAL, -EHOSTUNREACH,
 * -ENOBUFS, or whatever ip_mkroute_input()/rt_intern_hash() return).
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
				      } },
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	/* set once fib_lookup() has succeeded, so that 'done' releases res */
	int		free_res = 0;
	struct net    * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	/* limited broadcast, or both addresses zero: handled as broadcast */
	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
	    ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	if ((err = fib_lookup(net, &fl, &res)) != 0) {
		/* no_route relies on err still holding the lookup error */
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     net->loopback_dev->ifindex,
					     dev, &spec_dst, &itag, skb->mark);
		if (result < 0)
			goto martian_source;
		/* a positive result is recorded as RTCF_DIRECTSRC */
		if (result)
			flags |= RTCF_DIRECTSRC;
		/* overrides the spec_dst written by fib_validate_source() */
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
	/*
	 * Common exit: drop the in_dev reference taken at the top and, if
	 * fib_lookup() succeeded, the fib result.
	 */
done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, skb->mark);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

	/*
	 * Build the cache entry in place.  Reached for RTN_LOCAL, for
	 * broadcast (fall-through from above) and, via no_route, for
	 * RTN_UNREACHABLE.
	 */
local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	/* NOTE(review): presumably an input-only entry must never be used
	 * for output, hence ip_rt_bug - confirm against ip_rt_bug(). */
	rth->u.dst.output= ip_rt_bug;
	rth->rt_genid = rt_genid(net);

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	/* the entry is bound to the loopback device, not to @dev */
	rth->u.dst.dev	= net->loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags 	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		/* err is the negative errno prepared at no_route */
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
	err = rt_intern_hash(hash, rth, NULL, skb);
	goto done;

	/* fib_lookup() failed and forwarding is enabled on the device */
no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

	/* martian_destination falls through to here */
e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}
2264 
2265 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2266 		   u8 tos, struct net_device *dev)
2267 {
2268 	struct rtable * rth;
2269 	unsigned	hash;
2270 	int iif = dev->ifindex;
2271 	struct net *net;
2272 
2273 	net = dev_net(dev);
2274 
2275 	if (!rt_caching(net))
2276 		goto skip_cache;
2277 
2278 	tos &= IPTOS_RT_MASK;
2279 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2280 
2281 	rcu_read_lock();
2282 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2283 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
2284 		if (((rth->fl.fl4_dst ^ daddr) |
2285 		     (rth->fl.fl4_src ^ saddr) |
2286 		     (rth->fl.iif ^ iif) |
2287 		     rth->fl.oif |
2288 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
2289 		    rth->fl.mark == skb->mark &&
2290 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2291 		    !rt_is_expired(rth)) {
2292 			dst_use(&rth->u.dst, jiffies);
2293 			RT_CACHE_STAT_INC(in_hit);
2294 			rcu_read_unlock();
2295 			skb_dst_set(skb, &rth->u.dst);
2296 			return 0;
2297 		}
2298 		RT_CACHE_STAT_INC(in_hlist_search);
2299 	}
2300 	rcu_read_unlock();
2301 
2302 skip_cache:
2303 	/* Multicast recognition logic is moved from route cache to here.
2304 	   The problem was that too many Ethernet cards have broken/missing
2305 	   hardware multicast filters :-( As result the host on multicasting
2306 	   network acquires a lot of useless route cache entries, sort of
2307 	   SDR messages from all the world. Now we try to get rid of them.
2308 	   Really, provided software IP multicast filter is organized
2309 	   reasonably (at least, hashed), it does not result in a slowdown
2310 	   comparing with route cache reject entries.
2311 	   Note, that multicast routers are not affected, because
2312 	   route cache entry is created eventually.
2313 	 */
2314 	if (ipv4_is_multicast(daddr)) {
2315 		struct in_device *in_dev;
2316 
2317 		rcu_read_lock();
2318 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2319 			int our = ip_check_mc(in_dev, daddr, saddr,
2320 				ip_hdr(skb)->protocol);
2321 			if (our
2322 #ifdef CONFIG_IP_MROUTE
2323 				||
2324 			    (!ipv4_is_local_multicast(daddr) &&
2325 			     IN_DEV_MFORWARD(in_dev))
2326 #endif
2327 			   ) {
2328 				rcu_read_unlock();
2329 				return ip_route_input_mc(skb, daddr, saddr,
2330 							 tos, dev, our);
2331 			}
2332 		}
2333 		rcu_read_unlock();
2334 		return -EINVAL;
2335 	}
2336 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2337 }
2338 
2339 static int __mkroute_output(struct rtable **result,
2340 			    struct fib_result *res,
2341 			    const struct flowi *fl,
2342 			    const struct flowi *oldflp,
2343 			    struct net_device *dev_out,
2344 			    unsigned flags)
2345 {
2346 	struct rtable *rth;
2347 	struct in_device *in_dev;
2348 	u32 tos = RT_FL_TOS(oldflp);
2349 	int err = 0;
2350 
2351 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2352 		return -EINVAL;
2353 
2354 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2355 		res->type = RTN_BROADCAST;
2356 	else if (ipv4_is_multicast(fl->fl4_dst))
2357 		res->type = RTN_MULTICAST;
2358 	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2359 		return -EINVAL;
2360 
2361 	if (dev_out->flags & IFF_LOOPBACK)
2362 		flags |= RTCF_LOCAL;
2363 
2364 	/* get work reference to inet device */
2365 	in_dev = in_dev_get(dev_out);
2366 	if (!in_dev)
2367 		return -EINVAL;
2368 
2369 	if (res->type == RTN_BROADCAST) {
2370 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2371 		if (res->fi) {
2372 			fib_info_put(res->fi);
2373 			res->fi = NULL;
2374 		}
2375 	} else if (res->type == RTN_MULTICAST) {
2376 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2377 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2378 				 oldflp->proto))
2379 			flags &= ~RTCF_LOCAL;
2380 		/* If multicast route do not exist use
2381 		   default one, but do not gateway in this case.
2382 		   Yes, it is hack.
2383 		 */
2384 		if (res->fi && res->prefixlen < 4) {
2385 			fib_info_put(res->fi);
2386 			res->fi = NULL;
2387 		}
2388 	}
2389 
2390 
2391 	rth = dst_alloc(&ipv4_dst_ops);
2392 	if (!rth) {
2393 		err = -ENOBUFS;
2394 		goto cleanup;
2395 	}
2396 
2397 	atomic_set(&rth->u.dst.__refcnt, 1);
2398 	rth->u.dst.flags= DST_HOST;
2399 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2400 		rth->u.dst.flags |= DST_NOXFRM;
2401 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2402 		rth->u.dst.flags |= DST_NOPOLICY;
2403 
2404 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2405 	rth->fl.fl4_tos	= tos;
2406 	rth->fl.fl4_src	= oldflp->fl4_src;
2407 	rth->fl.oif	= oldflp->oif;
2408 	rth->fl.mark    = oldflp->mark;
2409 	rth->rt_dst	= fl->fl4_dst;
2410 	rth->rt_src	= fl->fl4_src;
2411 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2412 	/* get references to the devices that are to be hold by the routing
2413 	   cache entry */
2414 	rth->u.dst.dev	= dev_out;
2415 	dev_hold(dev_out);
2416 	rth->idev	= in_dev_get(dev_out);
2417 	rth->rt_gateway = fl->fl4_dst;
2418 	rth->rt_spec_dst= fl->fl4_src;
2419 
2420 	rth->u.dst.output=ip_output;
2421 	rth->rt_genid = rt_genid(dev_net(dev_out));
2422 
2423 	RT_CACHE_STAT_INC(out_slow_tot);
2424 
2425 	if (flags & RTCF_LOCAL) {
2426 		rth->u.dst.input = ip_local_deliver;
2427 		rth->rt_spec_dst = fl->fl4_dst;
2428 	}
2429 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2430 		rth->rt_spec_dst = fl->fl4_src;
2431 		if (flags & RTCF_LOCAL &&
2432 		    !(dev_out->flags & IFF_LOOPBACK)) {
2433 			rth->u.dst.output = ip_mc_output;
2434 			RT_CACHE_STAT_INC(out_slow_mc);
2435 		}
2436 #ifdef CONFIG_IP_MROUTE
2437 		if (res->type == RTN_MULTICAST) {
2438 			if (IN_DEV_MFORWARD(in_dev) &&
2439 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2440 				rth->u.dst.input = ip_mr_input;
2441 				rth->u.dst.output = ip_mc_output;
2442 			}
2443 		}
2444 #endif
2445 	}
2446 
2447 	rt_set_nexthop(rth, res, 0);
2448 
2449 	rth->rt_flags = flags;
2450 
2451 	*result = rth;
2452  cleanup:
2453 	/* release work reference to inet device */
2454 	in_dev_put(in_dev);
2455 
2456 	return err;
2457 }
2458 
2459 static int ip_mkroute_output(struct rtable **rp,
2460 			     struct fib_result *res,
2461 			     const struct flowi *fl,
2462 			     const struct flowi *oldflp,
2463 			     struct net_device *dev_out,
2464 			     unsigned flags)
2465 {
2466 	struct rtable *rth = NULL;
2467 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2468 	unsigned hash;
2469 	if (err == 0) {
2470 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2471 			       rt_genid(dev_net(dev_out)));
2472 		err = rt_intern_hash(hash, rth, rp, NULL);
2473 	}
2474 
2475 	return err;
2476 }
2477 
2478 /*
2479  * Major route resolver routine.
2480  */
2481 
/*
 * ip_route_output_slow - resolve an output route by consulting the FIB.
 * @net:    network namespace to route in
 * @rp:     passed through to ip_mkroute_output(), which stores the route
 * @oldflp: flow key supplied by the caller; never modified.  A working
 *          copy, 'fl', is filled in with the resolved source address,
 *          destination and output interface.
 *
 * Called by __ip_route_output_key() when route caching is disabled or
 * the cache holds no matching entry.
 *
 * Reference handling: whenever dev_out is non-NULL on reaching
 * make_route it carries a device reference, which is dropped after
 * ip_mkroute_output() returns.
 *
 * Returns 0 on success or a negative errno (-EINVAL, -ENODEV,
 * -ENETUNREACH, or whatever ip_mkroute_output() returns).
 */
static int ip_route_output_slow(struct net *net, struct rtable **rp,
				const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
				      } },
			    .mark = oldflp->mark,
			    .iif = net->loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	/* set once fib_lookup() has succeeded, so res is released at the end */
	int free_res = 0;
	int err;


	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	/* Step 1: validate an explicitly requested source address. */
	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (ipv4_is_multicast(oldflp->fl4_src) ||
		    ipv4_is_lbcast(oldflp->fl4_src) ||
		    ipv4_is_zeronet(oldflp->fl4_src))
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0 &&
		    (ipv4_is_multicast(oldflp->fl4_dst) ||
		     oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = ip_dev_find(net, oldflp->fl4_src);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = ip_dev_find(net, oldflp->fl4_src);
			if (dev_out == NULL)
				goto out;
			/* only existence was checked; drop the reference again */
			dev_put(dev_out);
			dev_out = NULL;
		}
	}


	/* Step 2: an explicit output interface was requested. */
	if (oldflp->oif) {
		dev_out = dev_get_by_index(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (__in_dev_get_rtnl(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	/* Step 3: no destination given - route to ourselves via loopback. */
	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	/* Step 4: consult the FIB. */
	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	/* Destination is local: send via the loopback device. */
	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(net, &fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	/* switch to the device selected by the fib result */
	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;


make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);


	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}
2682 
2683 int __ip_route_output_key(struct net *net, struct rtable **rp,
2684 			  const struct flowi *flp)
2685 {
2686 	unsigned hash;
2687 	struct rtable *rth;
2688 
2689 	if (!rt_caching(net))
2690 		goto slow_output;
2691 
2692 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2693 
2694 	rcu_read_lock_bh();
2695 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2696 		rth = rcu_dereference(rth->u.dst.rt_next)) {
2697 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2698 		    rth->fl.fl4_src == flp->fl4_src &&
2699 		    rth->fl.iif == 0 &&
2700 		    rth->fl.oif == flp->oif &&
2701 		    rth->fl.mark == flp->mark &&
2702 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2703 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2704 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2705 		    !rt_is_expired(rth)) {
2706 			dst_use(&rth->u.dst, jiffies);
2707 			RT_CACHE_STAT_INC(out_hit);
2708 			rcu_read_unlock_bh();
2709 			*rp = rth;
2710 			return 0;
2711 		}
2712 		RT_CACHE_STAT_INC(out_hlist_search);
2713 	}
2714 	rcu_read_unlock_bh();
2715 
2716 slow_output:
2717 	return ip_route_output_slow(net, rp, flp);
2718 }
2719 
2720 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2721 
/*
 * PMTU update hook installed as .update_pmtu in ipv4_dst_blackhole_ops.
 * The body is empty: @dst and @mtu are ignored.
 */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2725 
/*
 * dst_ops used by ipv4_dst_blackhole() when allocating its replacement
 * entries.  It uses ipv4_dst_destroy and ipv4_dst_check as the destroy
 * and check hooks and installs the empty
 * ipv4_rt_blackhole_update_pmtu() as the PMTU hook.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_dst_check,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.entries		=	ATOMIC_INIT(0),
};
2734 
2735 
2736 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2737 {
2738 	struct rtable *ort = *rp;
2739 	struct rtable *rt = (struct rtable *)
2740 		dst_alloc(&ipv4_dst_blackhole_ops);
2741 
2742 	if (rt) {
2743 		struct dst_entry *new = &rt->u.dst;
2744 
2745 		atomic_set(&new->__refcnt, 1);
2746 		new->__use = 1;
2747 		new->input = dst_discard;
2748 		new->output = dst_discard;
2749 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2750 
2751 		new->dev = ort->u.dst.dev;
2752 		if (new->dev)
2753 			dev_hold(new->dev);
2754 
2755 		rt->fl = ort->fl;
2756 
2757 		rt->idev = ort->idev;
2758 		if (rt->idev)
2759 			in_dev_hold(rt->idev);
2760 		rt->rt_genid = rt_genid(net);
2761 		rt->rt_flags = ort->rt_flags;
2762 		rt->rt_type = ort->rt_type;
2763 		rt->rt_dst = ort->rt_dst;
2764 		rt->rt_src = ort->rt_src;
2765 		rt->rt_iif = ort->rt_iif;
2766 		rt->rt_gateway = ort->rt_gateway;
2767 		rt->rt_spec_dst = ort->rt_spec_dst;
2768 		rt->peer = ort->peer;
2769 		if (rt->peer)
2770 			atomic_inc(&rt->peer->refcnt);
2771 
2772 		dst_free(new);
2773 	}
2774 
2775 	dst_release(&(*rp)->u.dst);
2776 	*rp = rt;
2777 	return (rt ? 0 : -ENOMEM);
2778 }
2779 
2780 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2781 			 struct sock *sk, int flags)
2782 {
2783 	int err;
2784 
2785 	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2786 		return err;
2787 
2788 	if (flp->proto) {
2789 		if (!flp->fl4_src)
2790 			flp->fl4_src = (*rp)->rt_src;
2791 		if (!flp->fl4_dst)
2792 			flp->fl4_dst = (*rp)->rt_dst;
2793 		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2794 				    flags ? XFRM_LOOKUP_WAIT : 0);
2795 		if (err == -EREMOTE)
2796 			err = ipv4_dst_blackhole(net, rp, flp);
2797 
2798 		return err;
2799 	}
2800 
2801 	return 0;
2802 }
2803 
2804 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2805 
/*
 * Convenience wrapper: ip_route_output_flow() with no socket and
 * flags == 0.  Returns whatever ip_route_output_flow() returns.
 */
int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
2810 
/*
 * rt_fill_info - append a netlink route message describing the route
 * attached to @skb (skb_rtable(skb)).
 * @net:    namespace, used for the multicast-forwarding check
 * @skb:    message buffer being filled; also carries the route
 * @pid:    netlink pid for the message header
 * @seq:    netlink sequence number for the message header
 * @event:  message type passed to nlmsg_put()
 * @nowait: passed to ipmr_get_route(); when set, an ipmr error other
 *          than -EMSGSIZE is reported in the cacheinfo instead of
 *          failing the message
 * @flags:  netlink message flags
 *
 * Returns the value of nlmsg_end() on success, 0 when ipmr_get_route()
 * returned 0 and @nowait is clear, and -EMSGSIZE otherwise (the partial
 * message is cancelled when the failure happens after nlmsg_put()).
 *
 * NOTE(review): the NLA_PUT*() macros are defined outside this chunk;
 * they are expected to jump to nla_put_failure when the attribute does
 * not fit - confirm against their definition.
 */
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	/* fixed rtmsg header */
	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	/* keep only the upper bits of rt_flags and mark the route as cloned */
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	/* input routes (fl.iif set) report rt_spec_dst as preferred source;
	 * output routes report rt_src when it differs from the key's source */
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	/* values for the cacheinfo attribute emitted at the end */
	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					/* reported through the cacheinfo below */
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2905 
2906 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2907 {
2908 	struct net *net = sock_net(in_skb->sk);
2909 	struct rtmsg *rtm;
2910 	struct nlattr *tb[RTA_MAX+1];
2911 	struct rtable *rt = NULL;
2912 	__be32 dst = 0;
2913 	__be32 src = 0;
2914 	u32 iif;
2915 	int err;
2916 	struct sk_buff *skb;
2917 
2918 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2919 	if (err < 0)
2920 		goto errout;
2921 
2922 	rtm = nlmsg_data(nlh);
2923 
2924 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2925 	if (skb == NULL) {
2926 		err = -ENOBUFS;
2927 		goto errout;
2928 	}
2929 
2930 	/* Reserve room for dummy headers, this skb can pass
2931 	   through good chunk of routing engine.
2932 	 */
2933 	skb_reset_mac_header(skb);
2934 	skb_reset_network_header(skb);
2935 
2936 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2937 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2938 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2939 
2940 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2941 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2942 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2943 
2944 	if (iif) {
2945 		struct net_device *dev;
2946 
2947 		dev = __dev_get_by_index(net, iif);
2948 		if (dev == NULL) {
2949 			err = -ENODEV;
2950 			goto errout_free;
2951 		}
2952 
2953 		skb->protocol	= htons(ETH_P_IP);
2954 		skb->dev	= dev;
2955 		local_bh_disable();
2956 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2957 		local_bh_enable();
2958 
2959 		rt = skb_rtable(skb);
2960 		if (err == 0 && rt->u.dst.error)
2961 			err = -rt->u.dst.error;
2962 	} else {
2963 		struct flowi fl = {
2964 			.nl_u = {
2965 				.ip4_u = {
2966 					.daddr = dst,
2967 					.saddr = src,
2968 					.tos = rtm->rtm_tos,
2969 				},
2970 			},
2971 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2972 		};
2973 		err = ip_route_output_key(net, &rt, &fl);
2974 	}
2975 
2976 	if (err)
2977 		goto errout_free;
2978 
2979 	skb_dst_set(skb, &rt->u.dst);
2980 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2981 		rt->rt_flags |= RTCF_NOTIFY;
2982 
2983 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2984 			   RTM_NEWROUTE, 0, 0);
2985 	if (err <= 0)
2986 		goto errout_free;
2987 
2988 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2989 errout:
2990 	return err;
2991 
2992 errout_free:
2993 	kfree_skb(skb);
2994 	goto errout;
2995 }
2996 
2997 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2998 {
2999 	struct rtable *rt;
3000 	int h, s_h;
3001 	int idx, s_idx;
3002 	struct net *net;
3003 
3004 	net = sock_net(skb->sk);
3005 
3006 	s_h = cb->args[0];
3007 	if (s_h < 0)
3008 		s_h = 0;
3009 	s_idx = idx = cb->args[1];
3010 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3011 		if (!rt_hash_table[h].chain)
3012 			continue;
3013 		rcu_read_lock_bh();
3014 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
3015 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
3016 			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3017 				continue;
3018 			if (rt_is_expired(rt))
3019 				continue;
3020 			skb_dst_set(skb, dst_clone(&rt->u.dst));
3021 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3022 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3023 					 1, NLM_F_MULTI) <= 0) {
3024 				skb_dst_drop(skb);
3025 				rcu_read_unlock_bh();
3026 				goto done;
3027 			}
3028 			skb_dst_drop(skb);
3029 		}
3030 		rcu_read_unlock_bh();
3031 	}
3032 
3033 done:
3034 	cb->args[0] = h;
3035 	cb->args[1] = idx;
3036 	return skb->len;
3037 }
3038 
/*
 * Flush the route cache of the namespace that owns @in_dev's device,
 * passing a delay argument of 0 to rt_cache_flush().
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
3043 
3044 #ifdef CONFIG_SYSCTL
3045 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3046 					void __user *buffer,
3047 					size_t *lenp, loff_t *ppos)
3048 {
3049 	if (write) {
3050 		int flush_delay;
3051 		ctl_table ctl;
3052 		struct net *net;
3053 
3054 		memcpy(&ctl, __ctl, sizeof(ctl));
3055 		ctl.data = &flush_delay;
3056 		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3057 
3058 		net = (struct net *)__ctl->extra1;
3059 		rt_cache_flush(net, flush_delay);
3060 		return 0;
3061 	}
3062 
3063 	return -EINVAL;
3064 }
3065 
3066 static void rt_secret_reschedule(int old)
3067 {
3068 	struct net *net;
3069 	int new = ip_rt_secret_interval;
3070 	int diff = new - old;
3071 
3072 	if (!diff)
3073 		return;
3074 
3075 	rtnl_lock();
3076 	for_each_net(net) {
3077 		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3078 
3079 		if (!new)
3080 			continue;
3081 
3082 		if (deleted) {
3083 			long time = net->ipv4.rt_secret_timer.expires - jiffies;
3084 
3085 			if (time <= 0 || (time += diff) <= 0)
3086 				time = 0;
3087 
3088 			net->ipv4.rt_secret_timer.expires = time;
3089 		} else
3090 			net->ipv4.rt_secret_timer.expires = new;
3091 
3092 		net->ipv4.rt_secret_timer.expires += jiffies;
3093 		add_timer(&net->ipv4.rt_secret_timer);
3094 	}
3095 	rtnl_unlock();
3096 }
3097 
/*
 * Handler for the "secret_interval" sysctl: lets proc_dointvec_jiffies()
 * do the read or write, then calls rt_secret_reschedule() with the value
 * ip_rt_secret_interval had beforehand.  rt_secret_reschedule() returns
 * immediately when the value did not change, so reads and failed writes
 * leave the timers alone.
 *
 * Returns the result of proc_dointvec_jiffies().
 */
static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
					  void __user *buffer, size_t *lenp,
					  loff_t *ppos)
{
	/* snapshot before the handler may overwrite the variable */
	int old = ip_rt_secret_interval;
	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);

	rt_secret_reschedule(old);

	return ret;
}
3109 
/*
 * Sysctl entries for the route tunables; installed as the child table of
 * the "route" directory in ipv4_skeleton.  Every entry exposes one
 * int-sized variable with mode 0644.  The .proc_handler chosen for each
 * entry (proc_dointvec, proc_dointvec_jiffies, proc_dointvec_ms_jiffies)
 * determines how the user-visible number is converted to the stored
 * value.  The table ends with an empty entry.
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/*  Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* same variable as "gc_min_interval", different handler */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* custom handler: also re-arms the per-namespace secret timers */
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= ipv4_sysctl_rt_secret_interval,
	},
	{ }
};
3227 
/*
 * One-element table with static storage and no initializer, i.e. a
 * single all-zero entry.  Used below as the child of the "neigh"
 * directory.
 */
static struct ctl_table empty[1];

/*
 * Directory skeleton with two read-only (0555) directories: "route",
 * populated from ipv4_route_table, and "neigh", whose child is the empty
 * table above.  Where this skeleton is registered is not visible in this
 * part of the file.
 */
static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};
3238 
3239 static __net_initdata struct ctl_path ipv4_path[] = {
3240 	{ .procname = "net", },
3241 	{ .procname = "ipv4", },
3242 	{ },
3243 };
3244 
3245 static struct ctl_table ipv4_route_flush_table[] = {
3246 	{
3247 		.procname	= "flush",
3248 		.maxlen		= sizeof(int),
3249 		.mode		= 0200,
3250 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3251 	},
3252 	{ },
3253 };
3254 
3255 static __net_initdata struct ctl_path ipv4_route_path[] = {
3256 	{ .procname = "net", },
3257 	{ .procname = "ipv4", },
3258 	{ .procname = "route", },
3259 	{ },
3260 };
3261 
3262 static __net_init int sysctl_route_net_init(struct net *net)
3263 {
3264 	struct ctl_table *tbl;
3265 
3266 	tbl = ipv4_route_flush_table;
3267 	if (!net_eq(net, &init_net)) {
3268 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3269 		if (tbl == NULL)
3270 			goto err_dup;
3271 	}
3272 	tbl[0].extra1 = net;
3273 
3274 	net->ipv4.route_hdr =
3275 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3276 	if (net->ipv4.route_hdr == NULL)
3277 		goto err_reg;
3278 	return 0;
3279 
3280 err_reg:
3281 	if (tbl != ipv4_route_flush_table)
3282 		kfree(tbl);
3283 err_dup:
3284 	return -ENOMEM;
3285 }
3286 
3287 static __net_exit void sysctl_route_net_exit(struct net *net)
3288 {
3289 	struct ctl_table *tbl;
3290 
3291 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3292 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3293 	BUG_ON(tbl == ipv4_route_flush_table);
3294 	kfree(tbl);
3295 }
3296 
3297 static __net_initdata struct pernet_operations sysctl_route_ops = {
3298 	.init = sysctl_route_net_init,
3299 	.exit = sysctl_route_net_exit,
3300 };
3301 #endif
3302 
3303 
3304 static __net_init int rt_secret_timer_init(struct net *net)
3305 {
3306 	atomic_set(&net->ipv4.rt_genid,
3307 			(int) ((num_physpages ^ (num_physpages>>8)) ^
3308 			(jiffies ^ (jiffies >> 7))));
3309 
3310 	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3311 	net->ipv4.rt_secret_timer.data = (unsigned long)net;
3312 	init_timer_deferrable(&net->ipv4.rt_secret_timer);
3313 
3314 	if (ip_rt_secret_interval) {
3315 		net->ipv4.rt_secret_timer.expires =
3316 			jiffies + net_random() % ip_rt_secret_interval +
3317 			ip_rt_secret_interval;
3318 		add_timer(&net->ipv4.rt_secret_timer);
3319 	}
3320 	return 0;
3321 }
3322 
3323 static __net_exit void rt_secret_timer_exit(struct net *net)
3324 {
3325 	del_timer_sync(&net->ipv4.rt_secret_timer);
3326 }
3327 
3328 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3329 	.init = rt_secret_timer_init,
3330 	.exit = rt_secret_timer_exit,
3331 };
3332 
3333 
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Per-CPU accounting area; ip_rt_init() allocates it with room for
 * 256 struct ip_rt_acct entries.
 */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3337 
3338 static __initdata unsigned long rhash_entries;
3339 static int __init set_rhash_entries(char *str)
3340 {
3341 	if (!str)
3342 		return 0;
3343 	rhash_entries = simple_strtoul(str, &str, 0);
3344 	return 1;
3345 }
3346 __setup("rhash_entries=", set_rhash_entries);
3347 
3348 int __init ip_rt_init(void)
3349 {
3350 	int rc = 0;
3351 
3352 #ifdef CONFIG_NET_CLS_ROUTE
3353 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3354 	if (!ip_rt_acct)
3355 		panic("IP: failed to allocate ip_rt_acct\n");
3356 #endif
3357 
3358 	ipv4_dst_ops.kmem_cachep =
3359 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3360 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3361 
3362 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3363 
3364 	rt_hash_table = (struct rt_hash_bucket *)
3365 		alloc_large_system_hash("IP route cache",
3366 					sizeof(struct rt_hash_bucket),
3367 					rhash_entries,
3368 					(totalram_pages >= 128 * 1024) ?
3369 					15 : 17,
3370 					0,
3371 					&rt_hash_log,
3372 					&rt_hash_mask,
3373 					rhash_entries ? 0 : 512 * 1024);
3374 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3375 	rt_hash_lock_init();
3376 
3377 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3378 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3379 
3380 	devinet_init();
3381 	ip_fib_init();
3382 
3383 	/* All the timers, started at system startup tend
3384 	   to synchronize. Perturb it a bit.
3385 	 */
3386 	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3387 	expires_ljiffies = jiffies;
3388 	schedule_delayed_work(&expires_work,
3389 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3390 
3391 	if (register_pernet_subsys(&rt_secret_timer_ops))
3392 		printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3393 
3394 	if (ip_rt_proc_init())
3395 		printk(KERN_ERR "Unable to create route proc files\n");
3396 #ifdef CONFIG_XFRM
3397 	xfrm_init();
3398 	xfrm4_init(ip_rt_max_size);
3399 #endif
3400 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3401 
3402 #ifdef CONFIG_SYSCTL
3403 	register_pernet_subsys(&sysctl_route_ops);
3404 #endif
3405 	return rc;
3406 }
3407 
3408 #ifdef CONFIG_SYSCTL
3409 /*
3410  * We really need to sanitize the damn ipv4 init order, then all
3411  * this nonsense will go away.
3412  */
3413 void __init ip_static_sysctl_init(void)
3414 {
3415 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3416 }
3417 #endif
3418 
/* Symbols from this file that are exported via EXPORT_SYMBOL(). */
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);
3422