xref: /linux/net/ipv4/route.c (revision cb299ba8b5ef2239429484072fea394cd7581bd7)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD
35  *					though our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 
112 #define RT_FL_TOS(oldflp) \
113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 
115 #define IP_MAX_MTU	0xFFF0
116 
117 #define RT_GC_TIMEOUT (300*HZ)
118 
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
123 static int ip_rt_redirect_number __read_mostly	= 9;
124 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly	= HZ;
127 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly	= 8;
129 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly	= 256;
132 static int rt_chain_length_max __read_mostly	= 20;
133 
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136 
137 /*
138  *	Interface to generic destination cache.
139  */
140 
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void		 ipv4_dst_destroy(struct dst_entry *dst);
143 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
144 					 struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void		 ipv4_link_failure(struct sk_buff *skb);
147 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149 
150 
151 static struct dst_ops ipv4_dst_ops = {
152 	.family =		AF_INET,
153 	.protocol =		cpu_to_be16(ETH_P_IP),
154 	.gc =			rt_garbage_collect,
155 	.check =		ipv4_dst_check,
156 	.destroy =		ipv4_dst_destroy,
157 	.ifdown =		ipv4_dst_ifdown,
158 	.negative_advice =	ipv4_negative_advice,
159 	.link_failure =		ipv4_link_failure,
160 	.update_pmtu =		ip_rt_update_pmtu,
161 	.local_out =		__ip_local_out,
162 };
163 
164 #define ECN_OR_COST(class)	TC_PRIO_##class
165 
166 const __u8 ip_tos2prio[16] = {
167 	TC_PRIO_BESTEFFORT,
168 	ECN_OR_COST(FILLER),
169 	TC_PRIO_BESTEFFORT,
170 	ECN_OR_COST(BESTEFFORT),
171 	TC_PRIO_BULK,
172 	ECN_OR_COST(BULK),
173 	TC_PRIO_BULK,
174 	ECN_OR_COST(BULK),
175 	TC_PRIO_INTERACTIVE,
176 	ECN_OR_COST(INTERACTIVE),
177 	TC_PRIO_INTERACTIVE,
178 	ECN_OR_COST(INTERACTIVE),
179 	TC_PRIO_INTERACTIVE_BULK,
180 	ECN_OR_COST(INTERACTIVE_BULK),
181 	TC_PRIO_INTERACTIVE_BULK,
182 	ECN_OR_COST(INTERACTIVE_BULK)
183 };
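/*
 * ip_tos2prio[] is indexed by the four RFC 1349 TOS bits of the IPv4 header
 * (the TOS byte masked and shifted right by one; see rt_tos2priority() in
 * include/net/route.h).  The odd entries, written with ECN_OR_COST(), cover
 * TOS values whose low "minimize monetary cost" bit is set -- the bit that
 * was later reused for ECN, hence the macro name.
 */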
184 
185 
186 /*
187  * Route cache.
188  */
189 
190 /* The locking scheme is rather straightforward:
191  *
192  * 1) Read-Copy Update protects the buckets of the central route hash.
193  * 2) Only writers remove entries, and they hold the lock
194  *    as they look at rtable reference counts.
195  * 3) Only readers acquire references to rtable entries;
196  *    they do so with atomic increments and with the
197  *    lock held.
198  */
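/*
 * In practice (see rt_cache_get_first() and rt_intern_hash() below) this
 * means readers walk a chain under rcu_read_lock_bh() using
 * rcu_dereference_bh(), while writers take
 * spin_lock_bh(rt_hash_lock_addr(hash)) and publish new entries with
 * rcu_assign_pointer().
 */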
199 
200 struct rt_hash_bucket {
201 	struct rtable	*chain;
202 };
203 
204 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
205 	defined(CONFIG_PROVE_LOCKING)
206 /*
207  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
208  * The size of this table is a power of two and depends on the number of CPUs.
209  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
210  */
211 #ifdef CONFIG_LOCKDEP
212 # define RT_HASH_LOCK_SZ	256
213 #else
214 # if NR_CPUS >= 32
215 #  define RT_HASH_LOCK_SZ	4096
216 # elif NR_CPUS >= 16
217 #  define RT_HASH_LOCK_SZ	2048
218 # elif NR_CPUS >= 8
219 #  define RT_HASH_LOCK_SZ	1024
220 # elif NR_CPUS >= 4
221 #  define RT_HASH_LOCK_SZ	512
222 # else
223 #  define RT_HASH_LOCK_SZ	256
224 # endif
225 #endif
226 
227 static spinlock_t	*rt_hash_locks;
228 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
229 
230 static __init void rt_hash_lock_init(void)
231 {
232 	int i;
233 
234 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
235 			GFP_KERNEL);
236 	if (!rt_hash_locks)
237 		panic("IP: failed to allocate rt_hash_locks\n");
238 
239 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
240 		spin_lock_init(&rt_hash_locks[i]);
241 }
242 #else
243 # define rt_hash_lock_addr(slot) NULL
244 
245 static inline void rt_hash_lock_init(void)
246 {
247 }
248 #endif
249 
250 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
251 static unsigned			rt_hash_mask __read_mostly;
252 static unsigned int		rt_hash_log  __read_mostly;
253 
254 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
255 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
256 
257 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
258 				   int genid)
259 {
260 	return jhash_3words((__force u32)daddr, (__force u32)saddr,
261 			    idx, genid)
262 		& rt_hash_mask;
263 }
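/*
 * Callers hash on (daddr, saddr, ifindex, rt_genid(net)).  Mixing the
 * per-namespace generation id into the hash means a cache flush only has to
 * bump the generation (rt_cache_invalidate()); entries from older
 * generations then no longer match and are reaped lazily wherever
 * rt_is_expired() finds them.
 */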
264 
265 static inline int rt_genid(struct net *net)
266 {
267 	return atomic_read(&net->ipv4.rt_genid);
268 }
269 
270 #ifdef CONFIG_PROC_FS
271 struct rt_cache_iter_state {
272 	struct seq_net_private p;
273 	int bucket;
274 	int genid;
275 };
276 
277 static struct rtable *rt_cache_get_first(struct seq_file *seq)
278 {
279 	struct rt_cache_iter_state *st = seq->private;
280 	struct rtable *r = NULL;
281 
282 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
283 		if (!rt_hash_table[st->bucket].chain)
284 			continue;
285 		rcu_read_lock_bh();
286 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
287 		while (r) {
288 			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
289 			    r->rt_genid == st->genid)
290 				return r;
291 			r = rcu_dereference_bh(r->dst.rt_next);
292 		}
293 		rcu_read_unlock_bh();
294 	}
295 	return r;
296 }
297 
298 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
299 					  struct rtable *r)
300 {
301 	struct rt_cache_iter_state *st = seq->private;
302 
303 	r = r->dst.rt_next;
304 	while (!r) {
305 		rcu_read_unlock_bh();
306 		do {
307 			if (--st->bucket < 0)
308 				return NULL;
309 		} while (!rt_hash_table[st->bucket].chain);
310 		rcu_read_lock_bh();
311 		r = rt_hash_table[st->bucket].chain;
312 	}
313 	return rcu_dereference_bh(r);
314 }
315 
316 static struct rtable *rt_cache_get_next(struct seq_file *seq,
317 					struct rtable *r)
318 {
319 	struct rt_cache_iter_state *st = seq->private;
320 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
321 		if (dev_net(r->dst.dev) != seq_file_net(seq))
322 			continue;
323 		if (r->rt_genid == st->genid)
324 			break;
325 	}
326 	return r;
327 }
328 
329 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
330 {
331 	struct rtable *r = rt_cache_get_first(seq);
332 
333 	if (r)
334 		while (pos && (r = rt_cache_get_next(seq, r)))
335 			--pos;
336 	return pos ? NULL : r;
337 }
338 
339 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
340 {
341 	struct rt_cache_iter_state *st = seq->private;
342 	if (*pos)
343 		return rt_cache_get_idx(seq, *pos - 1);
344 	st->genid = rt_genid(seq_file_net(seq));
345 	return SEQ_START_TOKEN;
346 }
347 
348 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
349 {
350 	struct rtable *r;
351 
352 	if (v == SEQ_START_TOKEN)
353 		r = rt_cache_get_first(seq);
354 	else
355 		r = rt_cache_get_next(seq, v);
356 	++*pos;
357 	return r;
358 }
359 
360 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
361 {
362 	if (v && v != SEQ_START_TOKEN)
363 		rcu_read_unlock_bh();
364 }
365 
366 static int rt_cache_seq_show(struct seq_file *seq, void *v)
367 {
368 	if (v == SEQ_START_TOKEN)
369 		seq_printf(seq, "%-127s\n",
370 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
371 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
372 			   "HHUptod\tSpecDst");
373 	else {
374 		struct rtable *r = v;
375 		int len;
376 
377 		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
378 			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
379 			r->dst.dev ? r->dst.dev->name : "*",
380 			(__force u32)r->rt_dst,
381 			(__force u32)r->rt_gateway,
382 			r->rt_flags, atomic_read(&r->dst.__refcnt),
383 			r->dst.__use, 0, (__force u32)r->rt_src,
384 			(dst_metric(&r->dst, RTAX_ADVMSS) ?
385 			     (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
386 			dst_metric(&r->dst, RTAX_WINDOW),
387 			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
388 			      dst_metric(&r->dst, RTAX_RTTVAR)),
389 			r->fl.fl4_tos,
390 			r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
391 			r->dst.hh ? (r->dst.hh->hh_output ==
392 				       dev_queue_xmit) : 0,
393 			r->rt_spec_dst, &len);
394 
395 		seq_printf(seq, "%*s\n", 127 - len, "");
396 	}
397 	return 0;
398 }
399 
400 static const struct seq_operations rt_cache_seq_ops = {
401 	.start  = rt_cache_seq_start,
402 	.next   = rt_cache_seq_next,
403 	.stop   = rt_cache_seq_stop,
404 	.show   = rt_cache_seq_show,
405 };
406 
407 static int rt_cache_seq_open(struct inode *inode, struct file *file)
408 {
409 	return seq_open_net(inode, file, &rt_cache_seq_ops,
410 			sizeof(struct rt_cache_iter_state));
411 }
412 
413 static const struct file_operations rt_cache_seq_fops = {
414 	.owner	 = THIS_MODULE,
415 	.open	 = rt_cache_seq_open,
416 	.read	 = seq_read,
417 	.llseek	 = seq_lseek,
418 	.release = seq_release_net,
419 };
420 
421 
422 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
423 {
424 	int cpu;
425 
426 	if (*pos == 0)
427 		return SEQ_START_TOKEN;
428 
429 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
430 		if (!cpu_possible(cpu))
431 			continue;
432 		*pos = cpu+1;
433 		return &per_cpu(rt_cache_stat, cpu);
434 	}
435 	return NULL;
436 }
437 
438 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
439 {
440 	int cpu;
441 
442 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
443 		if (!cpu_possible(cpu))
444 			continue;
445 		*pos = cpu+1;
446 		return &per_cpu(rt_cache_stat, cpu);
447 	}
448 	return NULL;
449 
450 }
451 
452 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
453 {
454 
455 }
456 
457 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
458 {
459 	struct rt_cache_stat *st = v;
460 
461 	if (v == SEQ_START_TOKEN) {
462 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
463 		return 0;
464 	}
465 
466 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
467 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
468 		   dst_entries_get_slow(&ipv4_dst_ops),
469 		   st->in_hit,
470 		   st->in_slow_tot,
471 		   st->in_slow_mc,
472 		   st->in_no_route,
473 		   st->in_brd,
474 		   st->in_martian_dst,
475 		   st->in_martian_src,
476 
477 		   st->out_hit,
478 		   st->out_slow_tot,
479 		   st->out_slow_mc,
480 
481 		   st->gc_total,
482 		   st->gc_ignored,
483 		   st->gc_goal_miss,
484 		   st->gc_dst_overflow,
485 		   st->in_hlist_search,
486 		   st->out_hlist_search
487 		);
488 	return 0;
489 }
490 
491 static const struct seq_operations rt_cpu_seq_ops = {
492 	.start  = rt_cpu_seq_start,
493 	.next   = rt_cpu_seq_next,
494 	.stop   = rt_cpu_seq_stop,
495 	.show   = rt_cpu_seq_show,
496 };
497 
498 
499 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
500 {
501 	return seq_open(file, &rt_cpu_seq_ops);
502 }
503 
504 static const struct file_operations rt_cpu_seq_fops = {
505 	.owner	 = THIS_MODULE,
506 	.open	 = rt_cpu_seq_open,
507 	.read	 = seq_read,
508 	.llseek	 = seq_lseek,
509 	.release = seq_release,
510 };
511 
512 #ifdef CONFIG_NET_CLS_ROUTE
513 static int rt_acct_proc_show(struct seq_file *m, void *v)
514 {
515 	struct ip_rt_acct *dst, *src;
516 	unsigned int i, j;
517 
518 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
519 	if (!dst)
520 		return -ENOMEM;
521 
522 	for_each_possible_cpu(i) {
523 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
524 		for (j = 0; j < 256; j++) {
525 			dst[j].o_bytes   += src[j].o_bytes;
526 			dst[j].o_packets += src[j].o_packets;
527 			dst[j].i_bytes   += src[j].i_bytes;
528 			dst[j].i_packets += src[j].i_packets;
529 		}
530 	}
531 
532 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
533 	kfree(dst);
534 	return 0;
535 }
536 
537 static int rt_acct_proc_open(struct inode *inode, struct file *file)
538 {
539 	return single_open(file, rt_acct_proc_show, NULL);
540 }
541 
542 static const struct file_operations rt_acct_proc_fops = {
543 	.owner		= THIS_MODULE,
544 	.open		= rt_acct_proc_open,
545 	.read		= seq_read,
546 	.llseek		= seq_lseek,
547 	.release	= single_release,
548 };
549 #endif
550 
551 static int __net_init ip_rt_do_proc_init(struct net *net)
552 {
553 	struct proc_dir_entry *pde;
554 
555 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
556 			&rt_cache_seq_fops);
557 	if (!pde)
558 		goto err1;
559 
560 	pde = proc_create("rt_cache", S_IRUGO,
561 			  net->proc_net_stat, &rt_cpu_seq_fops);
562 	if (!pde)
563 		goto err2;
564 
565 #ifdef CONFIG_NET_CLS_ROUTE
566 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
567 	if (!pde)
568 		goto err3;
569 #endif
570 	return 0;
571 
572 #ifdef CONFIG_NET_CLS_ROUTE
573 err3:
574 	remove_proc_entry("rt_cache", net->proc_net_stat);
575 #endif
576 err2:
577 	remove_proc_entry("rt_cache", net->proc_net);
578 err1:
579 	return -ENOMEM;
580 }
581 
582 static void __net_exit ip_rt_do_proc_exit(struct net *net)
583 {
584 	remove_proc_entry("rt_cache", net->proc_net_stat);
585 	remove_proc_entry("rt_cache", net->proc_net);
586 #ifdef CONFIG_NET_CLS_ROUTE
587 	remove_proc_entry("rt_acct", net->proc_net);
588 #endif
589 }
590 
591 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
592 	.init = ip_rt_do_proc_init,
593 	.exit = ip_rt_do_proc_exit,
594 };
595 
596 static int __init ip_rt_proc_init(void)
597 {
598 	return register_pernet_subsys(&ip_rt_proc_ops);
599 }
600 
601 #else
602 static inline int ip_rt_proc_init(void)
603 {
604 	return 0;
605 }
606 #endif /* CONFIG_PROC_FS */
607 
608 static inline void rt_free(struct rtable *rt)
609 {
610 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
611 }
612 
613 static inline void rt_drop(struct rtable *rt)
614 {
615 	ip_rt_put(rt);
616 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
617 }
618 
619 static inline int rt_fast_clean(struct rtable *rth)
620 {
621 	/* Kill broadcast/multicast entries very aggressively if they
622 	   collide in the hash table with more useful entries */
623 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
624 		rth->fl.iif && rth->dst.rt_next;
625 }
626 
627 static inline int rt_valuable(struct rtable *rth)
628 {
629 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
630 		rth->dst.expires;
631 }
632 
633 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
634 {
635 	unsigned long age;
636 	int ret = 0;
637 
638 	if (atomic_read(&rth->dst.__refcnt))
639 		goto out;
640 
641 	ret = 1;
642 	if (rth->dst.expires &&
643 	    time_after_eq(jiffies, rth->dst.expires))
644 		goto out;
645 
646 	age = jiffies - rth->dst.lastuse;
647 	ret = 0;
648 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
649 	    (age <= tmo2 && rt_valuable(rth)))
650 		goto out;
651 	ret = 1;
652 out:	return ret;
653 }
654 
655 /* Bits of score are:
656  * 31: very valuable
657  * 30: not quite useless
658  * 29..0: usage counter
659  */
660 static inline u32 rt_score(struct rtable *rt)
661 {
662 	u32 score = jiffies - rt->dst.lastuse;
663 
664 	score = ~score & ~(3<<30);
665 
666 	if (rt_valuable(rt))
667 		score |= (1<<31);
668 
669 	if (!rt->fl.iif ||
670 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
671 		score |= (1<<30);
672 
673 	return score;
674 }
675 
676 static inline bool rt_caching(const struct net *net)
677 {
678 	return net->ipv4.current_rt_cache_rebuild_count <=
679 		net->ipv4.sysctl_rt_cache_rebuild_count;
680 }
681 
682 static inline bool compare_hash_inputs(const struct flowi *fl1,
683 					const struct flowi *fl2)
684 {
685 	return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
686 		((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
687 		(fl1->iif ^ fl2->iif)) == 0);
688 }
689 
690 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
691 {
692 	return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
693 		((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
694 		(fl1->mark ^ fl2->mark) |
695 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
696 		(fl1->oif ^ fl2->oif) |
697 		(fl1->iif ^ fl2->iif)) == 0;
698 }
699 
700 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
701 {
702 	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
703 }
704 
705 static inline int rt_is_expired(struct rtable *rth)
706 {
707 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
708 }
709 
710 /*
711  * Perform a full scan of the hash table and free all entries.
712  * Can be called by a softirq or a process.
713  * In the latter case, we want to reschedule if necessary.
714  */
715 static void rt_do_flush(int process_context)
716 {
717 	unsigned int i;
718 	struct rtable *rth, *next;
719 	struct rtable * tail;
720 
721 	for (i = 0; i <= rt_hash_mask; i++) {
722 		if (process_context && need_resched())
723 			cond_resched();
724 		rth = rt_hash_table[i].chain;
725 		if (!rth)
726 			continue;
727 
728 		spin_lock_bh(rt_hash_lock_addr(i));
729 #ifdef CONFIG_NET_NS
730 		{
731 		struct rtable ** prev, * p;
732 
733 		rth = rt_hash_table[i].chain;
734 
735 		/* defer releasing the head of the list after spin_unlock */
736 		for (tail = rth; tail; tail = tail->dst.rt_next)
737 			if (!rt_is_expired(tail))
738 				break;
739 		if (rth != tail)
740 			rt_hash_table[i].chain = tail;
741 
742 		/* call rt_free on entries after the tail requiring flush */
743 		prev = &rt_hash_table[i].chain;
744 		for (p = *prev; p; p = next) {
745 			next = p->dst.rt_next;
746 			if (!rt_is_expired(p)) {
747 				prev = &p->dst.rt_next;
748 			} else {
749 				*prev = next;
750 				rt_free(p);
751 			}
752 		}
753 		}
754 #else
755 		rth = rt_hash_table[i].chain;
756 		rt_hash_table[i].chain = NULL;
757 		tail = NULL;
758 #endif
759 		spin_unlock_bh(rt_hash_lock_addr(i));
760 
761 		for (; rth != tail; rth = next) {
762 			next = rth->dst.rt_next;
763 			rt_free(rth);
764 		}
765 	}
766 }
767 
768 /*
769  * While freeing expired entries, we compute average chain length
770  * and standard deviation, using fixed-point arithmetic.
771  * This is to have an estimation of rt_chain_length_max:
772  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
773  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
774  */
775 
776 #define FRACT_BITS 3
777 #define ONE (1UL << FRACT_BITS)
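/*
 * Example of the fixed-point convention: with FRACT_BITS == 3, ONE == 8
 * stands for a chain-length contribution of 1.0.  rt_check_expire()
 * accumulates sums in these units and converts back with
 * (avg + 4*sd) >> FRACT_BITS when updating rt_chain_length_max.
 */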
778 
779 /*
780  * Given a hash chain and an item in this hash chain,
781  * find if a previous entry has the same hash_inputs
782  * (but differs on tos, mark or oif)
783  * Returns 0 if an alias is found.
784  * Returns ONE if rth has no alias before itself.
785  */
786 static int has_noalias(const struct rtable *head, const struct rtable *rth)
787 {
788 	const struct rtable *aux = head;
789 
790 	while (aux != rth) {
791 		if (compare_hash_inputs(&aux->fl, &rth->fl))
792 			return 0;
793 		aux = aux->dst.rt_next;
794 	}
795 	return ONE;
796 }
797 
798 static void rt_check_expire(void)
799 {
800 	static unsigned int rover;
801 	unsigned int i = rover, goal;
802 	struct rtable *rth, **rthp;
803 	unsigned long samples = 0;
804 	unsigned long sum = 0, sum2 = 0;
805 	unsigned long delta;
806 	u64 mult;
807 
808 	delta = jiffies - expires_ljiffies;
809 	expires_ljiffies = jiffies;
810 	mult = ((u64)delta) << rt_hash_log;
811 	if (ip_rt_gc_timeout > 1)
812 		do_div(mult, ip_rt_gc_timeout);
813 	goal = (unsigned int)mult;
814 	if (goal > rt_hash_mask)
815 		goal = rt_hash_mask + 1;
816 	for (; goal > 0; goal--) {
817 		unsigned long tmo = ip_rt_gc_timeout;
818 		unsigned long length;
819 
820 		i = (i + 1) & rt_hash_mask;
821 		rthp = &rt_hash_table[i].chain;
822 
823 		if (need_resched())
824 			cond_resched();
825 
826 		samples++;
827 
828 		if (*rthp == NULL)
829 			continue;
830 		length = 0;
831 		spin_lock_bh(rt_hash_lock_addr(i));
832 		while ((rth = *rthp) != NULL) {
833 			prefetch(rth->dst.rt_next);
834 			if (rt_is_expired(rth)) {
835 				*rthp = rth->dst.rt_next;
836 				rt_free(rth);
837 				continue;
838 			}
839 			if (rth->dst.expires) {
840 				/* Entry is expired even if it is in use */
841 				if (time_before_eq(jiffies, rth->dst.expires)) {
842 nofree:
843 					tmo >>= 1;
844 					rthp = &rth->dst.rt_next;
845 					/*
846 					 * We only count entries on
847 					 * a chain with equal hash inputs once
848 					 * so that entries for different QOS
849 					 * levels, and other non-hash input
850 					 * attributes don't unfairly skew
851 					 * the length computation
852 					 */
853 					length += has_noalias(rt_hash_table[i].chain, rth);
854 					continue;
855 				}
856 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
857 				goto nofree;
858 
859 			/* Cleanup aged off entries. */
860 			*rthp = rth->dst.rt_next;
861 			rt_free(rth);
862 		}
863 		spin_unlock_bh(rt_hash_lock_addr(i));
864 		sum += length;
865 		sum2 += length*length;
866 	}
867 	if (samples) {
868 		unsigned long avg = sum / samples;
869 		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
870 		rt_chain_length_max = max_t(unsigned long,
871 					ip_rt_gc_elasticity,
872 					(avg + 4*sd) >> FRACT_BITS);
873 	}
874 	rover = i;
875 }
876 
877 /*
878  * rt_worker_func() is run in process context.
879  * We call rt_check_expire() to scan part of the hash table.
880  */
881 static void rt_worker_func(struct work_struct *work)
882 {
883 	rt_check_expire();
884 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
885 }
886 
887 /*
888  * Perturbation of rt_genid by a small quantity [1..256].
889  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
890  * many times (2^24) without repeating a recent rt_genid.
891  * Jenkins hash is strong enough that little changes of rt_genid are OK.
892  */
893 static void rt_cache_invalidate(struct net *net)
894 {
895 	unsigned char shuffle;
896 
897 	get_random_bytes(&shuffle, sizeof(shuffle));
898 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
899 }
900 
901 /*
902  * delay < 0  : invalidate cache (fast : entries will be deleted later)
903  * delay >= 0 : invalidate & flush cache (can be long)
904  */
905 void rt_cache_flush(struct net *net, int delay)
906 {
907 	rt_cache_invalidate(net);
908 	if (delay >= 0)
909 		rt_do_flush(!in_softirq());
910 }
911 
912 /* Flush previous cache invalidated entries from the cache */
913 void rt_cache_flush_batch(void)
914 {
915 	rt_do_flush(!in_softirq());
916 }
917 
918 static void rt_emergency_hash_rebuild(struct net *net)
919 {
920 	if (net_ratelimit())
921 		printk(KERN_WARNING "Route hash chain too long!\n");
922 	rt_cache_invalidate(net);
923 }
924 
925 /*
926    Short description of GC goals.
927 
928    We want to build an algorithm which will keep the routing cache
929    at some equilibrium point, where the number of aged-off entries
930    is kept approximately equal to the number of newly generated ones.
931 
932    Current expiration strength is the variable "expire".
933    We try to adjust it dynamically, so that when networking
934    is idle expire is large enough to keep enough warm entries,
935    and when load increases it shrinks to limit cache size.
936  */
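/*
 * With the default ip_rt_gc_elasticity of 8, the first goal computed below
 * is entries - (8 << rt_hash_log), i.e. garbage collection only works hard
 * once the cache averages more than about eight entries per hash bucket.
 */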
937 
938 static int rt_garbage_collect(struct dst_ops *ops)
939 {
940 	static unsigned long expire = RT_GC_TIMEOUT;
941 	static unsigned long last_gc;
942 	static int rover;
943 	static int equilibrium;
944 	struct rtable *rth, **rthp;
945 	unsigned long now = jiffies;
946 	int goal;
947 	int entries = dst_entries_get_fast(&ipv4_dst_ops);
948 
949 	/*
950 	 * Garbage collection is pretty expensive,
951 	 * do not make it too frequently.
952 	 */
953 
954 	RT_CACHE_STAT_INC(gc_total);
955 
956 	if (now - last_gc < ip_rt_gc_min_interval &&
957 	    entries < ip_rt_max_size) {
958 		RT_CACHE_STAT_INC(gc_ignored);
959 		goto out;
960 	}
961 
962 	entries = dst_entries_get_slow(&ipv4_dst_ops);
963 	/* Calculate the number of entries which we want to expire now. */
964 	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
965 	if (goal <= 0) {
966 		if (equilibrium < ipv4_dst_ops.gc_thresh)
967 			equilibrium = ipv4_dst_ops.gc_thresh;
968 		goal = entries - equilibrium;
969 		if (goal > 0) {
970 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
971 			goal = entries - equilibrium;
972 		}
973 	} else {
974 		/* We are in a dangerous area. Try to reduce the cache really
975 		 * aggressively.
976 		 */
977 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
978 		equilibrium = entries - goal;
979 	}
980 
981 	if (now - last_gc >= ip_rt_gc_min_interval)
982 		last_gc = now;
983 
984 	if (goal <= 0) {
985 		equilibrium += goal;
986 		goto work_done;
987 	}
988 
989 	do {
990 		int i, k;
991 
992 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
993 			unsigned long tmo = expire;
994 
995 			k = (k + 1) & rt_hash_mask;
996 			rthp = &rt_hash_table[k].chain;
997 			spin_lock_bh(rt_hash_lock_addr(k));
998 			while ((rth = *rthp) != NULL) {
999 				if (!rt_is_expired(rth) &&
1000 					!rt_may_expire(rth, tmo, expire)) {
1001 					tmo >>= 1;
1002 					rthp = &rth->dst.rt_next;
1003 					continue;
1004 				}
1005 				*rthp = rth->dst.rt_next;
1006 				rt_free(rth);
1007 				goal--;
1008 			}
1009 			spin_unlock_bh(rt_hash_lock_addr(k));
1010 			if (goal <= 0)
1011 				break;
1012 		}
1013 		rover = k;
1014 
1015 		if (goal <= 0)
1016 			goto work_done;
1017 
1018 		/* Goal is not achieved. We stop the process if:
1019 
1020 		   - expire has been reduced to zero; otherwise expire is halved.
1021 		   - the table is not full.
1022 		   - we are called from interrupt.
1023 		   - the jiffies check is just a fallback/debug loop breaker.
1024 		     We will not spin here for a long time in any case.
1025 		 */
1026 
1027 		RT_CACHE_STAT_INC(gc_goal_miss);
1028 
1029 		if (expire == 0)
1030 			break;
1031 
1032 		expire >>= 1;
1033 #if RT_CACHE_DEBUG >= 2
1034 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1035 				dst_entries_get_fast(&ipv4_dst_ops), goal, i);
1036 #endif
1037 
1038 		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1039 			goto out;
1040 	} while (!in_softirq() && time_before_eq(jiffies, now));
1041 
1042 	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1043 		goto out;
1044 	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1045 		goto out;
1046 	if (net_ratelimit())
1047 		printk(KERN_WARNING "dst cache overflow\n");
1048 	RT_CACHE_STAT_INC(gc_dst_overflow);
1049 	return 1;
1050 
1051 work_done:
1052 	expire += ip_rt_gc_min_interval;
1053 	if (expire > ip_rt_gc_timeout ||
1054 	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1055 	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1056 		expire = ip_rt_gc_timeout;
1057 #if RT_CACHE_DEBUG >= 2
1058 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1059 			dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
1060 #endif
1061 out:	return 0;
1062 }
1063 
1064 /*
1065  * Returns the number of entries in a hash chain that have different hash_inputs.
1066  */
1067 static int slow_chain_length(const struct rtable *head)
1068 {
1069 	int length = 0;
1070 	const struct rtable *rth = head;
1071 
1072 	while (rth) {
1073 		length += has_noalias(head, rth);
1074 		rth = rth->dst.rt_next;
1075 	}
1076 	return length >> FRACT_BITS;
1077 }
1078 
1079 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1080 			  struct rtable **rp, struct sk_buff *skb, int ifindex)
1081 {
1082 	struct rtable	*rth, **rthp;
1083 	unsigned long	now;
1084 	struct rtable *cand, **candp;
1085 	u32 		min_score;
1086 	int		chain_length;
1087 	int attempts = !in_softirq();
1088 
1089 restart:
1090 	chain_length = 0;
1091 	min_score = ~(u32)0;
1092 	cand = NULL;
1093 	candp = NULL;
1094 	now = jiffies;
1095 
1096 	if (!rt_caching(dev_net(rt->dst.dev))) {
1097 		/*
1098 		 * If we're not caching, just tell the caller we
1099 		 * were successful and don't touch the route.  The
1100 		 * caller holds the sole reference to the cache entry, and
1101 		 * it will be released when the caller is done with it.
1102 		 * If we drop it here, the callers have no way to resolve routes
1103 		 * when we're not caching.  Instead, just point *rp at rt, so
1104 		 * the caller gets a single use out of the route.
1105 		 * Note that we do rt_free on this new route entry, so that
1106 		 * once its refcount hits zero, we are still able to reap it
1107 		 * (Thanks Alexey)
1108 		 * Note: To avoid expensive rcu stuff for this uncached dst,
1109 		 * we set DST_NOCACHE so that dst_release() can free dst without
1110 		 * waiting for a grace period.
1111 		 */
1112 
1113 		rt->dst.flags |= DST_NOCACHE;
1114 		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1115 			int err = arp_bind_neighbour(&rt->dst);
1116 			if (err) {
1117 				if (net_ratelimit())
1118 					printk(KERN_WARNING
1119 					    "Neighbour table failure & not caching routes.\n");
1120 				ip_rt_put(rt);
1121 				return err;
1122 			}
1123 		}
1124 
1125 		goto skip_hashing;
1126 	}
1127 
1128 	rthp = &rt_hash_table[hash].chain;
1129 
1130 	spin_lock_bh(rt_hash_lock_addr(hash));
1131 	while ((rth = *rthp) != NULL) {
1132 		if (rt_is_expired(rth)) {
1133 			*rthp = rth->dst.rt_next;
1134 			rt_free(rth);
1135 			continue;
1136 		}
1137 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1138 			/* Put it first */
1139 			*rthp = rth->dst.rt_next;
1140 			/*
1141 			 * Since lookup is lockfree, the deletion
1142 			 * must be visible to another weakly ordered CPU before
1143 			 * the insertion at the start of the hash chain.
1144 			 */
1145 			rcu_assign_pointer(rth->dst.rt_next,
1146 					   rt_hash_table[hash].chain);
1147 			/*
1148 			 * Since lookup is lockfree, the update writes
1149 			 * must be ordered for consistency on SMP.
1150 			 */
1151 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1152 
1153 			dst_use(&rth->dst, now);
1154 			spin_unlock_bh(rt_hash_lock_addr(hash));
1155 
1156 			rt_drop(rt);
1157 			if (rp)
1158 				*rp = rth;
1159 			else
1160 				skb_dst_set(skb, &rth->dst);
1161 			return 0;
1162 		}
1163 
1164 		if (!atomic_read(&rth->dst.__refcnt)) {
1165 			u32 score = rt_score(rth);
1166 
1167 			if (score <= min_score) {
1168 				cand = rth;
1169 				candp = rthp;
1170 				min_score = score;
1171 			}
1172 		}
1173 
1174 		chain_length++;
1175 
1176 		rthp = &rth->dst.rt_next;
1177 	}
1178 
1179 	if (cand) {
1180 		/* ip_rt_gc_elasticity used to be the average chain
1181 		 * length; when it is exceeded, gc becomes really aggressive.
1182 		 *
1183 		 * The second limit is less certain. At the moment it allows
1184 		 * only 2 entries per bucket. We will see.
1185 		 */
1186 		if (chain_length > ip_rt_gc_elasticity) {
1187 			*candp = cand->dst.rt_next;
1188 			rt_free(cand);
1189 		}
1190 	} else {
1191 		if (chain_length > rt_chain_length_max &&
1192 		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1193 			struct net *net = dev_net(rt->dst.dev);
1194 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1195 			if (!rt_caching(net)) {
1196 				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1197 					rt->dst.dev->name, num);
1198 			}
1199 			rt_emergency_hash_rebuild(net);
1200 			spin_unlock_bh(rt_hash_lock_addr(hash));
1201 
1202 			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1203 					ifindex, rt_genid(net));
1204 			goto restart;
1205 		}
1206 	}
1207 
1208 	/* Try to bind the route to arp only if it is an output
1209 	   route or on the unicast forwarding path.
1210 	 */
1211 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1212 		int err = arp_bind_neighbour(&rt->dst);
1213 		if (err) {
1214 			spin_unlock_bh(rt_hash_lock_addr(hash));
1215 
1216 			if (err != -ENOBUFS) {
1217 				rt_drop(rt);
1218 				return err;
1219 			}
1220 
1221 			/* Neighbour tables are full and nothing
1222 			   can be released. Try to shrink the route cache;
1223 			   most likely it holds some neighbour records.
1224 			 */
1225 			if (attempts-- > 0) {
1226 				int saved_elasticity = ip_rt_gc_elasticity;
1227 				int saved_int = ip_rt_gc_min_interval;
1228 				ip_rt_gc_elasticity	= 1;
1229 				ip_rt_gc_min_interval	= 0;
1230 				rt_garbage_collect(&ipv4_dst_ops);
1231 				ip_rt_gc_min_interval	= saved_int;
1232 				ip_rt_gc_elasticity	= saved_elasticity;
1233 				goto restart;
1234 			}
1235 
1236 			if (net_ratelimit())
1237 				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1238 			rt_drop(rt);
1239 			return -ENOBUFS;
1240 		}
1241 	}
1242 
1243 	rt->dst.rt_next = rt_hash_table[hash].chain;
1244 
1245 #if RT_CACHE_DEBUG >= 2
1246 	if (rt->dst.rt_next) {
1247 		struct rtable *trt;
1248 		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1249 		       hash, &rt->rt_dst);
1250 		for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1251 			printk(" . %pI4", &trt->rt_dst);
1252 		printk("\n");
1253 	}
1254 #endif
1255 	/*
1256 	 * Since lookup is lockfree, we must make sure
1257 	 * previous writes to rt are committed to memory
1258 	 * before making rt visible to other CPUs.
1259 	 */
1260 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1261 
1262 	spin_unlock_bh(rt_hash_lock_addr(hash));
1263 
1264 skip_hashing:
1265 	if (rp)
1266 		*rp = rt;
1267 	else
1268 		skb_dst_set(skb, &rt->dst);
1269 	return 0;
1270 }
1271 
1272 void rt_bind_peer(struct rtable *rt, int create)
1273 {
1274 	struct inet_peer *peer;
1275 
1276 	peer = inet_getpeer(rt->rt_dst, create);
1277 
1278 	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1279 		inet_putpeer(peer);
1280 }
1281 
1282 /*
1283  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1284  * we can still generate some output.
1285  * Random ID selection looks a bit dangerous because we have no chance of
1286  * selecting an ID that is unique over a reasonable period of time.
1287  * But a broken packet identifier may be better than no packet at all.
1288  */
1289 static void ip_select_fb_ident(struct iphdr *iph)
1290 {
1291 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1292 	static u32 ip_fallback_id;
1293 	u32 salt;
1294 
1295 	spin_lock_bh(&ip_fb_id_lock);
1296 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1297 	iph->id = htons(salt & 0xFFFF);
1298 	ip_fallback_id = salt;
1299 	spin_unlock_bh(&ip_fb_id_lock);
1300 }
1301 
1302 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1303 {
1304 	struct rtable *rt = (struct rtable *) dst;
1305 
1306 	if (rt) {
1307 		if (rt->peer == NULL)
1308 			rt_bind_peer(rt, 1);
1309 
1310 		/* If peer is attached to destination, it is never detached,
1311 		   so we do not need to grab a lock to dereference it.
1312 		 */
1313 		if (rt->peer) {
1314 			iph->id = htons(inet_getid(rt->peer, more));
1315 			return;
1316 		}
1317 	} else
1318 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1319 		       __builtin_return_address(0));
1320 
1321 	ip_select_fb_ident(iph);
1322 }
1323 EXPORT_SYMBOL(__ip_select_ident);
1324 
1325 static void rt_del(unsigned hash, struct rtable *rt)
1326 {
1327 	struct rtable **rthp, *aux;
1328 
1329 	rthp = &rt_hash_table[hash].chain;
1330 	spin_lock_bh(rt_hash_lock_addr(hash));
1331 	ip_rt_put(rt);
1332 	while ((aux = *rthp) != NULL) {
1333 		if (aux == rt || rt_is_expired(aux)) {
1334 			*rthp = aux->dst.rt_next;
1335 			rt_free(aux);
1336 			continue;
1337 		}
1338 		rthp = &aux->dst.rt_next;
1339 	}
1340 	spin_unlock_bh(rt_hash_lock_addr(hash));
1341 }
1342 
1343 /* called in rcu_read_lock() section */
1344 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1345 		    __be32 saddr, struct net_device *dev)
1346 {
1347 	int i, k;
1348 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1349 	struct rtable *rth, **rthp;
1350 	__be32  skeys[2] = { saddr, 0 };
1351 	int  ikeys[2] = { dev->ifindex, 0 };
1352 	struct netevent_redirect netevent;
1353 	struct net *net;
1354 
1355 	if (!in_dev)
1356 		return;
1357 
1358 	net = dev_net(dev);
1359 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1360 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1361 	    ipv4_is_zeronet(new_gw))
1362 		goto reject_redirect;
1363 
1364 	if (!rt_caching(net))
1365 		goto reject_redirect;
1366 
1367 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1368 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1369 			goto reject_redirect;
1370 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1371 			goto reject_redirect;
1372 	} else {
1373 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1374 			goto reject_redirect;
1375 	}
1376 
1377 	for (i = 0; i < 2; i++) {
1378 		for (k = 0; k < 2; k++) {
1379 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1380 						rt_genid(net));
1381 
1382 			rthp=&rt_hash_table[hash].chain;
1383 
1384 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1385 				struct rtable *rt;
1386 
1387 				if (rth->fl.fl4_dst != daddr ||
1388 				    rth->fl.fl4_src != skeys[i] ||
1389 				    rth->fl.oif != ikeys[k] ||
1390 				    rth->fl.iif != 0 ||
1391 				    rt_is_expired(rth) ||
1392 				    !net_eq(dev_net(rth->dst.dev), net)) {
1393 					rthp = &rth->dst.rt_next;
1394 					continue;
1395 				}
1396 
1397 				if (rth->rt_dst != daddr ||
1398 				    rth->rt_src != saddr ||
1399 				    rth->dst.error ||
1400 				    rth->rt_gateway != old_gw ||
1401 				    rth->dst.dev != dev)
1402 					break;
1403 
1404 				dst_hold(&rth->dst);
1405 
1406 				rt = dst_alloc(&ipv4_dst_ops);
1407 				if (rt == NULL) {
1408 					ip_rt_put(rth);
1409 					return;
1410 				}
1411 
1412 				/* Copy all the information. */
1413 				*rt = *rth;
1414 				rt->dst.__use		= 1;
1415 				atomic_set(&rt->dst.__refcnt, 1);
1416 				rt->dst.child		= NULL;
1417 				if (rt->dst.dev)
1418 					dev_hold(rt->dst.dev);
1419 				if (rt->idev)
1420 					in_dev_hold(rt->idev);
1421 				rt->dst.obsolete	= -1;
1422 				rt->dst.lastuse	= jiffies;
1423 				rt->dst.path		= &rt->dst;
1424 				rt->dst.neighbour	= NULL;
1425 				rt->dst.hh		= NULL;
1426 #ifdef CONFIG_XFRM
1427 				rt->dst.xfrm		= NULL;
1428 #endif
1429 				rt->rt_genid		= rt_genid(net);
1430 				rt->rt_flags		|= RTCF_REDIRECTED;
1431 
1432 				/* Gateway is different ... */
1433 				rt->rt_gateway		= new_gw;
1434 
1435 				/* Redirect received -> path was valid */
1436 				dst_confirm(&rth->dst);
1437 
1438 				if (rt->peer)
1439 					atomic_inc(&rt->peer->refcnt);
1440 
1441 				if (arp_bind_neighbour(&rt->dst) ||
1442 				    !(rt->dst.neighbour->nud_state &
1443 					    NUD_VALID)) {
1444 					if (rt->dst.neighbour)
1445 						neigh_event_send(rt->dst.neighbour, NULL);
1446 					ip_rt_put(rth);
1447 					rt_drop(rt);
1448 					goto do_next;
1449 				}
1450 
1451 				netevent.old = &rth->dst;
1452 				netevent.new = &rt->dst;
1453 				call_netevent_notifiers(NETEVENT_REDIRECT,
1454 							&netevent);
1455 
1456 				rt_del(hash, rth);
1457 				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1458 					ip_rt_put(rt);
1459 				goto do_next;
1460 			}
1461 		do_next:
1462 			;
1463 		}
1464 	}
1465 	return;
1466 
1467 reject_redirect:
1468 #ifdef CONFIG_IP_ROUTE_VERBOSE
1469 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1470 		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1471 			"  Advised path = %pI4 -> %pI4\n",
1472 		       &old_gw, dev->name, &new_gw,
1473 		       &saddr, &daddr);
1474 #endif
1475 	;
1476 }
1477 
1478 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1479 {
1480 	struct rtable *rt = (struct rtable *)dst;
1481 	struct dst_entry *ret = dst;
1482 
1483 	if (rt) {
1484 		if (dst->obsolete > 0) {
1485 			ip_rt_put(rt);
1486 			ret = NULL;
1487 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1488 			   (rt->dst.expires &&
1489 			    time_after_eq(jiffies, rt->dst.expires))) {
1490 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1491 						rt->fl.oif,
1492 						rt_genid(dev_net(dst->dev)));
1493 #if RT_CACHE_DEBUG >= 1
1494 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1495 				&rt->rt_dst, rt->fl.fl4_tos);
1496 #endif
1497 			rt_del(hash, rt);
1498 			ret = NULL;
1499 		}
1500 	}
1501 	return ret;
1502 }
1503 
1504 /*
1505  * Algorithm:
1506  *	1. The first ip_rt_redirect_number redirects are sent
1507  *	   with exponential backoff, then we stop sending them at all,
1508  *	   assuming that the host ignores our redirects.
1509  *	2. If we did not see packets requiring redirects
1510  *	   during ip_rt_redirect_silence, we assume that the host
1511  *	   forgot redirected route and start to send redirects again.
1512  *	   forgot the redirected route and start to send redirects again.
1513  * This algorithm is much cheaper and more intelligent than dumb load limiting
1514  * in icmp.c.
1515  *
1516  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1517  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1518  */
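/*
 * With the defaults above (ip_rt_redirect_load = HZ/50,
 * ip_rt_redirect_number = 9, ip_rt_redirect_silence = (HZ/50) << 10),
 * successive redirects for one destination are spaced roughly
 * 40ms, 80ms, 160ms, ... apart (ip_rt_redirect_load << rate_tokens),
 * stop after the 9th, and resume only after roughly 20 seconds with no
 * packets that would have triggered a redirect.
 */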
1519 
1520 void ip_rt_send_redirect(struct sk_buff *skb)
1521 {
1522 	struct rtable *rt = skb_rtable(skb);
1523 	struct in_device *in_dev;
1524 	int log_martians;
1525 
1526 	rcu_read_lock();
1527 	in_dev = __in_dev_get_rcu(rt->dst.dev);
1528 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1529 		rcu_read_unlock();
1530 		return;
1531 	}
1532 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1533 	rcu_read_unlock();
1534 
1535 	/* No redirected packets during ip_rt_redirect_silence;
1536 	 * reset the algorithm.
1537 	 */
1538 	if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
1539 		rt->dst.rate_tokens = 0;
1540 
1541 	/* Too many ignored redirects; do not send anything.
1542 	 * Set dst.rate_last to the last seen redirected packet.
1543 	 */
1544 	if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
1545 		rt->dst.rate_last = jiffies;
1546 		return;
1547 	}
1548 
1549 	/* Check for load limit; set rate_last to the latest sent
1550 	 * redirect.
1551 	 */
1552 	if (rt->dst.rate_tokens == 0 ||
1553 	    time_after(jiffies,
1554 		       (rt->dst.rate_last +
1555 			(ip_rt_redirect_load << rt->dst.rate_tokens)))) {
1556 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1557 		rt->dst.rate_last = jiffies;
1558 		++rt->dst.rate_tokens;
1559 #ifdef CONFIG_IP_ROUTE_VERBOSE
1560 		if (log_martians &&
1561 		    rt->dst.rate_tokens == ip_rt_redirect_number &&
1562 		    net_ratelimit())
1563 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1564 				&rt->rt_src, rt->rt_iif,
1565 				&rt->rt_dst, &rt->rt_gateway);
1566 #endif
1567 	}
1568 }
1569 
1570 static int ip_error(struct sk_buff *skb)
1571 {
1572 	struct rtable *rt = skb_rtable(skb);
1573 	unsigned long now;
1574 	int code;
1575 
1576 	switch (rt->dst.error) {
1577 		case EINVAL:
1578 		default:
1579 			goto out;
1580 		case EHOSTUNREACH:
1581 			code = ICMP_HOST_UNREACH;
1582 			break;
1583 		case ENETUNREACH:
1584 			code = ICMP_NET_UNREACH;
1585 			IP_INC_STATS_BH(dev_net(rt->dst.dev),
1586 					IPSTATS_MIB_INNOROUTES);
1587 			break;
1588 		case EACCES:
1589 			code = ICMP_PKT_FILTERED;
1590 			break;
1591 	}
1592 
1593 	now = jiffies;
1594 	rt->dst.rate_tokens += now - rt->dst.rate_last;
1595 	if (rt->dst.rate_tokens > ip_rt_error_burst)
1596 		rt->dst.rate_tokens = ip_rt_error_burst;
1597 	rt->dst.rate_last = now;
1598 	if (rt->dst.rate_tokens >= ip_rt_error_cost) {
1599 		rt->dst.rate_tokens -= ip_rt_error_cost;
1600 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1601 	}
1602 
1603 out:	kfree_skb(skb);
1604 	return 0;
1605 }
1606 
1607 /*
1608  *	The last two values are not from the RFC but
1609  *	are needed for AMPRnet AX.25 paths.
1610  */
1611 
1612 static const unsigned short mtu_plateau[] =
1613 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1614 
1615 static inline unsigned short guess_mtu(unsigned short old_mtu)
1616 {
1617 	int i;
1618 
1619 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1620 		if (old_mtu > mtu_plateau[i])
1621 			return mtu_plateau[i];
1622 	return 68;
1623 }
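/*
 * guess_mtu() falls back to the next lower plateau when the peer did not
 * report a usable MTU: e.g. an old_mtu of 1500 yields 1492, and anything at
 * or below the smallest plateau (128) yields the IPv4 minimum of 68.
 */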
1624 
1625 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1626 				 unsigned short new_mtu,
1627 				 struct net_device *dev)
1628 {
1629 	int i, k;
1630 	unsigned short old_mtu = ntohs(iph->tot_len);
1631 	struct rtable *rth;
1632 	int  ikeys[2] = { dev->ifindex, 0 };
1633 	__be32  skeys[2] = { iph->saddr, 0, };
1634 	__be32  daddr = iph->daddr;
1635 	unsigned short est_mtu = 0;
1636 
1637 	for (k = 0; k < 2; k++) {
1638 		for (i = 0; i < 2; i++) {
1639 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1640 						rt_genid(net));
1641 
1642 			rcu_read_lock();
1643 			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1644 			     rth = rcu_dereference(rth->dst.rt_next)) {
1645 				unsigned short mtu = new_mtu;
1646 
1647 				if (rth->fl.fl4_dst != daddr ||
1648 				    rth->fl.fl4_src != skeys[i] ||
1649 				    rth->rt_dst != daddr ||
1650 				    rth->rt_src != iph->saddr ||
1651 				    rth->fl.oif != ikeys[k] ||
1652 				    rth->fl.iif != 0 ||
1653 				    dst_metric_locked(&rth->dst, RTAX_MTU) ||
1654 				    !net_eq(dev_net(rth->dst.dev), net) ||
1655 				    rt_is_expired(rth))
1656 					continue;
1657 
1658 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1659 
1660 					/* BSD 4.2 compatibility hack :-( */
1661 					if (mtu == 0 &&
1662 					    old_mtu >= dst_mtu(&rth->dst) &&
1663 					    old_mtu >= 68 + (iph->ihl << 2))
1664 						old_mtu -= iph->ihl << 2;
1665 
1666 					mtu = guess_mtu(old_mtu);
1667 				}
1668 				if (mtu <= dst_mtu(&rth->dst)) {
1669 					if (mtu < dst_mtu(&rth->dst)) {
1670 						dst_confirm(&rth->dst);
1671 						if (mtu < ip_rt_min_pmtu) {
1672 							mtu = ip_rt_min_pmtu;
1673 							rth->dst.metrics[RTAX_LOCK-1] |=
1674 								(1 << RTAX_MTU);
1675 						}
1676 						rth->dst.metrics[RTAX_MTU-1] = mtu;
1677 						dst_set_expires(&rth->dst,
1678 							ip_rt_mtu_expires);
1679 					}
1680 					est_mtu = mtu;
1681 				}
1682 			}
1683 			rcu_read_unlock();
1684 		}
1685 	}
1686 	return est_mtu ? : new_mtu;
1687 }
1688 
1689 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1690 {
1691 	if (dst_mtu(dst) > mtu && mtu >= 68 &&
1692 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1693 		if (mtu < ip_rt_min_pmtu) {
1694 			mtu = ip_rt_min_pmtu;
1695 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1696 		}
1697 		dst->metrics[RTAX_MTU-1] = mtu;
1698 		dst_set_expires(dst, ip_rt_mtu_expires);
1699 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1700 	}
1701 }
1702 
1703 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1704 {
1705 	if (rt_is_expired((struct rtable *)dst))
1706 		return NULL;
1707 	return dst;
1708 }
1709 
1710 static void ipv4_dst_destroy(struct dst_entry *dst)
1711 {
1712 	struct rtable *rt = (struct rtable *) dst;
1713 	struct inet_peer *peer = rt->peer;
1714 	struct in_device *idev = rt->idev;
1715 
1716 	if (peer) {
1717 		rt->peer = NULL;
1718 		inet_putpeer(peer);
1719 	}
1720 
1721 	if (idev) {
1722 		rt->idev = NULL;
1723 		in_dev_put(idev);
1724 	}
1725 }
1726 
1727 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1728 			    int how)
1729 {
1730 	struct rtable *rt = (struct rtable *) dst;
1731 	struct in_device *idev = rt->idev;
1732 	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1733 		struct in_device *loopback_idev =
1734 			in_dev_get(dev_net(dev)->loopback_dev);
1735 		if (loopback_idev) {
1736 			rt->idev = loopback_idev;
1737 			in_dev_put(idev);
1738 		}
1739 	}
1740 }
1741 
1742 static void ipv4_link_failure(struct sk_buff *skb)
1743 {
1744 	struct rtable *rt;
1745 
1746 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1747 
1748 	rt = skb_rtable(skb);
1749 	if (rt)
1750 		dst_set_expires(&rt->dst, 0);
1751 }
1752 
1753 static int ip_rt_bug(struct sk_buff *skb)
1754 {
1755 	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1756 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1757 		skb->dev ? skb->dev->name : "?");
1758 	kfree_skb(skb);
1759 	return 0;
1760 }
1761 
1762 /*
1763    We do not cache the source address of the outgoing interface,
1764    because it is used only by the IP RR, TS and SRR options,
1765    so it is out of the fast path.
1766 
1767    BTW remember: "addr" is allowed to be unaligned
1768    in IP options!
1769  */
1770 
1771 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1772 {
1773 	__be32 src;
1774 	struct fib_result res;
1775 
1776 	if (rt->fl.iif == 0)
1777 		src = rt->rt_src;
1778 	else {
1779 		rcu_read_lock();
1780 		if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1781 			src = FIB_RES_PREFSRC(res);
1782 		else
1783 			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1784 					RT_SCOPE_UNIVERSE);
1785 		rcu_read_unlock();
1786 	}
1787 	memcpy(addr, &src, 4);
1788 }
1789 
1790 #ifdef CONFIG_NET_CLS_ROUTE
1791 static void set_class_tag(struct rtable *rt, u32 tag)
1792 {
1793 	if (!(rt->dst.tclassid & 0xFFFF))
1794 		rt->dst.tclassid |= tag & 0xFFFF;
1795 	if (!(rt->dst.tclassid & 0xFFFF0000))
1796 		rt->dst.tclassid |= tag & 0xFFFF0000;
1797 }
1798 #endif
1799 
1800 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1801 {
1802 	struct fib_info *fi = res->fi;
1803 
1804 	if (fi) {
1805 		if (FIB_RES_GW(*res) &&
1806 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1807 			rt->rt_gateway = FIB_RES_GW(*res);
1808 		memcpy(rt->dst.metrics, fi->fib_metrics,
1809 		       sizeof(rt->dst.metrics));
1810 		if (fi->fib_mtu == 0) {
1811 			rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
1812 			if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
1813 			    rt->rt_gateway != rt->rt_dst &&
1814 			    rt->dst.dev->mtu > 576)
1815 				rt->dst.metrics[RTAX_MTU-1] = 576;
1816 		}
1817 #ifdef CONFIG_NET_CLS_ROUTE
1818 		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1819 #endif
1820 	} else
1821 		rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu;
1822 
1823 	if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
1824 		rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1825 	if (dst_mtu(&rt->dst) > IP_MAX_MTU)
1826 		rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1827 	if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
1828 		rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
1829 				       ip_rt_min_advmss);
1830 	if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
1831 		rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1832 
1833 #ifdef CONFIG_NET_CLS_ROUTE
1834 #ifdef CONFIG_IP_MULTIPLE_TABLES
1835 	set_class_tag(rt, fib_rules_tclass(res));
1836 #endif
1837 	set_class_tag(rt, itag);
1838 #endif
1839 	rt->rt_type = res->type;
1840 }
1841 
1842 /* called in rcu_read_lock() section */
1843 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1844 				u8 tos, struct net_device *dev, int our)
1845 {
1846 	unsigned int hash;
1847 	struct rtable *rth;
1848 	__be32 spec_dst;
1849 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1850 	u32 itag = 0;
1851 	int err;
1852 
1853 	/* Primary sanity checks. */
1854 
1855 	if (in_dev == NULL)
1856 		return -EINVAL;
1857 
1858 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1859 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1860 		goto e_inval;
1861 
1862 	if (ipv4_is_zeronet(saddr)) {
1863 		if (!ipv4_is_local_multicast(daddr))
1864 			goto e_inval;
1865 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1866 	} else {
1867 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1868 					  &itag, 0);
1869 		if (err < 0)
1870 			goto e_err;
1871 	}
1872 	rth = dst_alloc(&ipv4_dst_ops);
1873 	if (!rth)
1874 		goto e_nobufs;
1875 
1876 	rth->dst.output = ip_rt_bug;
1877 	rth->dst.obsolete = -1;
1878 
1879 	atomic_set(&rth->dst.__refcnt, 1);
1880 	rth->dst.flags= DST_HOST;
1881 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1882 		rth->dst.flags |= DST_NOPOLICY;
1883 	rth->fl.fl4_dst	= daddr;
1884 	rth->rt_dst	= daddr;
1885 	rth->fl.fl4_tos	= tos;
1886 	rth->fl.mark    = skb->mark;
1887 	rth->fl.fl4_src	= saddr;
1888 	rth->rt_src	= saddr;
1889 #ifdef CONFIG_NET_CLS_ROUTE
1890 	rth->dst.tclassid = itag;
1891 #endif
1892 	rth->rt_iif	=
1893 	rth->fl.iif	= dev->ifindex;
1894 	rth->dst.dev	= init_net.loopback_dev;
1895 	dev_hold(rth->dst.dev);
1896 	rth->idev	= in_dev_get(rth->dst.dev);
1897 	rth->fl.oif	= 0;
1898 	rth->rt_gateway	= daddr;
1899 	rth->rt_spec_dst= spec_dst;
1900 	rth->rt_genid	= rt_genid(dev_net(dev));
1901 	rth->rt_flags	= RTCF_MULTICAST;
1902 	rth->rt_type	= RTN_MULTICAST;
1903 	if (our) {
1904 		rth->dst.input= ip_local_deliver;
1905 		rth->rt_flags |= RTCF_LOCAL;
1906 	}
1907 
1908 #ifdef CONFIG_IP_MROUTE
1909 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1910 		rth->dst.input = ip_mr_input;
1911 #endif
1912 	RT_CACHE_STAT_INC(in_slow_mc);
1913 
1914 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1915 	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1916 
1917 e_nobufs:
1918 	return -ENOBUFS;
1919 e_inval:
1920 	return -EINVAL;
1921 e_err:
1922 	return err;
1923 }
1924 
1925 
1926 static void ip_handle_martian_source(struct net_device *dev,
1927 				     struct in_device *in_dev,
1928 				     struct sk_buff *skb,
1929 				     __be32 daddr,
1930 				     __be32 saddr)
1931 {
1932 	RT_CACHE_STAT_INC(in_martian_src);
1933 #ifdef CONFIG_IP_ROUTE_VERBOSE
1934 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1935 		/*
1936 		 *	RFC1812 recommendation, if source is martian,
1937 		 *	the only hint is MAC header.
1938 		 */
1939 		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1940 			&daddr, &saddr, dev->name);
1941 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1942 			int i;
1943 			const unsigned char *p = skb_mac_header(skb);
1944 			printk(KERN_WARNING "ll header: ");
1945 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1946 				printk("%02x", *p);
1947 				if (i < (dev->hard_header_len - 1))
1948 					printk(":");
1949 			}
1950 			printk("\n");
1951 		}
1952 	}
1953 #endif
1954 }
1955 
1956 /* called in rcu_read_lock() section */
1957 static int __mkroute_input(struct sk_buff *skb,
1958 			   struct fib_result *res,
1959 			   struct in_device *in_dev,
1960 			   __be32 daddr, __be32 saddr, u32 tos,
1961 			   struct rtable **result)
1962 {
1963 	struct rtable *rth;
1964 	int err;
1965 	struct in_device *out_dev;
1966 	unsigned int flags = 0;
1967 	__be32 spec_dst;
1968 	u32 itag;
1969 
1970 	/* get a working reference to the output device */
1971 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1972 	if (out_dev == NULL) {
1973 		if (net_ratelimit())
1974 			printk(KERN_CRIT "Bug in ip_route_input" \
1975 			       "_slow(). Please, report\n");
1976 		return -EINVAL;
1977 	}
1978 
1979 
1980 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1981 				  in_dev->dev, &spec_dst, &itag, skb->mark);
1982 	if (err < 0) {
1983 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1984 					 saddr);
1985 
1986 		goto cleanup;
1987 	}
1988 
1989 	if (err)
1990 		flags |= RTCF_DIRECTSRC;
1991 
1992 	if (out_dev == in_dev && err &&
1993 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1994 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1995 		flags |= RTCF_DOREDIRECT;
1996 
1997 	if (skb->protocol != htons(ETH_P_IP)) {
1998 		/* Not IP (i.e. ARP). Do not create a route if it is
1999 		 * invalid for proxy ARP. DNAT routes are always valid.
2000 		 *
2001 		 * The proxy ARP feature has been extended to allow ARP
2002 		 * replies back to the same interface, to support
2003 		 * private VLAN switch technologies. See arp.c.
2004 		 */
2005 		if (out_dev == in_dev &&
2006 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2007 			err = -EINVAL;
2008 			goto cleanup;
2009 		}
2010 	}
2011 
2012 
2013 	rth = dst_alloc(&ipv4_dst_ops);
2014 	if (!rth) {
2015 		err = -ENOBUFS;
2016 		goto cleanup;
2017 	}
2018 
2019 	atomic_set(&rth->dst.__refcnt, 1);
2020 	rth->dst.flags= DST_HOST;
2021 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2022 		rth->dst.flags |= DST_NOPOLICY;
2023 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2024 		rth->dst.flags |= DST_NOXFRM;
2025 	rth->fl.fl4_dst	= daddr;
2026 	rth->rt_dst	= daddr;
2027 	rth->fl.fl4_tos	= tos;
2028 	rth->fl.mark    = skb->mark;
2029 	rth->fl.fl4_src	= saddr;
2030 	rth->rt_src	= saddr;
2031 	rth->rt_gateway	= daddr;
2032 	rth->rt_iif 	=
2033 		rth->fl.iif	= in_dev->dev->ifindex;
2034 	rth->dst.dev	= (out_dev)->dev;
2035 	dev_hold(rth->dst.dev);
2036 	rth->idev	= in_dev_get(rth->dst.dev);
2037 	rth->fl.oif 	= 0;
2038 	rth->rt_spec_dst= spec_dst;
2039 
2040 	rth->dst.obsolete = -1;
2041 	rth->dst.input = ip_forward;
2042 	rth->dst.output = ip_output;
2043 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2044 
2045 	rt_set_nexthop(rth, res, itag);
2046 
2047 	rth->rt_flags = flags;
2048 
2049 	*result = rth;
2050 	err = 0;
2051  cleanup:
2052 	return err;
2053 }
2054 
2055 static int ip_mkroute_input(struct sk_buff *skb,
2056 			    struct fib_result *res,
2057 			    const struct flowi *fl,
2058 			    struct in_device *in_dev,
2059 			    __be32 daddr, __be32 saddr, u32 tos)
2060 {
2061 	struct rtable* rth = NULL;
2062 	int err;
2063 	unsigned hash;
2064 
2065 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2066 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2067 		fib_select_multipath(fl, res);
2068 #endif
2069 
2070 	/* create a routing cache entry */
2071 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2072 	if (err)
2073 		return err;
2074 
2075 	/* put it into the cache */
2076 	hash = rt_hash(daddr, saddr, fl->iif,
2077 		       rt_genid(dev_net(rth->dst.dev)));
2078 	return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2079 }
2080 
2081 /*
2082  *	NOTE. We drop all packets that have a local source
2083  *	address, because every properly looped-back packet
2084  *	must already have the correct destination attached by the output routine.
2085  *
2086  *	This approach solves two big problems:
2087  *	1. Non-simplex devices are handled properly.
2088  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2089  *	called with rcu_read_lock()
2090  */
2091 
2092 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2093 			       u8 tos, struct net_device *dev)
2094 {
2095 	struct fib_result res;
2096 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2097 	struct flowi fl = { .nl_u = { .ip4_u =
2098 				      { .daddr = daddr,
2099 					.saddr = saddr,
2100 					.tos = tos,
2101 					.scope = RT_SCOPE_UNIVERSE,
2102 				      } },
2103 			    .mark = skb->mark,
2104 			    .iif = dev->ifindex };
2105 	unsigned	flags = 0;
2106 	u32		itag = 0;
2107 	struct rtable * rth;
2108 	unsigned	hash;
2109 	__be32		spec_dst;
2110 	int		err = -EINVAL;
2111 	struct net    * net = dev_net(dev);
2112 
2113 	/* IP on this device is disabled. */
2114 
2115 	if (!in_dev)
2116 		goto out;
2117 
2118 	/* Check for the most weird martians, which cannot be detected
2119 	   by fib_lookup.
2120 	 */
2121 
2122 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2123 	    ipv4_is_loopback(saddr))
2124 		goto martian_source;
2125 
2126 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2127 		goto brd_input;
2128 
2129 	/* Accept zero addresses only for limited broadcast;
2130 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2131 	 */
2132 	if (ipv4_is_zeronet(saddr))
2133 		goto martian_source;
2134 
2135 	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2136 		goto martian_destination;
2137 
2138 	/*
2139 	 *	Now we are ready to route the packet.
2140 	 */
2141 	err = fib_lookup(net, &fl, &res);
2142 	if (err != 0) {
2143 		if (!IN_DEV_FORWARD(in_dev))
2144 			goto e_hostunreach;
2145 		goto no_route;
2146 	}
2147 
2148 	RT_CACHE_STAT_INC(in_slow_tot);
2149 
2150 	if (res.type == RTN_BROADCAST)
2151 		goto brd_input;
2152 
2153 	if (res.type == RTN_LOCAL) {
2154 		err = fib_validate_source(saddr, daddr, tos,
2155 					  net->loopback_dev->ifindex,
2156 					  dev, &spec_dst, &itag, skb->mark);
2157 		if (err < 0)
2158 			goto martian_source_keep_err;
2159 		if (err)
2160 			flags |= RTCF_DIRECTSRC;
2161 		spec_dst = daddr;
2162 		goto local_input;
2163 	}
2164 
2165 	if (!IN_DEV_FORWARD(in_dev))
2166 		goto e_hostunreach;
2167 	if (res.type != RTN_UNICAST)
2168 		goto martian_destination;
2169 
2170 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2171 out:	return err;
2172 
2173 brd_input:
2174 	if (skb->protocol != htons(ETH_P_IP))
2175 		goto e_inval;
2176 
2177 	if (ipv4_is_zeronet(saddr))
2178 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2179 	else {
2180 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2181 					  &itag, skb->mark);
2182 		if (err < 0)
2183 			goto martian_source_keep_err;
2184 		if (err)
2185 			flags |= RTCF_DIRECTSRC;
2186 	}
2187 	flags |= RTCF_BROADCAST;
2188 	res.type = RTN_BROADCAST;
2189 	RT_CACHE_STAT_INC(in_brd);
2190 
2191 local_input:
2192 	rth = dst_alloc(&ipv4_dst_ops);
2193 	if (!rth)
2194 		goto e_nobufs;
2195 
2196 	rth->dst.output= ip_rt_bug;
2197 	rth->dst.obsolete = -1;
2198 	rth->rt_genid = rt_genid(net);
2199 
2200 	atomic_set(&rth->dst.__refcnt, 1);
2201 	rth->dst.flags= DST_HOST;
2202 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2203 		rth->dst.flags |= DST_NOPOLICY;
2204 	rth->fl.fl4_dst	= daddr;
2205 	rth->rt_dst	= daddr;
2206 	rth->fl.fl4_tos	= tos;
2207 	rth->fl.mark    = skb->mark;
2208 	rth->fl.fl4_src	= saddr;
2209 	rth->rt_src	= saddr;
2210 #ifdef CONFIG_NET_CLS_ROUTE
2211 	rth->dst.tclassid = itag;
2212 #endif
2213 	rth->rt_iif	=
2214 	rth->fl.iif	= dev->ifindex;
2215 	rth->dst.dev	= net->loopback_dev;
2216 	dev_hold(rth->dst.dev);
2217 	rth->idev	= in_dev_get(rth->dst.dev);
2218 	rth->rt_gateway	= daddr;
2219 	rth->rt_spec_dst= spec_dst;
2220 	rth->dst.input= ip_local_deliver;
2221 	rth->rt_flags 	= flags|RTCF_LOCAL;
2222 	if (res.type == RTN_UNREACHABLE) {
2223 		rth->dst.input= ip_error;
2224 		rth->dst.error= -err;
2225 		rth->rt_flags 	&= ~RTCF_LOCAL;
2226 	}
2227 	rth->rt_type	= res.type;
2228 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2229 	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2230 	goto out;
2231 
2232 no_route:
2233 	RT_CACHE_STAT_INC(in_no_route);
2234 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2235 	res.type = RTN_UNREACHABLE;
2236 	if (err == -ESRCH)
2237 		err = -ENETUNREACH;
2238 	goto local_input;
2239 
2240 	/*
2241 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2242 	 */
2243 martian_destination:
2244 	RT_CACHE_STAT_INC(in_martian_dst);
2245 #ifdef CONFIG_IP_ROUTE_VERBOSE
2246 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2247 		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2248 			&daddr, &saddr, dev->name);
2249 #endif
2250 
2251 e_hostunreach:
2252 	err = -EHOSTUNREACH;
2253 	goto out;
2254 
2255 e_inval:
2256 	err = -EINVAL;
2257 	goto out;
2258 
2259 e_nobufs:
2260 	err = -ENOBUFS;
2261 	goto out;
2262 
2263 martian_source:
2264 	err = -EINVAL;
2265 martian_source_keep_err:
2266 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2267 	goto out;
2268 }
2269 
2270 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2271 			   u8 tos, struct net_device *dev, bool noref)
2272 {
2273 	struct rtable * rth;
2274 	unsigned	hash;
2275 	int iif = dev->ifindex;
2276 	struct net *net;
2277 	int res;
2278 
2279 	net = dev_net(dev);
2280 
2281 	rcu_read_lock();
2282 
2283 	if (!rt_caching(net))
2284 		goto skip_cache;
2285 
2286 	tos &= IPTOS_RT_MASK;
2287 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2288 
2289 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2290 	     rth = rcu_dereference(rth->dst.rt_next)) {
2291 		if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2292 		     ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2293 		     (rth->fl.iif ^ iif) |
2294 		     rth->fl.oif |
2295 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
2296 		    rth->fl.mark == skb->mark &&
2297 		    net_eq(dev_net(rth->dst.dev), net) &&
2298 		    !rt_is_expired(rth)) {
2299 			if (noref) {
2300 				dst_use_noref(&rth->dst, jiffies);
2301 				skb_dst_set_noref(skb, &rth->dst);
2302 			} else {
2303 				dst_use(&rth->dst, jiffies);
2304 				skb_dst_set(skb, &rth->dst);
2305 			}
2306 			RT_CACHE_STAT_INC(in_hit);
2307 			rcu_read_unlock();
2308 			return 0;
2309 		}
2310 		RT_CACHE_STAT_INC(in_hlist_search);
2311 	}
2312 
2313 skip_cache:
2314 	/* Multicast recognition logic has moved from the route cache to here.
2315 	   The problem was that too many Ethernet cards have broken/missing
2316 	   hardware multicast filters :-( As a result, a host on a multicast
2317 	   network acquires a lot of useless route cache entries, e.g. from
2318 	   SDR messages from all over the world. Now we try to get rid of them.
2319 	   Really, provided the software IP multicast filter is organized
2320 	   reasonably (at least, hashed), it does not result in a slowdown
2321 	   compared with route cache reject entries.
2322 	   Note that multicast routers are not affected, because a
2323 	   route cache entry is created eventually.
2324 	 */
2325 	if (ipv4_is_multicast(daddr)) {
2326 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2327 
2328 		if (in_dev) {
2329 			int our = ip_check_mc(in_dev, daddr, saddr,
2330 					      ip_hdr(skb)->protocol);
2331 			if (our
2332 #ifdef CONFIG_IP_MROUTE
2333 				||
2334 			    (!ipv4_is_local_multicast(daddr) &&
2335 			     IN_DEV_MFORWARD(in_dev))
2336 #endif
2337 			   ) {
2338 				int res = ip_route_input_mc(skb, daddr, saddr,
2339 							    tos, dev, our);
2340 				rcu_read_unlock();
2341 				return res;
2342 			}
2343 		}
2344 		rcu_read_unlock();
2345 		return -EINVAL;
2346 	}
2347 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2348 	rcu_read_unlock();
2349 	return res;
2350 }
2351 EXPORT_SYMBOL(ip_route_input_common);
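/* Illustrative caller sketch (an assumption-laden example, not part of this
 * file): a receive path typically hands an skb to input routing roughly as
 * below.  ip_route_input() is assumed to be the usual wrapper that lands in
 * ip_route_input_common() with noref == false, as its use in
 * inet_rtm_getroute() later in this file suggests.
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev);
 *
 *	if (err)
 *		goto drop;	// no route, martian source, -ENOBUFS, ...
 *	// on success skb_dst(skb) points at the cached or freshly built rtable
 */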
2352 
2353 /* called with rcu_read_lock() */
2354 static int __mkroute_output(struct rtable **result,
2355 			    struct fib_result *res,
2356 			    const struct flowi *fl,
2357 			    const struct flowi *oldflp,
2358 			    struct net_device *dev_out,
2359 			    unsigned flags)
2360 {
2361 	struct rtable *rth;
2362 	struct in_device *in_dev;
2363 	u32 tos = RT_FL_TOS(oldflp);
2364 
2365 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2366 		return -EINVAL;
2367 
2368 	if (ipv4_is_lbcast(fl->fl4_dst))
2369 		res->type = RTN_BROADCAST;
2370 	else if (ipv4_is_multicast(fl->fl4_dst))
2371 		res->type = RTN_MULTICAST;
2372 	else if (ipv4_is_zeronet(fl->fl4_dst))
2373 		return -EINVAL;
2374 
2375 	if (dev_out->flags & IFF_LOOPBACK)
2376 		flags |= RTCF_LOCAL;
2377 
2378 	in_dev = __in_dev_get_rcu(dev_out);
2379 	if (!in_dev)
2380 		return -EINVAL;
2381 
2382 	if (res->type == RTN_BROADCAST) {
2383 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2384 		res->fi = NULL;
2385 	} else if (res->type == RTN_MULTICAST) {
2386 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2387 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2388 				 oldflp->proto))
2389 			flags &= ~RTCF_LOCAL;
2390 		/* If a multicast route does not exist, use
2391 		 * the default one, but do not gateway in this case.
2392 		 * Yes, it is a hack.
2393 		 */
2394 		if (res->fi && res->prefixlen < 4)
2395 			res->fi = NULL;
2396 	}
2397 
2398 
2399 	rth = dst_alloc(&ipv4_dst_ops);
2400 	if (!rth)
2401 		return -ENOBUFS;
2402 
2403 	in_dev_hold(in_dev);
2404 	rth->idev = in_dev;
2405 
2406 	atomic_set(&rth->dst.__refcnt, 1);
2407 	rth->dst.flags= DST_HOST;
2408 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2409 		rth->dst.flags |= DST_NOXFRM;
2410 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2411 		rth->dst.flags |= DST_NOPOLICY;
2412 
2413 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2414 	rth->fl.fl4_tos	= tos;
2415 	rth->fl.fl4_src	= oldflp->fl4_src;
2416 	rth->fl.oif	= oldflp->oif;
2417 	rth->fl.mark    = oldflp->mark;
2418 	rth->rt_dst	= fl->fl4_dst;
2419 	rth->rt_src	= fl->fl4_src;
2420 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2421 	/* get references to the devices that are to be held by the routing
2422 	   cache entry */
2423 	rth->dst.dev	= dev_out;
2424 	dev_hold(dev_out);
2425 	rth->rt_gateway = fl->fl4_dst;
2426 	rth->rt_spec_dst= fl->fl4_src;
2427 
2428 	rth->dst.output=ip_output;
2429 	rth->dst.obsolete = -1;
2430 	rth->rt_genid = rt_genid(dev_net(dev_out));
2431 
2432 	RT_CACHE_STAT_INC(out_slow_tot);
2433 
2434 	if (flags & RTCF_LOCAL) {
2435 		rth->dst.input = ip_local_deliver;
2436 		rth->rt_spec_dst = fl->fl4_dst;
2437 	}
2438 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2439 		rth->rt_spec_dst = fl->fl4_src;
2440 		if (flags & RTCF_LOCAL &&
2441 		    !(dev_out->flags & IFF_LOOPBACK)) {
2442 			rth->dst.output = ip_mc_output;
2443 			RT_CACHE_STAT_INC(out_slow_mc);
2444 		}
2445 #ifdef CONFIG_IP_MROUTE
2446 		if (res->type == RTN_MULTICAST) {
2447 			if (IN_DEV_MFORWARD(in_dev) &&
2448 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2449 				rth->dst.input = ip_mr_input;
2450 				rth->dst.output = ip_mc_output;
2451 			}
2452 		}
2453 #endif
2454 	}
2455 
2456 	rt_set_nexthop(rth, res, 0);
2457 
2458 	rth->rt_flags = flags;
2459 	*result = rth;
2460 	return 0;
2461 }
2462 
2463 /* called with rcu_read_lock() */
2464 static int ip_mkroute_output(struct rtable **rp,
2465 			     struct fib_result *res,
2466 			     const struct flowi *fl,
2467 			     const struct flowi *oldflp,
2468 			     struct net_device *dev_out,
2469 			     unsigned flags)
2470 {
2471 	struct rtable *rth = NULL;
2472 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2473 	unsigned hash;
2474 	if (err == 0) {
2475 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2476 			       rt_genid(dev_net(dev_out)));
2477 		err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2478 	}
2479 
2480 	return err;
2481 }
2482 
2483 /*
2484  * Major route resolver routine.
2485  * called with rcu_read_lock();
2486  */
2487 
2488 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2489 				const struct flowi *oldflp)
2490 {
2491 	u32 tos	= RT_FL_TOS(oldflp);
2492 	struct flowi fl = { .nl_u = { .ip4_u =
2493 				      { .daddr = oldflp->fl4_dst,
2494 					.saddr = oldflp->fl4_src,
2495 					.tos = tos & IPTOS_RT_MASK,
2496 					.scope = ((tos & RTO_ONLINK) ?
2497 						  RT_SCOPE_LINK :
2498 						  RT_SCOPE_UNIVERSE),
2499 				      } },
2500 			    .mark = oldflp->mark,
2501 			    .iif = net->loopback_dev->ifindex,
2502 			    .oif = oldflp->oif };
2503 	struct fib_result res;
2504 	unsigned int flags = 0;
2505 	struct net_device *dev_out = NULL;
2506 	int err;
2507 
2508 
2509 	res.fi		= NULL;
2510 #ifdef CONFIG_IP_MULTIPLE_TABLES
2511 	res.r		= NULL;
2512 #endif
2513 
2514 	if (oldflp->fl4_src) {
2515 		err = -EINVAL;
2516 		if (ipv4_is_multicast(oldflp->fl4_src) ||
2517 		    ipv4_is_lbcast(oldflp->fl4_src) ||
2518 		    ipv4_is_zeronet(oldflp->fl4_src))
2519 			goto out;
2520 
2521 		/* I removed the check for oif == dev_out->oif here.
2522 		   It was wrong for two reasons:
2523 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2524 		      is assigned to multiple interfaces.
2525 		   2. Moreover, we are allowed to send packets with the saddr
2526 		      of another iface. --ANK
2527 		 */
2528 
2529 		if (oldflp->oif == 0 &&
2530 		    (ipv4_is_multicast(oldflp->fl4_dst) ||
2531 		     ipv4_is_lbcast(oldflp->fl4_dst))) {
2532 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2533 			dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2534 			if (dev_out == NULL)
2535 				goto out;
2536 
2537 			/* Special hack: the user can direct multicasts
2538 			   and limited broadcast via the necessary interface
2539 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2540 			   This hack is not just for fun, it allows
2541 			   vic, vat and friends to work.
2542 			   They bind a socket to loopback, set the ttl to zero
2543 			   and expect that it will work.
2544 			   From the viewpoint of the routing cache they are broken,
2545 			   because we are not allowed to build a multicast path
2546 			   with a loopback source addr (look, the routing cache
2547 			   cannot know that the ttl is zero, so that the packet
2548 			   will not leave this host and the route is valid).
2549 			   Luckily, this hack is a good workaround.
2550 			 */
2551 
2552 			fl.oif = dev_out->ifindex;
2553 			goto make_route;
2554 		}
2555 
2556 		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2557 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2558 			if (!__ip_dev_find(net, oldflp->fl4_src, false))
2559 				goto out;
2560 		}
2561 	}
2562 
2563 
2564 	if (oldflp->oif) {
2565 		dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2566 		err = -ENODEV;
2567 		if (dev_out == NULL)
2568 			goto out;
2569 
2570 		/* RACE: Check return value of inet_select_addr instead. */
2571 		if (rcu_dereference(dev_out->ip_ptr) == NULL)
2572 			goto out;	/* Wrong error code */
2573 
2574 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2575 		    ipv4_is_lbcast(oldflp->fl4_dst)) {
2576 			if (!fl.fl4_src)
2577 				fl.fl4_src = inet_select_addr(dev_out, 0,
2578 							      RT_SCOPE_LINK);
2579 			goto make_route;
2580 		}
2581 		if (!fl.fl4_src) {
2582 			if (ipv4_is_multicast(oldflp->fl4_dst))
2583 				fl.fl4_src = inet_select_addr(dev_out, 0,
2584 							      fl.fl4_scope);
2585 			else if (!oldflp->fl4_dst)
2586 				fl.fl4_src = inet_select_addr(dev_out, 0,
2587 							      RT_SCOPE_HOST);
2588 		}
2589 	}
2590 
2591 	if (!fl.fl4_dst) {
2592 		fl.fl4_dst = fl.fl4_src;
2593 		if (!fl.fl4_dst)
2594 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2595 		dev_out = net->loopback_dev;
2596 		fl.oif = net->loopback_dev->ifindex;
2597 		res.type = RTN_LOCAL;
2598 		flags |= RTCF_LOCAL;
2599 		goto make_route;
2600 	}
2601 
2602 	if (fib_lookup(net, &fl, &res)) {
2603 		res.fi = NULL;
2604 		if (oldflp->oif) {
2605 			/* Apparently, the routing tables are wrong. Assume
2606 			   that the destination is on-link.
2607 
2608 			   WHY? DW.
2609 			   Because we are allowed to send to an iface
2610 			   even if it has NO routes and NO assigned
2611 			   addresses. When oif is specified, the routing
2612 			   tables are looked up with only one purpose:
2613 			   to catch whether the destination is gatewayed, rather
2614 			   than direct. Moreover, if MSG_DONTROUTE is set,
2615 			   we send the packet, ignoring both the routing tables
2616 			   and ifaddr state. --ANK
2617 
2618 
2619 			   We could do this even if oif is unknown,
2620 			   likely for IPv6, but we do not.
2621 			 */
2622 
2623 			if (fl.fl4_src == 0)
2624 				fl.fl4_src = inet_select_addr(dev_out, 0,
2625 							      RT_SCOPE_LINK);
2626 			res.type = RTN_UNICAST;
2627 			goto make_route;
2628 		}
2629 		err = -ENETUNREACH;
2630 		goto out;
2631 	}
2632 
2633 	if (res.type == RTN_LOCAL) {
2634 		if (!fl.fl4_src)
2635 			fl.fl4_src = fl.fl4_dst;
2636 		dev_out = net->loopback_dev;
2637 		fl.oif = dev_out->ifindex;
2638 		res.fi = NULL;
2639 		flags |= RTCF_LOCAL;
2640 		goto make_route;
2641 	}
2642 
2643 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2644 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2645 		fib_select_multipath(&fl, &res);
2646 	else
2647 #endif
2648 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2649 		fib_select_default(net, &fl, &res);
2650 
2651 	if (!fl.fl4_src)
2652 		fl.fl4_src = FIB_RES_PREFSRC(res);
2653 
2654 	dev_out = FIB_RES_DEV(res);
2655 	fl.oif = dev_out->ifindex;
2656 
2657 
2658 make_route:
2659 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2660 
2661 out:	return err;
2662 }
2663 
2664 int __ip_route_output_key(struct net *net, struct rtable **rp,
2665 			  const struct flowi *flp)
2666 {
2667 	unsigned int hash;
2668 	int res;
2669 	struct rtable *rth;
2670 
2671 	if (!rt_caching(net))
2672 		goto slow_output;
2673 
2674 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2675 
2676 	rcu_read_lock_bh();
2677 	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2678 		rth = rcu_dereference_bh(rth->dst.rt_next)) {
2679 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2680 		    rth->fl.fl4_src == flp->fl4_src &&
2681 		    rth->fl.iif == 0 &&
2682 		    rth->fl.oif == flp->oif &&
2683 		    rth->fl.mark == flp->mark &&
2684 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2685 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2686 		    net_eq(dev_net(rth->dst.dev), net) &&
2687 		    !rt_is_expired(rth)) {
2688 			dst_use(&rth->dst, jiffies);
2689 			RT_CACHE_STAT_INC(out_hit);
2690 			rcu_read_unlock_bh();
2691 			*rp = rth;
2692 			return 0;
2693 		}
2694 		RT_CACHE_STAT_INC(out_hlist_search);
2695 	}
2696 	rcu_read_unlock_bh();
2697 
2698 slow_output:
2699 	rcu_read_lock();
2700 	res = ip_route_output_slow(net, rp, flp);
2701 	rcu_read_unlock();
2702 	return res;
2703 }
2704 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2705 
2706 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2707 {
2708 	return NULL;
2709 }
2710 
2711 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2712 {
2713 }
2714 
2715 static struct dst_ops ipv4_dst_blackhole_ops = {
2716 	.family			=	AF_INET,
2717 	.protocol		=	cpu_to_be16(ETH_P_IP),
2718 	.destroy		=	ipv4_dst_destroy,
2719 	.check			=	ipv4_blackhole_dst_check,
2720 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2721 };
2722 
2723 
2724 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2725 {
2726 	struct rtable *ort = *rp;
2727 	struct rtable *rt = (struct rtable *)
2728 		dst_alloc(&ipv4_dst_blackhole_ops);
2729 
2730 	if (rt) {
2731 		struct dst_entry *new = &rt->dst;
2732 
2733 		atomic_set(&new->__refcnt, 1);
2734 		new->__use = 1;
2735 		new->input = dst_discard;
2736 		new->output = dst_discard;
2737 		memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
2738 
2739 		new->dev = ort->dst.dev;
2740 		if (new->dev)
2741 			dev_hold(new->dev);
2742 
2743 		rt->fl = ort->fl;
2744 
2745 		rt->idev = ort->idev;
2746 		if (rt->idev)
2747 			in_dev_hold(rt->idev);
2748 		rt->rt_genid = rt_genid(net);
2749 		rt->rt_flags = ort->rt_flags;
2750 		rt->rt_type = ort->rt_type;
2751 		rt->rt_dst = ort->rt_dst;
2752 		rt->rt_src = ort->rt_src;
2753 		rt->rt_iif = ort->rt_iif;
2754 		rt->rt_gateway = ort->rt_gateway;
2755 		rt->rt_spec_dst = ort->rt_spec_dst;
2756 		rt->peer = ort->peer;
2757 		if (rt->peer)
2758 			atomic_inc(&rt->peer->refcnt);
2759 
2760 		dst_free(new);
2761 	}
2762 
2763 	dst_release(&(*rp)->dst);
2764 	*rp = rt;
2765 	return rt ? 0 : -ENOMEM;
2766 }
2767 
2768 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2769 			 struct sock *sk, int flags)
2770 {
2771 	int err;
2772 
2773 	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2774 		return err;
2775 
2776 	if (flp->proto) {
2777 		if (!flp->fl4_src)
2778 			flp->fl4_src = (*rp)->rt_src;
2779 		if (!flp->fl4_dst)
2780 			flp->fl4_dst = (*rp)->rt_dst;
2781 		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2782 				    flags ? XFRM_LOOKUP_WAIT : 0);
2783 		if (err == -EREMOTE)
2784 			err = ipv4_dst_blackhole(net, rp, flp);
2785 
2786 		return err;
2787 	}
2788 
2789 	return 0;
2790 }
2791 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2792 
2793 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2794 {
2795 	return ip_route_output_flow(net, rp, flp, NULL, 0);
2796 }
2797 EXPORT_SYMBOL(ip_route_output_key);
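/* Illustrative caller sketch (not part of this file): a minimal output route
 * lookup keyed only by destination, following the struct flowi layout used by
 * inet_rtm_getroute() below.  Error handling beyond the return code and the
 * choice of &init_net are assumptions of the sketch.
 *
 *	struct rtable *rt;
 *	struct flowi fl = {
 *		.nl_u = { .ip4_u = { .daddr = daddr } },
 *	};
 *
 *	if (ip_route_output_key(&init_net, &rt, &fl) == 0) {
 *		// rt->rt_gateway and rt->dst.dev describe the chosen path
 *		ip_rt_put(rt);	// drop the reference taken by the lookup
 *	}
 */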
2798 
2799 static int rt_fill_info(struct net *net,
2800 			struct sk_buff *skb, u32 pid, u32 seq, int event,
2801 			int nowait, unsigned int flags)
2802 {
2803 	struct rtable *rt = skb_rtable(skb);
2804 	struct rtmsg *r;
2805 	struct nlmsghdr *nlh;
2806 	long expires;
2807 	u32 id = 0, ts = 0, tsage = 0, error;
2808 
2809 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2810 	if (nlh == NULL)
2811 		return -EMSGSIZE;
2812 
2813 	r = nlmsg_data(nlh);
2814 	r->rtm_family	 = AF_INET;
2815 	r->rtm_dst_len	= 32;
2816 	r->rtm_src_len	= 0;
2817 	r->rtm_tos	= rt->fl.fl4_tos;
2818 	r->rtm_table	= RT_TABLE_MAIN;
2819 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2820 	r->rtm_type	= rt->rt_type;
2821 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2822 	r->rtm_protocol = RTPROT_UNSPEC;
2823 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2824 	if (rt->rt_flags & RTCF_NOTIFY)
2825 		r->rtm_flags |= RTM_F_NOTIFY;
2826 
2827 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2828 
2829 	if (rt->fl.fl4_src) {
2830 		r->rtm_src_len = 32;
2831 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2832 	}
2833 	if (rt->dst.dev)
2834 		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2835 #ifdef CONFIG_NET_CLS_ROUTE
2836 	if (rt->dst.tclassid)
2837 		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2838 #endif
2839 	if (rt->fl.iif)
2840 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2841 	else if (rt->rt_src != rt->fl.fl4_src)
2842 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2843 
2844 	if (rt->rt_dst != rt->rt_gateway)
2845 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2846 
2847 	if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
2848 		goto nla_put_failure;
2849 
2850 	if (rt->fl.mark)
2851 		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
2852 
2853 	error = rt->dst.error;
2854 	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
2855 	if (rt->peer) {
2856 		inet_peer_refcheck(rt->peer);
2857 		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2858 		if (rt->peer->tcp_ts_stamp) {
2859 			ts = rt->peer->tcp_ts;
2860 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2861 		}
2862 	}
2863 
2864 	if (rt->fl.iif) {
2865 #ifdef CONFIG_IP_MROUTE
2866 		__be32 dst = rt->rt_dst;
2867 
2868 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2869 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2870 			int err = ipmr_get_route(net, skb, r, nowait);
2871 			if (err <= 0) {
2872 				if (!nowait) {
2873 					if (err == 0)
2874 						return 0;
2875 					goto nla_put_failure;
2876 				} else {
2877 					if (err == -EMSGSIZE)
2878 						goto nla_put_failure;
2879 					error = err;
2880 				}
2881 			}
2882 		} else
2883 #endif
2884 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2885 	}
2886 
2887 	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2888 			       expires, error) < 0)
2889 		goto nla_put_failure;
2890 
2891 	return nlmsg_end(skb, nlh);
2892 
2893 nla_put_failure:
2894 	nlmsg_cancel(skb, nlh);
2895 	return -EMSGSIZE;
2896 }
2897 
2898 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2899 {
2900 	struct net *net = sock_net(in_skb->sk);
2901 	struct rtmsg *rtm;
2902 	struct nlattr *tb[RTA_MAX+1];
2903 	struct rtable *rt = NULL;
2904 	__be32 dst = 0;
2905 	__be32 src = 0;
2906 	u32 iif;
2907 	int err;
2908 	int mark;
2909 	struct sk_buff *skb;
2910 
2911 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2912 	if (err < 0)
2913 		goto errout;
2914 
2915 	rtm = nlmsg_data(nlh);
2916 
2917 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2918 	if (skb == NULL) {
2919 		err = -ENOBUFS;
2920 		goto errout;
2921 	}
2922 
2923 	/* Reserve room for dummy headers; this skb can pass
2924 	   through a good chunk of the routing engine.
2925 	 */
2926 	skb_reset_mac_header(skb);
2927 	skb_reset_network_header(skb);
2928 
2929 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2930 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2931 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2932 
2933 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2934 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2935 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2936 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2937 
2938 	if (iif) {
2939 		struct net_device *dev;
2940 
2941 		dev = __dev_get_by_index(net, iif);
2942 		if (dev == NULL) {
2943 			err = -ENODEV;
2944 			goto errout_free;
2945 		}
2946 
2947 		skb->protocol	= htons(ETH_P_IP);
2948 		skb->dev	= dev;
2949 		skb->mark	= mark;
2950 		local_bh_disable();
2951 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2952 		local_bh_enable();
2953 
2954 		rt = skb_rtable(skb);
2955 		if (err == 0 && rt->dst.error)
2956 			err = -rt->dst.error;
2957 	} else {
2958 		struct flowi fl = {
2959 			.nl_u = {
2960 				.ip4_u = {
2961 					.daddr = dst,
2962 					.saddr = src,
2963 					.tos = rtm->rtm_tos,
2964 				},
2965 			},
2966 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2967 			.mark = mark,
2968 		};
2969 		err = ip_route_output_key(net, &rt, &fl);
2970 	}
2971 
2972 	if (err)
2973 		goto errout_free;
2974 
2975 	skb_dst_set(skb, &rt->dst);
2976 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2977 		rt->rt_flags |= RTCF_NOTIFY;
2978 
2979 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2980 			   RTM_NEWROUTE, 0, 0);
2981 	if (err <= 0)
2982 		goto errout_free;
2983 
2984 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2985 errout:
2986 	return err;
2987 
2988 errout_free:
2989 	kfree_skb(skb);
2990 	goto errout;
2991 }
2992 
2993 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2994 {
2995 	struct rtable *rt;
2996 	int h, s_h;
2997 	int idx, s_idx;
2998 	struct net *net;
2999 
3000 	net = sock_net(skb->sk);
3001 
3002 	s_h = cb->args[0];
3003 	if (s_h < 0)
3004 		s_h = 0;
3005 	s_idx = idx = cb->args[1];
3006 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3007 		if (!rt_hash_table[h].chain)
3008 			continue;
3009 		rcu_read_lock_bh();
3010 		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3011 		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3012 			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3013 				continue;
3014 			if (rt_is_expired(rt))
3015 				continue;
3016 			skb_dst_set_noref(skb, &rt->dst);
3017 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3018 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3019 					 1, NLM_F_MULTI) <= 0) {
3020 				skb_dst_drop(skb);
3021 				rcu_read_unlock_bh();
3022 				goto done;
3023 			}
3024 			skb_dst_drop(skb);
3025 		}
3026 		rcu_read_unlock_bh();
3027 	}
3028 
3029 done:
3030 	cb->args[0] = h;
3031 	cb->args[1] = idx;
3032 	return skb->len;
3033 }
3034 
3035 void ip_rt_multicast_event(struct in_device *in_dev)
3036 {
3037 	rt_cache_flush(dev_net(in_dev->dev), 0);
3038 }
3039 
3040 #ifdef CONFIG_SYSCTL
3041 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3042 					void __user *buffer,
3043 					size_t *lenp, loff_t *ppos)
3044 {
3045 	if (write) {
3046 		int flush_delay;
3047 		ctl_table ctl;
3048 		struct net *net;
3049 
3050 		memcpy(&ctl, __ctl, sizeof(ctl));
3051 		ctl.data = &flush_delay;
3052 		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3053 
3054 		net = (struct net *)__ctl->extra1;
3055 		rt_cache_flush(net, flush_delay);
3056 		return 0;
3057 	}
3058 
3059 	return -EINVAL;
3060 }
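/* Usage note (illustrative, not part of this file): the handler above backs
 * the write-only (mode 0200) "flush" entry registered per namespace below,
 * so e.g.
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * parses the written value as a flush delay and passes it straight to
 * rt_cache_flush() for the owning struct net.
 */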
3061 
3062 static ctl_table ipv4_route_table[] = {
3063 	{
3064 		.procname	= "gc_thresh",
3065 		.data		= &ipv4_dst_ops.gc_thresh,
3066 		.maxlen		= sizeof(int),
3067 		.mode		= 0644,
3068 		.proc_handler	= proc_dointvec,
3069 	},
3070 	{
3071 		.procname	= "max_size",
3072 		.data		= &ip_rt_max_size,
3073 		.maxlen		= sizeof(int),
3074 		.mode		= 0644,
3075 		.proc_handler	= proc_dointvec,
3076 	},
3077 	{
3078 		/*  Deprecated. Use gc_min_interval_ms */
3079 
3080 		.procname	= "gc_min_interval",
3081 		.data		= &ip_rt_gc_min_interval,
3082 		.maxlen		= sizeof(int),
3083 		.mode		= 0644,
3084 		.proc_handler	= proc_dointvec_jiffies,
3085 	},
3086 	{
3087 		.procname	= "gc_min_interval_ms",
3088 		.data		= &ip_rt_gc_min_interval,
3089 		.maxlen		= sizeof(int),
3090 		.mode		= 0644,
3091 		.proc_handler	= proc_dointvec_ms_jiffies,
3092 	},
3093 	{
3094 		.procname	= "gc_timeout",
3095 		.data		= &ip_rt_gc_timeout,
3096 		.maxlen		= sizeof(int),
3097 		.mode		= 0644,
3098 		.proc_handler	= proc_dointvec_jiffies,
3099 	},
3100 	{
3101 		.procname	= "gc_interval",
3102 		.data		= &ip_rt_gc_interval,
3103 		.maxlen		= sizeof(int),
3104 		.mode		= 0644,
3105 		.proc_handler	= proc_dointvec_jiffies,
3106 	},
3107 	{
3108 		.procname	= "redirect_load",
3109 		.data		= &ip_rt_redirect_load,
3110 		.maxlen		= sizeof(int),
3111 		.mode		= 0644,
3112 		.proc_handler	= proc_dointvec,
3113 	},
3114 	{
3115 		.procname	= "redirect_number",
3116 		.data		= &ip_rt_redirect_number,
3117 		.maxlen		= sizeof(int),
3118 		.mode		= 0644,
3119 		.proc_handler	= proc_dointvec,
3120 	},
3121 	{
3122 		.procname	= "redirect_silence",
3123 		.data		= &ip_rt_redirect_silence,
3124 		.maxlen		= sizeof(int),
3125 		.mode		= 0644,
3126 		.proc_handler	= proc_dointvec,
3127 	},
3128 	{
3129 		.procname	= "error_cost",
3130 		.data		= &ip_rt_error_cost,
3131 		.maxlen		= sizeof(int),
3132 		.mode		= 0644,
3133 		.proc_handler	= proc_dointvec,
3134 	},
3135 	{
3136 		.procname	= "error_burst",
3137 		.data		= &ip_rt_error_burst,
3138 		.maxlen		= sizeof(int),
3139 		.mode		= 0644,
3140 		.proc_handler	= proc_dointvec,
3141 	},
3142 	{
3143 		.procname	= "gc_elasticity",
3144 		.data		= &ip_rt_gc_elasticity,
3145 		.maxlen		= sizeof(int),
3146 		.mode		= 0644,
3147 		.proc_handler	= proc_dointvec,
3148 	},
3149 	{
3150 		.procname	= "mtu_expires",
3151 		.data		= &ip_rt_mtu_expires,
3152 		.maxlen		= sizeof(int),
3153 		.mode		= 0644,
3154 		.proc_handler	= proc_dointvec_jiffies,
3155 	},
3156 	{
3157 		.procname	= "min_pmtu",
3158 		.data		= &ip_rt_min_pmtu,
3159 		.maxlen		= sizeof(int),
3160 		.mode		= 0644,
3161 		.proc_handler	= proc_dointvec,
3162 	},
3163 	{
3164 		.procname	= "min_adv_mss",
3165 		.data		= &ip_rt_min_advmss,
3166 		.maxlen		= sizeof(int),
3167 		.mode		= 0644,
3168 		.proc_handler	= proc_dointvec,
3169 	},
3170 	{ }
3171 };
3172 
3173 static struct ctl_table empty[1];
3174 
3175 static struct ctl_table ipv4_skeleton[] =
3176 {
3177 	{ .procname = "route",
3178 	  .mode = 0555, .child = ipv4_route_table},
3179 	{ .procname = "neigh",
3180 	  .mode = 0555, .child = empty},
3181 	{ }
3182 };
3183 
3184 static __net_initdata struct ctl_path ipv4_path[] = {
3185 	{ .procname = "net", },
3186 	{ .procname = "ipv4", },
3187 	{ },
3188 };
3189 
3190 static struct ctl_table ipv4_route_flush_table[] = {
3191 	{
3192 		.procname	= "flush",
3193 		.maxlen		= sizeof(int),
3194 		.mode		= 0200,
3195 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3196 	},
3197 	{ },
3198 };
3199 
3200 static __net_initdata struct ctl_path ipv4_route_path[] = {
3201 	{ .procname = "net", },
3202 	{ .procname = "ipv4", },
3203 	{ .procname = "route", },
3204 	{ },
3205 };
3206 
3207 static __net_init int sysctl_route_net_init(struct net *net)
3208 {
3209 	struct ctl_table *tbl;
3210 
3211 	tbl = ipv4_route_flush_table;
3212 	if (!net_eq(net, &init_net)) {
3213 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3214 		if (tbl == NULL)
3215 			goto err_dup;
3216 	}
3217 	tbl[0].extra1 = net;
3218 
3219 	net->ipv4.route_hdr =
3220 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3221 	if (net->ipv4.route_hdr == NULL)
3222 		goto err_reg;
3223 	return 0;
3224 
3225 err_reg:
3226 	if (tbl != ipv4_route_flush_table)
3227 		kfree(tbl);
3228 err_dup:
3229 	return -ENOMEM;
3230 }
3231 
3232 static __net_exit void sysctl_route_net_exit(struct net *net)
3233 {
3234 	struct ctl_table *tbl;
3235 
3236 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3237 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3238 	BUG_ON(tbl == ipv4_route_flush_table);
3239 	kfree(tbl);
3240 }
3241 
3242 static __net_initdata struct pernet_operations sysctl_route_ops = {
3243 	.init = sysctl_route_net_init,
3244 	.exit = sysctl_route_net_exit,
3245 };
3246 #endif
3247 
3248 static __net_init int rt_genid_init(struct net *net)
3249 {
3250 	get_random_bytes(&net->ipv4.rt_genid,
3251 			 sizeof(net->ipv4.rt_genid));
3252 	return 0;
3253 }
3254 
3255 static __net_initdata struct pernet_operations rt_genid_ops = {
3256 	.init = rt_genid_init,
3257 };
3258 
3259 
3260 #ifdef CONFIG_NET_CLS_ROUTE
3261 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3262 #endif /* CONFIG_NET_CLS_ROUTE */
3263 
3264 static __initdata unsigned long rhash_entries;
3265 static int __init set_rhash_entries(char *str)
3266 {
3267 	if (!str)
3268 		return 0;
3269 	rhash_entries = simple_strtoul(str, &str, 0);
3270 	return 1;
3271 }
3272 __setup("rhash_entries=", set_rhash_entries);
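/* Usage note (illustrative, not part of this file): the early parameter parsed
 * above sizes the route cache hash at boot, e.g.
 *
 *	linux ... rhash_entries=262144
 *
 * The value is handed to alloc_large_system_hash() in ip_rt_init() below;
 * leaving it at 0 lets the allocator choose a size based on available memory.
 */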
3273 
3274 int __init ip_rt_init(void)
3275 {
3276 	int rc = 0;
3277 
3278 #ifdef CONFIG_NET_CLS_ROUTE
3279 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3280 	if (!ip_rt_acct)
3281 		panic("IP: failed to allocate ip_rt_acct\n");
3282 #endif
3283 
3284 	ipv4_dst_ops.kmem_cachep =
3285 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3286 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3287 
3288 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3289 
3290 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3291 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3292 
3293 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3294 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3295 
3296 	rt_hash_table = (struct rt_hash_bucket *)
3297 		alloc_large_system_hash("IP route cache",
3298 					sizeof(struct rt_hash_bucket),
3299 					rhash_entries,
3300 					(totalram_pages >= 128 * 1024) ?
3301 					15 : 17,
3302 					0,
3303 					&rt_hash_log,
3304 					&rt_hash_mask,
3305 					rhash_entries ? 0 : 512 * 1024);
3306 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3307 	rt_hash_lock_init();
3308 
3309 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3310 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3311 
3312 	devinet_init();
3313 	ip_fib_init();
3314 
3315 	/* All the timers started at system startup tend
3316 	   to synchronize. Perturb it a bit.
3317 	 */
3318 	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3319 	expires_ljiffies = jiffies;
3320 	schedule_delayed_work(&expires_work,
3321 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3322 
3323 	if (ip_rt_proc_init())
3324 		printk(KERN_ERR "Unable to create route proc files\n");
3325 #ifdef CONFIG_XFRM
3326 	xfrm_init();
3327 	xfrm4_init(ip_rt_max_size);
3328 #endif
3329 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3330 
3331 #ifdef CONFIG_SYSCTL
3332 	register_pernet_subsys(&sysctl_route_ops);
3333 #endif
3334 	register_pernet_subsys(&rt_genid_ops);
3335 	return rc;
3336 }
3337 
3338 #ifdef CONFIG_SYSCTL
3339 /*
3340  * We really need to sanitize the damn ipv4 init order, then all
3341  * this nonsense will go away.
3342  */
3343 void __init ip_static_sysctl_init(void)
3344 {
3345 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3346 }
3347 #endif
3348