xref: /linux/net/ipv4/route.c (revision d39d0ed196aa1685bb24771e92f78633c66ac9cb)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD
35  *					though our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 
112 #define RT_FL_TOS(oldflp) \
113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 
115 #define IP_MAX_MTU	0xFFF0
116 
117 #define RT_GC_TIMEOUT (300*HZ)
118 
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
123 static int ip_rt_redirect_number __read_mostly	= 9;
124 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly	= HZ;
127 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly	= 8;
129 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly	= 256;
132 static int rt_chain_length_max __read_mostly	= 20;
133 
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136 
137 /*
138  *	Interface to generic destination cache.
139  */
140 
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void		 ipv4_dst_destroy(struct dst_entry *dst);
143 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
144 					 struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void		 ipv4_link_failure(struct sk_buff *skb);
147 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149 
150 
151 static struct dst_ops ipv4_dst_ops = {
152 	.family =		AF_INET,
153 	.protocol =		cpu_to_be16(ETH_P_IP),
154 	.gc =			rt_garbage_collect,
155 	.check =		ipv4_dst_check,
156 	.destroy =		ipv4_dst_destroy,
157 	.ifdown =		ipv4_dst_ifdown,
158 	.negative_advice =	ipv4_negative_advice,
159 	.link_failure =		ipv4_link_failure,
160 	.update_pmtu =		ip_rt_update_pmtu,
161 	.local_out =		__ip_local_out,
162 	.entries =		ATOMIC_INIT(0),
163 };
164 
165 #define ECN_OR_COST(class)	TC_PRIO_##class
166 
167 const __u8 ip_tos2prio[16] = {
168 	TC_PRIO_BESTEFFORT,
169 	ECN_OR_COST(FILLER),
170 	TC_PRIO_BESTEFFORT,
171 	ECN_OR_COST(BESTEFFORT),
172 	TC_PRIO_BULK,
173 	ECN_OR_COST(BULK),
174 	TC_PRIO_BULK,
175 	ECN_OR_COST(BULK),
176 	TC_PRIO_INTERACTIVE,
177 	ECN_OR_COST(INTERACTIVE),
178 	TC_PRIO_INTERACTIVE,
179 	ECN_OR_COST(INTERACTIVE),
180 	TC_PRIO_INTERACTIVE_BULK,
181 	ECN_OR_COST(INTERACTIVE_BULK),
182 	TC_PRIO_INTERACTIVE_BULK,
183 	ECN_OR_COST(INTERACTIVE_BULK)
184 };
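/*
 * Usage note (a sketch, not new behaviour): rt_tos2priority() in
 * <net/route.h> indexes this table with IPTOS_TOS(tos) >> 1, so e.g.
 * IPTOS_LOWDELAY (0x10) selects index 8, TC_PRIO_INTERACTIVE, while the
 * odd ECN_OR_COST() slots cover TOS values with the 0x02 bit set.
 */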
185 
186 
187 /*
188  * Route cache.
189  */
190 
191 /* The locking scheme is rather straightforward:
192  *
193  * 1) Read-Copy Update protects the buckets of the central route hash.
194  * 2) Only writers remove entries, and they hold the lock
195  *    as they look at rtable reference counts.
196  * 3) Only readers acquire references to rtable entries,
197  *    they do so with atomic increments and with the
198  *    lock held.
199  */
200 
201 struct rt_hash_bucket {
202 	struct rtable	*chain;
203 };
204 
205 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
206 	defined(CONFIG_PROVE_LOCKING)
207 /*
208  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
209  * The size of this table is a power of two and depends on the number of CPUs.
210  * (with lockdep we have a quite big spinlock_t, so keep the size down there)
211  */
212 #ifdef CONFIG_LOCKDEP
213 # define RT_HASH_LOCK_SZ	256
214 #else
215 # if NR_CPUS >= 32
216 #  define RT_HASH_LOCK_SZ	4096
217 # elif NR_CPUS >= 16
218 #  define RT_HASH_LOCK_SZ	2048
219 # elif NR_CPUS >= 8
220 #  define RT_HASH_LOCK_SZ	1024
221 # elif NR_CPUS >= 4
222 #  define RT_HASH_LOCK_SZ	512
223 # else
224 #  define RT_HASH_LOCK_SZ	256
225 # endif
226 #endif
227 
228 static spinlock_t	*rt_hash_locks;
229 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
230 
231 static __init void rt_hash_lock_init(void)
232 {
233 	int i;
234 
235 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
236 			GFP_KERNEL);
237 	if (!rt_hash_locks)
238 		panic("IP: failed to allocate rt_hash_locks\n");
239 
240 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
241 		spin_lock_init(&rt_hash_locks[i]);
242 }
243 #else
244 # define rt_hash_lock_addr(slot) NULL
245 
246 static inline void rt_hash_lock_init(void)
247 {
248 }
249 #endif
250 
251 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
252 static unsigned			rt_hash_mask __read_mostly;
253 static unsigned int		rt_hash_log  __read_mostly;
254 
255 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
256 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
257 
258 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
259 				   int genid)
260 {
261 	return jhash_3words((__force u32)daddr, (__force u32)saddr,
262 			    idx, genid)
263 		& rt_hash_mask;
264 }
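/*
 * Illustrative lookup pattern (a sketch; the real paths are in
 * ip_route_input() and ip_route_output_key() further below): readers walk
 * a bucket locklessly under rcu_read_lock_bh(), e.g.
 *
 *	unsigned hash = rt_hash(daddr, saddr, iif, rt_genid(net));
 *	struct rtable *rth;
 *
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference_bh(rth->dst.rt_next))
 *		if (!rt_is_expired(rth) && keys_match(rth))
 *			break;	-- cache hit, take a reference with dst_use()
 *
 * where keys_match() stands in for the flow comparison done by the real
 * callers.  Writers (insert/delete) additionally take
 * rt_hash_lock_addr(hash).
 */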
265 
266 static inline int rt_genid(struct net *net)
267 {
268 	return atomic_read(&net->ipv4.rt_genid);
269 }
270 
271 #ifdef CONFIG_PROC_FS
272 struct rt_cache_iter_state {
273 	struct seq_net_private p;
274 	int bucket;
275 	int genid;
276 };
277 
278 static struct rtable *rt_cache_get_first(struct seq_file *seq)
279 {
280 	struct rt_cache_iter_state *st = seq->private;
281 	struct rtable *r = NULL;
282 
283 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
284 		if (!rt_hash_table[st->bucket].chain)
285 			continue;
286 		rcu_read_lock_bh();
287 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
288 		while (r) {
289 			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
290 			    r->rt_genid == st->genid)
291 				return r;
292 			r = rcu_dereference_bh(r->dst.rt_next);
293 		}
294 		rcu_read_unlock_bh();
295 	}
296 	return r;
297 }
298 
299 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
300 					  struct rtable *r)
301 {
302 	struct rt_cache_iter_state *st = seq->private;
303 
304 	r = r->dst.rt_next;
305 	while (!r) {
306 		rcu_read_unlock_bh();
307 		do {
308 			if (--st->bucket < 0)
309 				return NULL;
310 		} while (!rt_hash_table[st->bucket].chain);
311 		rcu_read_lock_bh();
312 		r = rt_hash_table[st->bucket].chain;
313 	}
314 	return rcu_dereference_bh(r);
315 }
316 
317 static struct rtable *rt_cache_get_next(struct seq_file *seq,
318 					struct rtable *r)
319 {
320 	struct rt_cache_iter_state *st = seq->private;
321 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
322 		if (dev_net(r->dst.dev) != seq_file_net(seq))
323 			continue;
324 		if (r->rt_genid == st->genid)
325 			break;
326 	}
327 	return r;
328 }
329 
330 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
331 {
332 	struct rtable *r = rt_cache_get_first(seq);
333 
334 	if (r)
335 		while (pos && (r = rt_cache_get_next(seq, r)))
336 			--pos;
337 	return pos ? NULL : r;
338 }
339 
340 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
341 {
342 	struct rt_cache_iter_state *st = seq->private;
343 	if (*pos)
344 		return rt_cache_get_idx(seq, *pos - 1);
345 	st->genid = rt_genid(seq_file_net(seq));
346 	return SEQ_START_TOKEN;
347 }
348 
349 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
350 {
351 	struct rtable *r;
352 
353 	if (v == SEQ_START_TOKEN)
354 		r = rt_cache_get_first(seq);
355 	else
356 		r = rt_cache_get_next(seq, v);
357 	++*pos;
358 	return r;
359 }
360 
361 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
362 {
363 	if (v && v != SEQ_START_TOKEN)
364 		rcu_read_unlock_bh();
365 }
366 
367 static int rt_cache_seq_show(struct seq_file *seq, void *v)
368 {
369 	if (v == SEQ_START_TOKEN)
370 		seq_printf(seq, "%-127s\n",
371 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
372 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
373 			   "HHUptod\tSpecDst");
374 	else {
375 		struct rtable *r = v;
376 		int len;
377 
378 		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
379 			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
380 			r->dst.dev ? r->dst.dev->name : "*",
381 			(__force u32)r->rt_dst,
382 			(__force u32)r->rt_gateway,
383 			r->rt_flags, atomic_read(&r->dst.__refcnt),
384 			r->dst.__use, 0, (__force u32)r->rt_src,
385 			(dst_metric(&r->dst, RTAX_ADVMSS) ?
386 			     (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
387 			dst_metric(&r->dst, RTAX_WINDOW),
388 			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
389 			      dst_metric(&r->dst, RTAX_RTTVAR)),
390 			r->fl.fl4_tos,
391 			r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
392 			r->dst.hh ? (r->dst.hh->hh_output ==
393 				       dev_queue_xmit) : 0,
394 			r->rt_spec_dst, &len);
395 
396 		seq_printf(seq, "%*s\n", 127 - len, "");
397 	}
398 	return 0;
399 }
400 
401 static const struct seq_operations rt_cache_seq_ops = {
402 	.start  = rt_cache_seq_start,
403 	.next   = rt_cache_seq_next,
404 	.stop   = rt_cache_seq_stop,
405 	.show   = rt_cache_seq_show,
406 };
407 
408 static int rt_cache_seq_open(struct inode *inode, struct file *file)
409 {
410 	return seq_open_net(inode, file, &rt_cache_seq_ops,
411 			sizeof(struct rt_cache_iter_state));
412 }
413 
414 static const struct file_operations rt_cache_seq_fops = {
415 	.owner	 = THIS_MODULE,
416 	.open	 = rt_cache_seq_open,
417 	.read	 = seq_read,
418 	.llseek	 = seq_lseek,
419 	.release = seq_release_net,
420 };
421 
422 
423 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
424 {
425 	int cpu;
426 
427 	if (*pos == 0)
428 		return SEQ_START_TOKEN;
429 
430 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
431 		if (!cpu_possible(cpu))
432 			continue;
433 		*pos = cpu+1;
434 		return &per_cpu(rt_cache_stat, cpu);
435 	}
436 	return NULL;
437 }
438 
439 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
440 {
441 	int cpu;
442 
443 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
444 		if (!cpu_possible(cpu))
445 			continue;
446 		*pos = cpu+1;
447 		return &per_cpu(rt_cache_stat, cpu);
448 	}
449 	return NULL;
450 
451 }
452 
453 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
454 {
455 
456 }
457 
458 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
459 {
460 	struct rt_cache_stat *st = v;
461 
462 	if (v == SEQ_START_TOKEN) {
463 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
464 		return 0;
465 	}
466 
467 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
468 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
469 		   atomic_read(&ipv4_dst_ops.entries),
470 		   st->in_hit,
471 		   st->in_slow_tot,
472 		   st->in_slow_mc,
473 		   st->in_no_route,
474 		   st->in_brd,
475 		   st->in_martian_dst,
476 		   st->in_martian_src,
477 
478 		   st->out_hit,
479 		   st->out_slow_tot,
480 		   st->out_slow_mc,
481 
482 		   st->gc_total,
483 		   st->gc_ignored,
484 		   st->gc_goal_miss,
485 		   st->gc_dst_overflow,
486 		   st->in_hlist_search,
487 		   st->out_hlist_search
488 		);
489 	return 0;
490 }
491 
492 static const struct seq_operations rt_cpu_seq_ops = {
493 	.start  = rt_cpu_seq_start,
494 	.next   = rt_cpu_seq_next,
495 	.stop   = rt_cpu_seq_stop,
496 	.show   = rt_cpu_seq_show,
497 };
498 
499 
500 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
501 {
502 	return seq_open(file, &rt_cpu_seq_ops);
503 }
504 
505 static const struct file_operations rt_cpu_seq_fops = {
506 	.owner	 = THIS_MODULE,
507 	.open	 = rt_cpu_seq_open,
508 	.read	 = seq_read,
509 	.llseek	 = seq_lseek,
510 	.release = seq_release,
511 };
512 
513 #ifdef CONFIG_NET_CLS_ROUTE
514 static int rt_acct_proc_show(struct seq_file *m, void *v)
515 {
516 	struct ip_rt_acct *dst, *src;
517 	unsigned int i, j;
518 
519 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
520 	if (!dst)
521 		return -ENOMEM;
522 
523 	for_each_possible_cpu(i) {
524 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
525 		for (j = 0; j < 256; j++) {
526 			dst[j].o_bytes   += src[j].o_bytes;
527 			dst[j].o_packets += src[j].o_packets;
528 			dst[j].i_bytes   += src[j].i_bytes;
529 			dst[j].i_packets += src[j].i_packets;
530 		}
531 	}
532 
533 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
534 	kfree(dst);
535 	return 0;
536 }
537 
538 static int rt_acct_proc_open(struct inode *inode, struct file *file)
539 {
540 	return single_open(file, rt_acct_proc_show, NULL);
541 }
542 
543 static const struct file_operations rt_acct_proc_fops = {
544 	.owner		= THIS_MODULE,
545 	.open		= rt_acct_proc_open,
546 	.read		= seq_read,
547 	.llseek		= seq_lseek,
548 	.release	= single_release,
549 };
550 #endif
551 
552 static int __net_init ip_rt_do_proc_init(struct net *net)
553 {
554 	struct proc_dir_entry *pde;
555 
556 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
557 			&rt_cache_seq_fops);
558 	if (!pde)
559 		goto err1;
560 
561 	pde = proc_create("rt_cache", S_IRUGO,
562 			  net->proc_net_stat, &rt_cpu_seq_fops);
563 	if (!pde)
564 		goto err2;
565 
566 #ifdef CONFIG_NET_CLS_ROUTE
567 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
568 	if (!pde)
569 		goto err3;
570 #endif
571 	return 0;
572 
573 #ifdef CONFIG_NET_CLS_ROUTE
574 err3:
575 	remove_proc_entry("rt_cache", net->proc_net_stat);
576 #endif
577 err2:
578 	remove_proc_entry("rt_cache", net->proc_net);
579 err1:
580 	return -ENOMEM;
581 }
582 
583 static void __net_exit ip_rt_do_proc_exit(struct net *net)
584 {
585 	remove_proc_entry("rt_cache", net->proc_net_stat);
586 	remove_proc_entry("rt_cache", net->proc_net);
587 #ifdef CONFIG_NET_CLS_ROUTE
588 	remove_proc_entry("rt_acct", net->proc_net);
589 #endif
590 }
591 
592 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
593 	.init = ip_rt_do_proc_init,
594 	.exit = ip_rt_do_proc_exit,
595 };
596 
597 static int __init ip_rt_proc_init(void)
598 {
599 	return register_pernet_subsys(&ip_rt_proc_ops);
600 }
601 
602 #else
603 static inline int ip_rt_proc_init(void)
604 {
605 	return 0;
606 }
607 #endif /* CONFIG_PROC_FS */
608 
609 static inline void rt_free(struct rtable *rt)
610 {
611 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
612 }
613 
614 static inline void rt_drop(struct rtable *rt)
615 {
616 	ip_rt_put(rt);
617 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
618 }
619 
620 static inline int rt_fast_clean(struct rtable *rth)
621 {
622	/* Kill broadcast/multicast entries very aggressively, if they
623	   collide in the hash table with more useful entries */
624 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
625 		rth->fl.iif && rth->dst.rt_next;
626 }
627 
628 static inline int rt_valuable(struct rtable *rth)
629 {
630 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
631 		rth->dst.expires;
632 }
633 
634 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
635 {
636 	unsigned long age;
637 	int ret = 0;
638 
639 	if (atomic_read(&rth->dst.__refcnt))
640 		goto out;
641 
642 	ret = 1;
643 	if (rth->dst.expires &&
644 	    time_after_eq(jiffies, rth->dst.expires))
645 		goto out;
646 
647 	age = jiffies - rth->dst.lastuse;
648 	ret = 0;
649 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
650 	    (age <= tmo2 && rt_valuable(rth)))
651 		goto out;
652 	ret = 1;
653 out:	return ret;
654 }
655 
656 /* Bits of score are:
657  * 31: very valuable
658  * 30: not quite useless
659  * 29..0: usage counter
660  */
661 static inline u32 rt_score(struct rtable *rt)
662 {
663 	u32 score = jiffies - rt->dst.lastuse;
664 
665 	score = ~score & ~(3<<30);
666 
667 	if (rt_valuable(rt))
668 		score |= (1<<31);
669 
670 	if (!rt->fl.iif ||
671 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
672 		score |= (1<<30);
673 
674 	return score;
675 }
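/*
 * Worked example (illustrative numbers): an output route (fl.iif == 0)
 * carrying RTCF_REDIRECTED that was last used 100 jiffies ago scores
 * (~100 & ~(3<<30)) | (1<<31) | (1<<30), i.e. about 2^30 - 101 in the
 * usage bits plus both high bits set.  rt_intern_hash() evicts the entry
 * with the *lowest* score, so idle broadcast/multicast input routes go
 * first and valuable, recently used unicast output routes are kept.
 */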
676 
677 static inline bool rt_caching(const struct net *net)
678 {
679 	return net->ipv4.current_rt_cache_rebuild_count <=
680 		net->ipv4.sysctl_rt_cache_rebuild_count;
681 }
682 
683 static inline bool compare_hash_inputs(const struct flowi *fl1,
684 					const struct flowi *fl2)
685 {
686 	return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
687 		((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
688 		(fl1->iif ^ fl2->iif)) == 0);
689 }
690 
691 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
692 {
693 	return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
694 		((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
695 		(fl1->mark ^ fl2->mark) |
696 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
697 		(fl1->oif ^ fl2->oif) |
698 		(fl1->iif ^ fl2->iif)) == 0;
699 }
700 
701 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
702 {
703 	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
704 }
705 
706 static inline int rt_is_expired(struct rtable *rth)
707 {
708 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
709 }
710 
711 /*
712  * Perform a full scan of the hash table and free all entries.
713  * Can be called by a softirq or a process.
714  * In the latter case, we want to be rescheduled if necessary.
715  */
716 static void rt_do_flush(int process_context)
717 {
718 	unsigned int i;
719 	struct rtable *rth, *next;
720 	struct rtable * tail;
721 
722 	for (i = 0; i <= rt_hash_mask; i++) {
723 		if (process_context && need_resched())
724 			cond_resched();
725 		rth = rt_hash_table[i].chain;
726 		if (!rth)
727 			continue;
728 
729 		spin_lock_bh(rt_hash_lock_addr(i));
730 #ifdef CONFIG_NET_NS
731 		{
732 		struct rtable ** prev, * p;
733 
734 		rth = rt_hash_table[i].chain;
735 
736 		/* defer releasing the head of the list until after spin_unlock */
737 		for (tail = rth; tail; tail = tail->dst.rt_next)
738 			if (!rt_is_expired(tail))
739 				break;
740 		if (rth != tail)
741 			rt_hash_table[i].chain = tail;
742 
743 		/* call rt_free on entries after the tail requiring flush */
744 		prev = &rt_hash_table[i].chain;
745 		for (p = *prev; p; p = next) {
746 			next = p->dst.rt_next;
747 			if (!rt_is_expired(p)) {
748 				prev = &p->dst.rt_next;
749 			} else {
750 				*prev = next;
751 				rt_free(p);
752 			}
753 		}
754 		}
755 #else
756 		rth = rt_hash_table[i].chain;
757 		rt_hash_table[i].chain = NULL;
758 		tail = NULL;
759 #endif
760 		spin_unlock_bh(rt_hash_lock_addr(i));
761 
762 		for (; rth != tail; rth = next) {
763 			next = rth->dst.rt_next;
764 			rt_free(rth);
765 		}
766 	}
767 }
768 
769 /*
770  * While freeing expired entries, we compute average chain length
771  * and standard deviation, using fixed-point arithmetic.
772  * This gives an estimation of rt_chain_length_max:
773  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
774  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
775  */
776 
777 #define FRACT_BITS 3
778 #define ONE (1UL << FRACT_BITS)
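/*
 * Worked example (illustrative numbers): has_noalias() below returns
 * ONE = 8 for every entry with no earlier alias on its chain, so a bucket
 * holding five distinct flows counts as length 40.  If the sampled
 * buckets average 40 with a standard deviation of 16 (5 and 2 entries in
 * fixed point), rt_check_expire() sets
 *	rt_chain_length_max = max(ip_rt_gc_elasticity, (40 + 4*16) >> 3)
 *	                    = max(8, 13) = 13.
 */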
779 
780 /*
781  * Given a hash chain and an item in this hash chain,
782  * find whether a previous entry has the same hash_inputs
783  * (but differs on tos, mark or oif).
784  * Returns 0 if an alias is found.
785  * Returns ONE if rth has no alias before itself.
786  */
787 static int has_noalias(const struct rtable *head, const struct rtable *rth)
788 {
789 	const struct rtable *aux = head;
790 
791 	while (aux != rth) {
792 		if (compare_hash_inputs(&aux->fl, &rth->fl))
793 			return 0;
794 		aux = aux->dst.rt_next;
795 	}
796 	return ONE;
797 }
798 
799 static void rt_check_expire(void)
800 {
801 	static unsigned int rover;
802 	unsigned int i = rover, goal;
803 	struct rtable *rth, **rthp;
804 	unsigned long samples = 0;
805 	unsigned long sum = 0, sum2 = 0;
806 	unsigned long delta;
807 	u64 mult;
808 
809 	delta = jiffies - expires_ljiffies;
810 	expires_ljiffies = jiffies;
811 	mult = ((u64)delta) << rt_hash_log;
812 	if (ip_rt_gc_timeout > 1)
813 		do_div(mult, ip_rt_gc_timeout);
814 	goal = (unsigned int)mult;
815 	if (goal > rt_hash_mask)
816 		goal = rt_hash_mask + 1;
817 	for (; goal > 0; goal--) {
818 		unsigned long tmo = ip_rt_gc_timeout;
819 		unsigned long length;
820 
821 		i = (i + 1) & rt_hash_mask;
822 		rthp = &rt_hash_table[i].chain;
823 
824 		if (need_resched())
825 			cond_resched();
826 
827 		samples++;
828 
829 		if (*rthp == NULL)
830 			continue;
831 		length = 0;
832 		spin_lock_bh(rt_hash_lock_addr(i));
833 		while ((rth = *rthp) != NULL) {
834 			prefetch(rth->dst.rt_next);
835 			if (rt_is_expired(rth)) {
836 				*rthp = rth->dst.rt_next;
837 				rt_free(rth);
838 				continue;
839 			}
840 			if (rth->dst.expires) {
841 				/* Entry is expired even if it is in use */
842 				if (time_before_eq(jiffies, rth->dst.expires)) {
843 nofree:
844 					tmo >>= 1;
845 					rthp = &rth->dst.rt_next;
846 					/*
847 					 * We only count entries on
848					 * a chain with equal hash inputs once,
849					 * so that entries for different QoS
850					 * levels and other non-hash-input
851					 * attributes don't unfairly skew
852					 * the length computation.
853 					 */
854 					length += has_noalias(rt_hash_table[i].chain, rth);
855 					continue;
856 				}
857 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
858 				goto nofree;
859 
860 			/* Cleanup aged off entries. */
861 			*rthp = rth->dst.rt_next;
862 			rt_free(rth);
863 		}
864 		spin_unlock_bh(rt_hash_lock_addr(i));
865 		sum += length;
866 		sum2 += length*length;
867 	}
868 	if (samples) {
869 		unsigned long avg = sum / samples;
870 		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
871 		rt_chain_length_max = max_t(unsigned long,
872 					ip_rt_gc_elasticity,
873 					(avg + 4*sd) >> FRACT_BITS);
874 	}
875 	rover = i;
876 }
877 
878 /*
879  * rt_worker_func() is run in process context.
880  * we call rt_check_expire() to scan part of the hash table
881  */
882 static void rt_worker_func(struct work_struct *work)
883 {
884 	rt_check_expire();
885 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
886 }
887 
888 /*
889  * Perturbation of rt_genid by a small quantity [1..256].
890  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
891  * many times (2^24) without producing a recently used rt_genid.
892  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
893  */
894 static void rt_cache_invalidate(struct net *net)
895 {
896 	unsigned char shuffle;
897 
898 	get_random_bytes(&shuffle, sizeof(shuffle));
899 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
900 }
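/*
 * Worked arithmetic for the 2^24 figure above (illustration): in the worst
 * case every call adds 256 to the 32-bit generation counter, so it takes
 * at least 2^32 / 256 = 2^24 invalidations before the counter can wrap
 * back to a value that live cache entries may still carry.
 */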
901 
902 /*
903  * delay < 0  : invalidate cache (fast : entries will be deleted later)
904  * delay >= 0 : invalidate & flush cache (can be long)
905  */
906 void rt_cache_flush(struct net *net, int delay)
907 {
908 	rt_cache_invalidate(net);
909 	if (delay >= 0)
910 		rt_do_flush(!in_softirq());
911 }
912 
913 /* Flush previous cache invalidated entries from the cache */
914 void rt_cache_flush_batch(void)
915 {
916 	rt_do_flush(!in_softirq());
917 }
918 
919 static void rt_emergency_hash_rebuild(struct net *net)
920 {
921 	if (net_ratelimit())
922 		printk(KERN_WARNING "Route hash chain too long!\n");
923 	rt_cache_invalidate(net);
924 }
925 
926 /*
927    Short description of GC goals.
928 
929    We want to build an algorithm which keeps the routing cache
930    at some equilibrium point, where the number of aged-off entries
931    is kept approximately equal to the number of newly generated ones.
932 
933    The current expiration strength is the variable "expire".
934    We try to adjust it dynamically, so that when the network
935    is idle "expire" is large enough to keep enough warm entries,
936    and when load increases it shrinks to limit the cache size.
937  */
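/*
 * Illustrative numbers for the goal computation below: with the default
 * ip_rt_gc_elasticity of 8 and a hash table of 2^16 buckets
 * (rt_hash_log == 16), the first target is
 *	goal = entries - (8 << 16) = entries - 524288,
 * so the aggressive "dangerous area" path is only taken once the cache
 * averages more than eight entries per bucket.
 */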
938 
939 static int rt_garbage_collect(struct dst_ops *ops)
940 {
941 	static unsigned long expire = RT_GC_TIMEOUT;
942 	static unsigned long last_gc;
943 	static int rover;
944 	static int equilibrium;
945 	struct rtable *rth, **rthp;
946 	unsigned long now = jiffies;
947 	int goal;
948 
949 	/*
950 	 * Garbage collection is pretty expensive,
951 	 * do not make it too frequently.
952 	 */
953 
954 	RT_CACHE_STAT_INC(gc_total);
955 
956 	if (now - last_gc < ip_rt_gc_min_interval &&
957 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
958 		RT_CACHE_STAT_INC(gc_ignored);
959 		goto out;
960 	}
961 
962 	/* Calculate the number of entries which we want to expire now. */
963 	goal = atomic_read(&ipv4_dst_ops.entries) -
964 		(ip_rt_gc_elasticity << rt_hash_log);
965 	if (goal <= 0) {
966 		if (equilibrium < ipv4_dst_ops.gc_thresh)
967 			equilibrium = ipv4_dst_ops.gc_thresh;
968 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
969 		if (goal > 0) {
970 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
971 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
972 		}
973 	} else {
974 		/* We are in a dangerous area. Try to reduce the cache really
975 		 * aggressively.
976 		 */
977 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
978 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
979 	}
980 
981 	if (now - last_gc >= ip_rt_gc_min_interval)
982 		last_gc = now;
983 
984 	if (goal <= 0) {
985 		equilibrium += goal;
986 		goto work_done;
987 	}
988 
989 	do {
990 		int i, k;
991 
992 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
993 			unsigned long tmo = expire;
994 
995 			k = (k + 1) & rt_hash_mask;
996 			rthp = &rt_hash_table[k].chain;
997 			spin_lock_bh(rt_hash_lock_addr(k));
998 			while ((rth = *rthp) != NULL) {
999 				if (!rt_is_expired(rth) &&
1000 					!rt_may_expire(rth, tmo, expire)) {
1001 					tmo >>= 1;
1002 					rthp = &rth->dst.rt_next;
1003 					continue;
1004 				}
1005 				*rthp = rth->dst.rt_next;
1006 				rt_free(rth);
1007 				goal--;
1008 			}
1009 			spin_unlock_bh(rt_hash_lock_addr(k));
1010 			if (goal <= 0)
1011 				break;
1012 		}
1013 		rover = k;
1014 
1015 		if (goal <= 0)
1016 			goto work_done;
1017 
1018 		/* Goal is not achieved. We stop the process if:
1019 
1020 		   - expire has been reduced to zero (otherwise, expire is halved).
1021 		   - the table is not full.
1022 		   - we are called from interrupt context.
1023 		   - the jiffies check is just a fallback/debug loop breaker;
1024 		     we will not spin here for a long time in any case.
1025 		 */
1026 
1027 		RT_CACHE_STAT_INC(gc_goal_miss);
1028 
1029 		if (expire == 0)
1030 			break;
1031 
1032 		expire >>= 1;
1033 #if RT_CACHE_DEBUG >= 2
1034 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1035 				atomic_read(&ipv4_dst_ops.entries), goal, i);
1036 #endif
1037 
1038 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1039 			goto out;
1040 	} while (!in_softirq() && time_before_eq(jiffies, now));
1041 
1042 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1043 		goto out;
1044 	if (net_ratelimit())
1045 		printk(KERN_WARNING "dst cache overflow\n");
1046 	RT_CACHE_STAT_INC(gc_dst_overflow);
1047 	return 1;
1048 
1049 work_done:
1050 	expire += ip_rt_gc_min_interval;
1051 	if (expire > ip_rt_gc_timeout ||
1052 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1053 		expire = ip_rt_gc_timeout;
1054 #if RT_CACHE_DEBUG >= 2
1055 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1056 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
1057 #endif
1058 out:	return 0;
1059 }
1060 
1061 /*
1062  * Returns the number of entries in a hash chain that have different hash_inputs
1063  */
1064 static int slow_chain_length(const struct rtable *head)
1065 {
1066 	int length = 0;
1067 	const struct rtable *rth = head;
1068 
1069 	while (rth) {
1070 		length += has_noalias(head, rth);
1071 		rth = rth->dst.rt_next;
1072 	}
1073 	return length >> FRACT_BITS;
1074 }
1075 
1076 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1077 			  struct rtable **rp, struct sk_buff *skb, int ifindex)
1078 {
1079 	struct rtable	*rth, **rthp;
1080 	unsigned long	now;
1081 	struct rtable *cand, **candp;
1082 	u32 		min_score;
1083 	int		chain_length;
1084 	int attempts = !in_softirq();
1085 
1086 restart:
1087 	chain_length = 0;
1088 	min_score = ~(u32)0;
1089 	cand = NULL;
1090 	candp = NULL;
1091 	now = jiffies;
1092 
1093 	if (!rt_caching(dev_net(rt->dst.dev))) {
1094 		/*
1095 		 * If we're not caching, just tell the caller we
1096 		 * were successful and don't touch the route.  The
1097 		 * caller holds the sole reference to the cache entry, and
1098 		 * it will be released when the caller is done with it.
1099 		 * If we drop it here, the callers have no way to resolve routes
1100 		 * when we're not caching.  Instead, just point *rp at rt, so
1101 		 * the caller gets a single use out of the route.
1102 		 * Note that we do rt_free on this new route entry, so that
1103 		 * once its refcount hits zero, we are still able to reap it
1104 		 * (Thanks Alexey)
1105 		 * Note also that rt_free uses call_rcu.  We don't actually
1106 		 * need rcu protection here, this is just our path to get
1107 		 * on the route gc list.
1108 		 */
1109 
1110 		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1111 			int err = arp_bind_neighbour(&rt->dst);
1112 			if (err) {
1113 				if (net_ratelimit())
1114 					printk(KERN_WARNING
1115 					    "Neighbour table failure & not caching routes.\n");
1116 				rt_drop(rt);
1117 				return err;
1118 			}
1119 		}
1120 
1121 		rt_free(rt);
1122 		goto skip_hashing;
1123 	}
1124 
1125 	rthp = &rt_hash_table[hash].chain;
1126 
1127 	spin_lock_bh(rt_hash_lock_addr(hash));
1128 	while ((rth = *rthp) != NULL) {
1129 		if (rt_is_expired(rth)) {
1130 			*rthp = rth->dst.rt_next;
1131 			rt_free(rth);
1132 			continue;
1133 		}
1134 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1135 			/* Put it first */
1136 			*rthp = rth->dst.rt_next;
1137 			/*
1138 			 * Since lookup is lockfree, the deletion
1139 			 * must be visible to another weakly ordered CPU before
1140 			 * the insertion at the start of the hash chain.
1141 			 */
1142 			rcu_assign_pointer(rth->dst.rt_next,
1143 					   rt_hash_table[hash].chain);
1144 			/*
1145 			 * Since lookup is lockfree, the update writes
1146 			 * must be ordered for consistency on SMP.
1147 			 */
1148 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1149 
1150 			dst_use(&rth->dst, now);
1151 			spin_unlock_bh(rt_hash_lock_addr(hash));
1152 
1153 			rt_drop(rt);
1154 			if (rp)
1155 				*rp = rth;
1156 			else
1157 				skb_dst_set(skb, &rth->dst);
1158 			return 0;
1159 		}
1160 
1161 		if (!atomic_read(&rth->dst.__refcnt)) {
1162 			u32 score = rt_score(rth);
1163 
1164 			if (score <= min_score) {
1165 				cand = rth;
1166 				candp = rthp;
1167 				min_score = score;
1168 			}
1169 		}
1170 
1171 		chain_length++;
1172 
1173 		rthp = &rth->dst.rt_next;
1174 	}
1175 
1176 	if (cand) {
1177 		/* ip_rt_gc_elasticity used to be the average chain length;
1178 		 * when it is exceeded, gc becomes really aggressive.
1179 		 *
1180 		 * The second limit is less certain. At the moment it allows
1181 		 * only 2 entries per bucket. We will see.
1182 		 */
1183 		if (chain_length > ip_rt_gc_elasticity) {
1184 			*candp = cand->dst.rt_next;
1185 			rt_free(cand);
1186 		}
1187 	} else {
1188 		if (chain_length > rt_chain_length_max &&
1189 		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1190 			struct net *net = dev_net(rt->dst.dev);
1191 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1192 			if (!rt_caching(net)) {
1193 				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1194 					rt->dst.dev->name, num);
1195 			}
1196 			rt_emergency_hash_rebuild(net);
1197 			spin_unlock_bh(rt_hash_lock_addr(hash));
1198 
1199 			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1200 					ifindex, rt_genid(net));
1201 			goto restart;
1202 		}
1203 	}
1204 
1205 	/* Try to bind the route to arp only if it is an output
1206 	   route or a unicast forwarding path.
1207 	 */
1208 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1209 		int err = arp_bind_neighbour(&rt->dst);
1210 		if (err) {
1211 			spin_unlock_bh(rt_hash_lock_addr(hash));
1212 
1213 			if (err != -ENOBUFS) {
1214 				rt_drop(rt);
1215 				return err;
1216 			}
1217 
1218 			/* Neighbour tables are full and nothing
1219 			   can be released. Try to shrink the route cache;
1220 			   it most likely holds some neighbour records.
1221 			 */
1222 			if (attempts-- > 0) {
1223 				int saved_elasticity = ip_rt_gc_elasticity;
1224 				int saved_int = ip_rt_gc_min_interval;
1225 				ip_rt_gc_elasticity	= 1;
1226 				ip_rt_gc_min_interval	= 0;
1227 				rt_garbage_collect(&ipv4_dst_ops);
1228 				ip_rt_gc_min_interval	= saved_int;
1229 				ip_rt_gc_elasticity	= saved_elasticity;
1230 				goto restart;
1231 			}
1232 
1233 			if (net_ratelimit())
1234 				printk(KERN_WARNING "Neighbour table overflow.\n");
1235 			rt_drop(rt);
1236 			return -ENOBUFS;
1237 		}
1238 	}
1239 
1240 	rt->dst.rt_next = rt_hash_table[hash].chain;
1241 
1242 #if RT_CACHE_DEBUG >= 2
1243 	if (rt->dst.rt_next) {
1244 		struct rtable *trt;
1245 		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1246 		       hash, &rt->rt_dst);
1247 		for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1248 			printk(" . %pI4", &trt->rt_dst);
1249 		printk("\n");
1250 	}
1251 #endif
1252 	/*
1253 	 * Since lookup is lockfree, we must make sure
1254 	 * previous writes to rt are committed to memory
1255 	 * before making rt visible to other CPUs.
1256 	 */
1257 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1258 
1259 	spin_unlock_bh(rt_hash_lock_addr(hash));
1260 
1261 skip_hashing:
1262 	if (rp)
1263 		*rp = rt;
1264 	else
1265 		skb_dst_set(skb, &rt->dst);
1266 	return 0;
1267 }
1268 
1269 void rt_bind_peer(struct rtable *rt, int create)
1270 {
1271 	static DEFINE_SPINLOCK(rt_peer_lock);
1272 	struct inet_peer *peer;
1273 
1274 	peer = inet_getpeer(rt->rt_dst, create);
1275 
1276 	spin_lock_bh(&rt_peer_lock);
1277 	if (rt->peer == NULL) {
1278 		rt->peer = peer;
1279 		peer = NULL;
1280 	}
1281 	spin_unlock_bh(&rt_peer_lock);
1282 	if (peer)
1283 		inet_putpeer(peer);
1284 }
1285 
1286 /*
1287  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1288  * we can still generate some output.
1289  * Random ID selection looks a bit dangerous because we have no chance of
1290  * selecting an ID that is unique within a reasonable period of time.
1291  * But a broken packet identifier may be better than no packet at all.
1292  */
1293 static void ip_select_fb_ident(struct iphdr *iph)
1294 {
1295 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1296 	static u32 ip_fallback_id;
1297 	u32 salt;
1298 
1299 	spin_lock_bh(&ip_fb_id_lock);
1300 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1301 	iph->id = htons(salt & 0xFFFF);
1302 	ip_fallback_id = salt;
1303 	spin_unlock_bh(&ip_fb_id_lock);
1304 }
1305 
1306 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1307 {
1308 	struct rtable *rt = (struct rtable *) dst;
1309 
1310 	if (rt) {
1311 		if (rt->peer == NULL)
1312 			rt_bind_peer(rt, 1);
1313 
1314 		/* If peer is attached to destination, it is never detached,
1315 		   so we need not grab a lock to dereference it.
1316 		 */
1317 		if (rt->peer) {
1318 			iph->id = htons(inet_getid(rt->peer, more));
1319 			return;
1320 		}
1321 	} else
1322 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1323 		       __builtin_return_address(0));
1324 
1325 	ip_select_fb_ident(iph);
1326 }
1327 EXPORT_SYMBOL(__ip_select_ident);
1328 
1329 static void rt_del(unsigned hash, struct rtable *rt)
1330 {
1331 	struct rtable **rthp, *aux;
1332 
1333 	rthp = &rt_hash_table[hash].chain;
1334 	spin_lock_bh(rt_hash_lock_addr(hash));
1335 	ip_rt_put(rt);
1336 	while ((aux = *rthp) != NULL) {
1337 		if (aux == rt || rt_is_expired(aux)) {
1338 			*rthp = aux->dst.rt_next;
1339 			rt_free(aux);
1340 			continue;
1341 		}
1342 		rthp = &aux->dst.rt_next;
1343 	}
1344 	spin_unlock_bh(rt_hash_lock_addr(hash));
1345 }
1346 
1347 /* called in rcu_read_lock() section */
1348 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1349 		    __be32 saddr, struct net_device *dev)
1350 {
1351 	int i, k;
1352 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1353 	struct rtable *rth, **rthp;
1354 	__be32  skeys[2] = { saddr, 0 };
1355 	int  ikeys[2] = { dev->ifindex, 0 };
1356 	struct netevent_redirect netevent;
1357 	struct net *net;
1358 
1359 	if (!in_dev)
1360 		return;
1361 
1362 	net = dev_net(dev);
1363 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1364 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1365 	    ipv4_is_zeronet(new_gw))
1366 		goto reject_redirect;
1367 
1368 	if (!rt_caching(net))
1369 		goto reject_redirect;
1370 
1371 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1372 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1373 			goto reject_redirect;
1374 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1375 			goto reject_redirect;
1376 	} else {
1377 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1378 			goto reject_redirect;
1379 	}
1380 
1381 	for (i = 0; i < 2; i++) {
1382 		for (k = 0; k < 2; k++) {
1383 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1384 						rt_genid(net));
1385 
1386 			rthp=&rt_hash_table[hash].chain;
1387 
1388 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1389 				struct rtable *rt;
1390 
1391 				if (rth->fl.fl4_dst != daddr ||
1392 				    rth->fl.fl4_src != skeys[i] ||
1393 				    rth->fl.oif != ikeys[k] ||
1394 				    rth->fl.iif != 0 ||
1395 				    rt_is_expired(rth) ||
1396 				    !net_eq(dev_net(rth->dst.dev), net)) {
1397 					rthp = &rth->dst.rt_next;
1398 					continue;
1399 				}
1400 
1401 				if (rth->rt_dst != daddr ||
1402 				    rth->rt_src != saddr ||
1403 				    rth->dst.error ||
1404 				    rth->rt_gateway != old_gw ||
1405 				    rth->dst.dev != dev)
1406 					break;
1407 
1408 				dst_hold(&rth->dst);
1409 
1410 				rt = dst_alloc(&ipv4_dst_ops);
1411 				if (rt == NULL) {
1412 					ip_rt_put(rth);
1413 					return;
1414 				}
1415 
1416 				/* Copy all the information. */
1417 				*rt = *rth;
1418 				rt->dst.__use		= 1;
1419 				atomic_set(&rt->dst.__refcnt, 1);
1420 				rt->dst.child		= NULL;
1421 				if (rt->dst.dev)
1422 					dev_hold(rt->dst.dev);
1423 				if (rt->idev)
1424 					in_dev_hold(rt->idev);
1425 				rt->dst.obsolete	= -1;
1426 				rt->dst.lastuse	= jiffies;
1427 				rt->dst.path		= &rt->dst;
1428 				rt->dst.neighbour	= NULL;
1429 				rt->dst.hh		= NULL;
1430 #ifdef CONFIG_XFRM
1431 				rt->dst.xfrm		= NULL;
1432 #endif
1433 				rt->rt_genid		= rt_genid(net);
1434 				rt->rt_flags		|= RTCF_REDIRECTED;
1435 
1436 				/* Gateway is different ... */
1437 				rt->rt_gateway		= new_gw;
1438 
1439 				/* Redirect received -> path was valid */
1440 				dst_confirm(&rth->dst);
1441 
1442 				if (rt->peer)
1443 					atomic_inc(&rt->peer->refcnt);
1444 
1445 				if (arp_bind_neighbour(&rt->dst) ||
1446 				    !(rt->dst.neighbour->nud_state &
1447 					    NUD_VALID)) {
1448 					if (rt->dst.neighbour)
1449 						neigh_event_send(rt->dst.neighbour, NULL);
1450 					ip_rt_put(rth);
1451 					rt_drop(rt);
1452 					goto do_next;
1453 				}
1454 
1455 				netevent.old = &rth->dst;
1456 				netevent.new = &rt->dst;
1457 				call_netevent_notifiers(NETEVENT_REDIRECT,
1458 							&netevent);
1459 
1460 				rt_del(hash, rth);
1461 				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1462 					ip_rt_put(rt);
1463 				goto do_next;
1464 			}
1465 		do_next:
1466 			;
1467 		}
1468 	}
1469 	return;
1470 
1471 reject_redirect:
1472 #ifdef CONFIG_IP_ROUTE_VERBOSE
1473 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1474 		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1475 			"  Advised path = %pI4 -> %pI4\n",
1476 		       &old_gw, dev->name, &new_gw,
1477 		       &saddr, &daddr);
1478 #endif
1479 	;
1480 }
1481 
1482 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1483 {
1484 	struct rtable *rt = (struct rtable *)dst;
1485 	struct dst_entry *ret = dst;
1486 
1487 	if (rt) {
1488 		if (dst->obsolete > 0) {
1489 			ip_rt_put(rt);
1490 			ret = NULL;
1491 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1492 			   (rt->dst.expires &&
1493 			    time_after_eq(jiffies, rt->dst.expires))) {
1494 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1495 						rt->fl.oif,
1496 						rt_genid(dev_net(dst->dev)));
1497 #if RT_CACHE_DEBUG >= 1
1498 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1499 				&rt->rt_dst, rt->fl.fl4_tos);
1500 #endif
1501 			rt_del(hash, rt);
1502 			ret = NULL;
1503 		}
1504 	}
1505 	return ret;
1506 }
1507 
1508 /*
1509  * Algorithm:
1510  *	1. The first ip_rt_redirect_number redirects are sent
1511  *	   with exponential backoff, then we stop sending them at all,
1512  *	   assuming that the host ignores our redirects.
1513  *	2. If we did not see packets requiring redirects
1514  *	   during ip_rt_redirect_silence, we assume that the host
1515  *	   forgot the redirected route and start sending redirects again.
1516  *
1517  * This algorithm is much cheaper and more intelligent than dumb load limiting
1518  * in icmp.c.
1519  *
1520  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1521  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1522  */
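/*
 * Illustrative schedule with the default sysctl values and HZ = 1000:
 * after the k-th redirect the next one is delayed by at least
 * ip_rt_redirect_load << k, i.e. 40ms, 80ms, 160ms, ...; once
 * ip_rt_redirect_number (9) redirects have been ignored we stay silent
 * until ip_rt_redirect_silence (roughly 20 seconds) elapses without a
 * further trigger.
 */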
1523 
1524 void ip_rt_send_redirect(struct sk_buff *skb)
1525 {
1526 	struct rtable *rt = skb_rtable(skb);
1527 	struct in_device *in_dev;
1528 	int log_martians;
1529 
1530 	rcu_read_lock();
1531 	in_dev = __in_dev_get_rcu(rt->dst.dev);
1532 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1533 		rcu_read_unlock();
1534 		return;
1535 	}
1536 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1537 	rcu_read_unlock();
1538 
1539 	/* No redirected packets during ip_rt_redirect_silence;
1540 	 * reset the algorithm.
1541 	 */
1542 	if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
1543 		rt->dst.rate_tokens = 0;
1544 
1545 	/* Too many ignored redirects; do not send anything.
1546 	 * Set dst.rate_last to the last seen redirected packet.
1547 	 */
1548 	if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
1549 		rt->dst.rate_last = jiffies;
1550 		return;
1551 	}
1552 
1553 	/* Check for load limit; set rate_last to the latest sent
1554 	 * redirect.
1555 	 */
1556 	if (rt->dst.rate_tokens == 0 ||
1557 	    time_after(jiffies,
1558 		       (rt->dst.rate_last +
1559 			(ip_rt_redirect_load << rt->dst.rate_tokens)))) {
1560 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1561 		rt->dst.rate_last = jiffies;
1562 		++rt->dst.rate_tokens;
1563 #ifdef CONFIG_IP_ROUTE_VERBOSE
1564 		if (log_martians &&
1565 		    rt->dst.rate_tokens == ip_rt_redirect_number &&
1566 		    net_ratelimit())
1567 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1568 				&rt->rt_src, rt->rt_iif,
1569 				&rt->rt_dst, &rt->rt_gateway);
1570 #endif
1571 	}
1572 }
1573 
1574 static int ip_error(struct sk_buff *skb)
1575 {
1576 	struct rtable *rt = skb_rtable(skb);
1577 	unsigned long now;
1578 	int code;
1579 
1580 	switch (rt->dst.error) {
1581 		case EINVAL:
1582 		default:
1583 			goto out;
1584 		case EHOSTUNREACH:
1585 			code = ICMP_HOST_UNREACH;
1586 			break;
1587 		case ENETUNREACH:
1588 			code = ICMP_NET_UNREACH;
1589 			IP_INC_STATS_BH(dev_net(rt->dst.dev),
1590 					IPSTATS_MIB_INNOROUTES);
1591 			break;
1592 		case EACCES:
1593 			code = ICMP_PKT_FILTERED;
1594 			break;
1595 	}
1596 
1597 	now = jiffies;
1598 	rt->dst.rate_tokens += now - rt->dst.rate_last;
1599 	if (rt->dst.rate_tokens > ip_rt_error_burst)
1600 		rt->dst.rate_tokens = ip_rt_error_burst;
1601 	rt->dst.rate_last = now;
1602 	if (rt->dst.rate_tokens >= ip_rt_error_cost) {
1603 		rt->dst.rate_tokens -= ip_rt_error_cost;
1604 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1605 	}
1606 
1607 out:	kfree_skb(skb);
1608 	return 0;
1609 }
1610 
1611 /*
1612  *	The last two values are not from the RFC but
1613  *	are needed for AMPRnet AX.25 paths.
1614  */
1615 
1616 static const unsigned short mtu_plateau[] =
1617 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1618 
1619 static inline unsigned short guess_mtu(unsigned short old_mtu)
1620 {
1621 	int i;
1622 
1623 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1624 		if (old_mtu > mtu_plateau[i])
1625 			return mtu_plateau[i];
1626 	return 68;
1627 }
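/*
 * Example (illustrative): guess_mtu(1500) returns 1492 and guess_mtu(576)
 * returns 296, because the comparison is strictly greater-than; anything
 * at or below the last plateau (128) falls through to the minimum of 68.
 */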
1628 
1629 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1630 				 unsigned short new_mtu,
1631 				 struct net_device *dev)
1632 {
1633 	int i, k;
1634 	unsigned short old_mtu = ntohs(iph->tot_len);
1635 	struct rtable *rth;
1636 	int  ikeys[2] = { dev->ifindex, 0 };
1637 	__be32  skeys[2] = { iph->saddr, 0, };
1638 	__be32  daddr = iph->daddr;
1639 	unsigned short est_mtu = 0;
1640 
1641 	for (k = 0; k < 2; k++) {
1642 		for (i = 0; i < 2; i++) {
1643 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1644 						rt_genid(net));
1645 
1646 			rcu_read_lock();
1647 			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1648 			     rth = rcu_dereference(rth->dst.rt_next)) {
1649 				unsigned short mtu = new_mtu;
1650 
1651 				if (rth->fl.fl4_dst != daddr ||
1652 				    rth->fl.fl4_src != skeys[i] ||
1653 				    rth->rt_dst != daddr ||
1654 				    rth->rt_src != iph->saddr ||
1655 				    rth->fl.oif != ikeys[k] ||
1656 				    rth->fl.iif != 0 ||
1657 				    dst_metric_locked(&rth->dst, RTAX_MTU) ||
1658 				    !net_eq(dev_net(rth->dst.dev), net) ||
1659 				    rt_is_expired(rth))
1660 					continue;
1661 
1662 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1663 
1664 					/* BSD 4.2 compatibility hack :-( */
1665 					if (mtu == 0 &&
1666 					    old_mtu >= dst_mtu(&rth->dst) &&
1667 					    old_mtu >= 68 + (iph->ihl << 2))
1668 						old_mtu -= iph->ihl << 2;
1669 
1670 					mtu = guess_mtu(old_mtu);
1671 				}
1672 				if (mtu <= dst_mtu(&rth->dst)) {
1673 					if (mtu < dst_mtu(&rth->dst)) {
1674 						dst_confirm(&rth->dst);
1675 						if (mtu < ip_rt_min_pmtu) {
1676 							mtu = ip_rt_min_pmtu;
1677 							rth->dst.metrics[RTAX_LOCK-1] |=
1678 								(1 << RTAX_MTU);
1679 						}
1680 						rth->dst.metrics[RTAX_MTU-1] = mtu;
1681 						dst_set_expires(&rth->dst,
1682 							ip_rt_mtu_expires);
1683 					}
1684 					est_mtu = mtu;
1685 				}
1686 			}
1687 			rcu_read_unlock();
1688 		}
1689 	}
1690 	return est_mtu ? : new_mtu;
1691 }
1692 
1693 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1694 {
1695 	if (dst_mtu(dst) > mtu && mtu >= 68 &&
1696 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1697 		if (mtu < ip_rt_min_pmtu) {
1698 			mtu = ip_rt_min_pmtu;
1699 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1700 		}
1701 		dst->metrics[RTAX_MTU-1] = mtu;
1702 		dst_set_expires(dst, ip_rt_mtu_expires);
1703 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1704 	}
1705 }
1706 
1707 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1708 {
1709 	if (rt_is_expired((struct rtable *)dst))
1710 		return NULL;
1711 	return dst;
1712 }
1713 
1714 static void ipv4_dst_destroy(struct dst_entry *dst)
1715 {
1716 	struct rtable *rt = (struct rtable *) dst;
1717 	struct inet_peer *peer = rt->peer;
1718 	struct in_device *idev = rt->idev;
1719 
1720 	if (peer) {
1721 		rt->peer = NULL;
1722 		inet_putpeer(peer);
1723 	}
1724 
1725 	if (idev) {
1726 		rt->idev = NULL;
1727 		in_dev_put(idev);
1728 	}
1729 }
1730 
1731 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1732 			    int how)
1733 {
1734 	struct rtable *rt = (struct rtable *) dst;
1735 	struct in_device *idev = rt->idev;
1736 	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1737 		struct in_device *loopback_idev =
1738 			in_dev_get(dev_net(dev)->loopback_dev);
1739 		if (loopback_idev) {
1740 			rt->idev = loopback_idev;
1741 			in_dev_put(idev);
1742 		}
1743 	}
1744 }
1745 
1746 static void ipv4_link_failure(struct sk_buff *skb)
1747 {
1748 	struct rtable *rt;
1749 
1750 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1751 
1752 	rt = skb_rtable(skb);
1753 	if (rt)
1754 		dst_set_expires(&rt->dst, 0);
1755 }
1756 
1757 static int ip_rt_bug(struct sk_buff *skb)
1758 {
1759 	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1760 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1761 		skb->dev ? skb->dev->name : "?");
1762 	kfree_skb(skb);
1763 	return 0;
1764 }
1765 
1766 /*
1767    We do not cache the source address of the outgoing interface,
1768    because it is used only by the IP RR, TS and SRR options,
1769    so it is out of the fast path.
1770 
1771    BTW remember: "addr" is allowed to be unaligned
1772    in IP options!
1773  */
1774 
1775 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1776 {
1777 	__be32 src;
1778 	struct fib_result res;
1779 
1780 	if (rt->fl.iif == 0)
1781 		src = rt->rt_src;
1782 	else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) {
1783 		src = FIB_RES_PREFSRC(res);
1784 		fib_res_put(&res);
1785 	} else
1786 		src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1787 					RT_SCOPE_UNIVERSE);
1788 	memcpy(addr, &src, 4);
1789 }
1790 
1791 #ifdef CONFIG_NET_CLS_ROUTE
1792 static void set_class_tag(struct rtable *rt, u32 tag)
1793 {
1794 	if (!(rt->dst.tclassid & 0xFFFF))
1795 		rt->dst.tclassid |= tag & 0xFFFF;
1796 	if (!(rt->dst.tclassid & 0xFFFF0000))
1797 		rt->dst.tclassid |= tag & 0xFFFF0000;
1798 }
1799 #endif
1800 
1801 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1802 {
1803 	struct fib_info *fi = res->fi;
1804 
1805 	if (fi) {
1806 		if (FIB_RES_GW(*res) &&
1807 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1808 			rt->rt_gateway = FIB_RES_GW(*res);
1809 		memcpy(rt->dst.metrics, fi->fib_metrics,
1810 		       sizeof(rt->dst.metrics));
1811 		if (fi->fib_mtu == 0) {
1812 			rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
1813 			if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
1814 			    rt->rt_gateway != rt->rt_dst &&
1815 			    rt->dst.dev->mtu > 576)
1816 				rt->dst.metrics[RTAX_MTU-1] = 576;
1817 		}
1818 #ifdef CONFIG_NET_CLS_ROUTE
1819 		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1820 #endif
1821 	} else
1822 		rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu;
1823 
1824 	if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
1825 		rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1826 	if (dst_mtu(&rt->dst) > IP_MAX_MTU)
1827 		rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1828 	if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
1829 		rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
1830 				       ip_rt_min_advmss);
1831 	if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
1832 		rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1833 
1834 #ifdef CONFIG_NET_CLS_ROUTE
1835 #ifdef CONFIG_IP_MULTIPLE_TABLES
1836 	set_class_tag(rt, fib_rules_tclass(res));
1837 #endif
1838 	set_class_tag(rt, itag);
1839 #endif
1840 	rt->rt_type = res->type;
1841 }
1842 
1843 /* called in rcu_read_lock() section */
1844 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1845 				u8 tos, struct net_device *dev, int our)
1846 {
1847 	unsigned int hash;
1848 	struct rtable *rth;
1849 	__be32 spec_dst;
1850 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1851 	u32 itag = 0;
1852 	int err;
1853 
1854 	/* Primary sanity checks. */
1855 
1856 	if (in_dev == NULL)
1857 		return -EINVAL;
1858 
1859 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1860 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1861 		goto e_inval;
1862 
1863 	if (ipv4_is_zeronet(saddr)) {
1864 		if (!ipv4_is_local_multicast(daddr))
1865 			goto e_inval;
1866 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1867 	} else {
1868 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1869 					  &itag, 0);
1870 		if (err < 0)
1871 			goto e_err;
1872 	}
1873 	rth = dst_alloc(&ipv4_dst_ops);
1874 	if (!rth)
1875 		goto e_nobufs;
1876 
1877 	rth->dst.output = ip_rt_bug;
1878 	rth->dst.obsolete = -1;
1879 
1880 	atomic_set(&rth->dst.__refcnt, 1);
1881 	rth->dst.flags= DST_HOST;
1882 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1883 		rth->dst.flags |= DST_NOPOLICY;
1884 	rth->fl.fl4_dst	= daddr;
1885 	rth->rt_dst	= daddr;
1886 	rth->fl.fl4_tos	= tos;
1887 	rth->fl.mark    = skb->mark;
1888 	rth->fl.fl4_src	= saddr;
1889 	rth->rt_src	= saddr;
1890 #ifdef CONFIG_NET_CLS_ROUTE
1891 	rth->dst.tclassid = itag;
1892 #endif
1893 	rth->rt_iif	=
1894 	rth->fl.iif	= dev->ifindex;
1895 	rth->dst.dev	= init_net.loopback_dev;
1896 	dev_hold(rth->dst.dev);
1897 	rth->idev	= in_dev_get(rth->dst.dev);
1898 	rth->fl.oif	= 0;
1899 	rth->rt_gateway	= daddr;
1900 	rth->rt_spec_dst = spec_dst;
1901 	rth->rt_genid	= rt_genid(dev_net(dev));
1902 	rth->rt_flags	= RTCF_MULTICAST;
1903 	rth->rt_type	= RTN_MULTICAST;
1904 	if (our) {
1905 		rth->dst.input = ip_local_deliver;
1906 		rth->rt_flags |= RTCF_LOCAL;
1907 	}
1908 
1909 #ifdef CONFIG_IP_MROUTE
1910 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1911 		rth->dst.input = ip_mr_input;
1912 #endif
1913 	RT_CACHE_STAT_INC(in_slow_mc);
1914 
1915 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1916 	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1917 
1918 e_nobufs:
1919 	return -ENOBUFS;
1920 e_inval:
1921 	return -EINVAL;
1922 e_err:
1923 	return err;
1924 }
1925 
1926 
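/*
 * Log a packet whose source address is impossible ("martian").  The
 * link-layer header is dumped as well, since it is the only hint to
 * where the packet really came from.
 */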
1927 static void ip_handle_martian_source(struct net_device *dev,
1928 				     struct in_device *in_dev,
1929 				     struct sk_buff *skb,
1930 				     __be32 daddr,
1931 				     __be32 saddr)
1932 {
1933 	RT_CACHE_STAT_INC(in_martian_src);
1934 #ifdef CONFIG_IP_ROUTE_VERBOSE
1935 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1936 		/*
1937 		 *	RFC 1812 recommendation: if the source is martian,
1938 		 *	the only hint is the MAC header.
1939 		 */
1940 		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1941 			&daddr, &saddr, dev->name);
1942 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1943 			int i;
1944 			const unsigned char *p = skb_mac_header(skb);
1945 			printk(KERN_WARNING "ll header: ");
1946 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1947 				printk("%02x", *p);
1948 				if (i < (dev->hard_header_len - 1))
1949 					printk(":");
1950 			}
1951 			printk("\n");
1952 		}
1953 	}
1954 #endif
1955 }
1956 
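/*
 * Build (but do not cache) a forwarding route entry for ip_route_input_slow():
 * validate the source address, decide whether an ICMP redirect should be
 * suggested, and fill in a new rtable that forwards via the FIB nexthop.
 */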
1957 /* called in rcu_read_lock() section */
1958 static int __mkroute_input(struct sk_buff *skb,
1959 			   struct fib_result *res,
1960 			   struct in_device *in_dev,
1961 			   __be32 daddr, __be32 saddr, u32 tos,
1962 			   struct rtable **result)
1963 {
1964 	struct rtable *rth;
1965 	int err;
1966 	struct in_device *out_dev;
1967 	unsigned int flags = 0;
1968 	__be32 spec_dst;
1969 	u32 itag;
1970 
1971 	/* get a working reference to the output device */
1972 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1973 	if (out_dev == NULL) {
1974 		if (net_ratelimit())
1975 			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1976 			       "Please report\n");
1977 		return -EINVAL;
1978 	}
1979 
1980 
1981 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1982 				  in_dev->dev, &spec_dst, &itag, skb->mark);
1983 	if (err < 0) {
1984 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1985 					 saddr);
1986 
1987 		goto cleanup;
1988 	}
1989 
1990 	if (err)
1991 		flags |= RTCF_DIRECTSRC;
1992 
1993 	if (out_dev == in_dev && err &&
1994 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1995 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1996 		flags |= RTCF_DOREDIRECT;
1997 
1998 	if (skb->protocol != htons(ETH_P_IP)) {
1999 		/* Not IP (i.e. ARP). Do not create a route if it is
2000 		 * invalid for proxy arp. DNAT routes are always valid.
2001 		 *
2002 		 * The proxy arp feature has been extended to allow ARP
2003 		 * replies back to the same interface, to support
2004 		 * Private VLAN switch technologies. See arp.c.
2005 		 */
2006 		if (out_dev == in_dev &&
2007 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2008 			err = -EINVAL;
2009 			goto cleanup;
2010 		}
2011 	}
2012 
2013 
2014 	rth = dst_alloc(&ipv4_dst_ops);
2015 	if (!rth) {
2016 		err = -ENOBUFS;
2017 		goto cleanup;
2018 	}
2019 
2020 	atomic_set(&rth->dst.__refcnt, 1);
2021 	rth->dst.flags = DST_HOST;
2022 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2023 		rth->dst.flags |= DST_NOPOLICY;
2024 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2025 		rth->dst.flags |= DST_NOXFRM;
2026 	rth->fl.fl4_dst	= daddr;
2027 	rth->rt_dst	= daddr;
2028 	rth->fl.fl4_tos	= tos;
2029 	rth->fl.mark    = skb->mark;
2030 	rth->fl.fl4_src	= saddr;
2031 	rth->rt_src	= saddr;
2032 	rth->rt_gateway	= daddr;
2033 	rth->rt_iif 	=
2034 		rth->fl.iif	= in_dev->dev->ifindex;
2035 	rth->dst.dev	= out_dev->dev;
2036 	dev_hold(rth->dst.dev);
2037 	rth->idev	= in_dev_get(rth->dst.dev);
2038 	rth->fl.oif 	= 0;
2039 	rth->rt_spec_dst = spec_dst;
2040 
2041 	rth->dst.obsolete = -1;
2042 	rth->dst.input = ip_forward;
2043 	rth->dst.output = ip_output;
2044 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2045 
2046 	rt_set_nexthop(rth, res, itag);
2047 
2048 	rth->rt_flags = flags;
2049 
2050 	*result = rth;
2051 	err = 0;
2052  cleanup:
2053 	return err;
2054 }
2055 
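/*
 * Wrapper around __mkroute_input(): optionally pick a multipath nexthop,
 * then insert the new entry into the route cache hash.
 */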
2056 static int ip_mkroute_input(struct sk_buff *skb,
2057 			    struct fib_result *res,
2058 			    const struct flowi *fl,
2059 			    struct in_device *in_dev,
2060 			    __be32 daddr, __be32 saddr, u32 tos)
2061 {
2062 	struct rtable* rth = NULL;
2063 	int err;
2064 	unsigned hash;
2065 
2066 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2067 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2068 		fib_select_multipath(fl, res);
2069 #endif
2070 
2071 	/* create a routing cache entry */
2072 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2073 	if (err)
2074 		return err;
2075 
2076 	/* put it into the cache */
2077 	hash = rt_hash(daddr, saddr, fl->iif,
2078 		       rt_genid(dev_net(rth->dst.dev)));
2079 	return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2080 }
2081 
2082 /*
2083  *	NOTE. We drop all packets that have a local source
2084  *	address, because every properly looped-back packet
2085  *	must already have the correct destination attached by the output routine.
2086  *
2087  *	This approach solves two big problems:
2088  *	1. Devices that are not simplex are handled properly.
2089  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2090  */
2091 
2092 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2093 			       u8 tos, struct net_device *dev)
2094 {
2095 	struct fib_result res;
2096 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2097 	struct flowi fl = { .nl_u = { .ip4_u =
2098 				      { .daddr = daddr,
2099 					.saddr = saddr,
2100 					.tos = tos,
2101 					.scope = RT_SCOPE_UNIVERSE,
2102 				      } },
2103 			    .mark = skb->mark,
2104 			    .iif = dev->ifindex };
2105 	unsigned	flags = 0;
2106 	u32		itag = 0;
2107 	struct rtable * rth;
2108 	unsigned	hash;
2109 	__be32		spec_dst;
2110 	int		err = -EINVAL;
2111 	int		free_res = 0;
2112 	struct net    * net = dev_net(dev);
2113 
2114 	/* IP on this device is disabled. */
2115 
2116 	if (!in_dev)
2117 		goto out;
2118 
2119 	/* Check for the weirdest martians, which may not be detected
2120 	   by fib_lookup.
2121 	 */
2122 
2123 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2124 	    ipv4_is_loopback(saddr))
2125 		goto martian_source;
2126 
2127 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2128 		goto brd_input;
2129 
2130 	/* Accept zero addresses only for limited broadcast;
2131 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2132 	 */
2133 	if (ipv4_is_zeronet(saddr))
2134 		goto martian_source;
2135 
2136 	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2137 	    ipv4_is_loopback(daddr))
2138 		goto martian_destination;
2139 
2140 	/*
2141 	 *	Now we are ready to route packet.
2142 	 */
2143 	if ((err = fib_lookup(net, &fl, &res)) != 0) {
2144 		if (!IN_DEV_FORWARD(in_dev))
2145 			goto e_hostunreach;
2146 		goto no_route;
2147 	}
2148 	free_res = 1;
2149 
2150 	RT_CACHE_STAT_INC(in_slow_tot);
2151 
2152 	if (res.type == RTN_BROADCAST)
2153 		goto brd_input;
2154 
2155 	if (res.type == RTN_LOCAL) {
2156 		err = fib_validate_source(saddr, daddr, tos,
2157 					     net->loopback_dev->ifindex,
2158 					     dev, &spec_dst, &itag, skb->mark);
2159 		if (err < 0)
2160 			goto martian_source_keep_err;
2161 		if (err)
2162 			flags |= RTCF_DIRECTSRC;
2163 		spec_dst = daddr;
2164 		goto local_input;
2165 	}
2166 
2167 	if (!IN_DEV_FORWARD(in_dev))
2168 		goto e_hostunreach;
2169 	if (res.type != RTN_UNICAST)
2170 		goto martian_destination;
2171 
2172 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2173 done:
2174 	if (free_res)
2175 		fib_res_put(&res);
2176 out:	return err;
2177 
2178 brd_input:
2179 	if (skb->protocol != htons(ETH_P_IP))
2180 		goto e_inval;
2181 
2182 	if (ipv4_is_zeronet(saddr))
2183 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2184 	else {
2185 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2186 					  &itag, skb->mark);
2187 		if (err < 0)
2188 			goto martian_source_keep_err;
2189 		if (err)
2190 			flags |= RTCF_DIRECTSRC;
2191 	}
2192 	flags |= RTCF_BROADCAST;
2193 	res.type = RTN_BROADCAST;
2194 	RT_CACHE_STAT_INC(in_brd);
2195 
2196 local_input:
2197 	rth = dst_alloc(&ipv4_dst_ops);
2198 	if (!rth)
2199 		goto e_nobufs;
2200 
2201 	rth->dst.output = ip_rt_bug;
2202 	rth->dst.obsolete = -1;
2203 	rth->rt_genid = rt_genid(net);
2204 
2205 	atomic_set(&rth->dst.__refcnt, 1);
2206 	rth->dst.flags = DST_HOST;
2207 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2208 		rth->dst.flags |= DST_NOPOLICY;
2209 	rth->fl.fl4_dst	= daddr;
2210 	rth->rt_dst	= daddr;
2211 	rth->fl.fl4_tos	= tos;
2212 	rth->fl.mark    = skb->mark;
2213 	rth->fl.fl4_src	= saddr;
2214 	rth->rt_src	= saddr;
2215 #ifdef CONFIG_NET_CLS_ROUTE
2216 	rth->dst.tclassid = itag;
2217 #endif
2218 	rth->rt_iif	=
2219 	rth->fl.iif	= dev->ifindex;
2220 	rth->dst.dev	= net->loopback_dev;
2221 	dev_hold(rth->dst.dev);
2222 	rth->idev	= in_dev_get(rth->dst.dev);
2223 	rth->rt_gateway	= daddr;
2224 	rth->rt_spec_dst = spec_dst;
2225 	rth->dst.input = ip_local_deliver;
2226 	rth->rt_flags	= flags | RTCF_LOCAL;
2227 	if (res.type == RTN_UNREACHABLE) {
2228 		rth->dst.input = ip_error;
2229 		rth->dst.error = -err;
2230 		rth->rt_flags 	&= ~RTCF_LOCAL;
2231 	}
2232 	rth->rt_type	= res.type;
2233 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2234 	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2235 	goto done;
2236 
2237 no_route:
2238 	RT_CACHE_STAT_INC(in_no_route);
2239 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2240 	res.type = RTN_UNREACHABLE;
2241 	if (err == -ESRCH)
2242 		err = -ENETUNREACH;
2243 	goto local_input;
2244 
2245 	/*
2246 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2247 	 */
2248 martian_destination:
2249 	RT_CACHE_STAT_INC(in_martian_dst);
2250 #ifdef CONFIG_IP_ROUTE_VERBOSE
2251 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2252 		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2253 			&daddr, &saddr, dev->name);
2254 #endif
2255 
2256 e_hostunreach:
2257 	err = -EHOSTUNREACH;
2258 	goto done;
2259 
2260 e_inval:
2261 	err = -EINVAL;
2262 	goto done;
2263 
2264 e_nobufs:
2265 	err = -ENOBUFS;
2266 	goto done;
2267 
2268 martian_source:
2269 	err = -EINVAL;
2270 martian_source_keep_err:
2271 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2272 	goto done;
2273 }
2274 
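/*
 * Main input route lookup: probe the route cache under RCU first; on a
 * miss fall back to ip_route_input_mc() for multicast destinations or to
 * ip_route_input_slow() otherwise.  With @noref the cache entry is
 * attached to the skb without taking a reference, which assumes the
 * caller keeps the entry alive (e.g. stays within the RCU read-side
 * section of the receive path).
 */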
2275 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2276 			   u8 tos, struct net_device *dev, bool noref)
2277 {
2278 	struct rtable * rth;
2279 	unsigned	hash;
2280 	int iif = dev->ifindex;
2281 	struct net *net;
2282 	int res;
2283 
2284 	net = dev_net(dev);
2285 
2286 	rcu_read_lock();
2287 
2288 	if (!rt_caching(net))
2289 		goto skip_cache;
2290 
2291 	tos &= IPTOS_RT_MASK;
2292 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2293 
2294 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2295 	     rth = rcu_dereference(rth->dst.rt_next)) {
2296 		if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2297 		     ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2298 		     (rth->fl.iif ^ iif) |
2299 		     rth->fl.oif |
2300 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
2301 		    rth->fl.mark == skb->mark &&
2302 		    net_eq(dev_net(rth->dst.dev), net) &&
2303 		    !rt_is_expired(rth)) {
2304 			if (noref) {
2305 				dst_use_noref(&rth->dst, jiffies);
2306 				skb_dst_set_noref(skb, &rth->dst);
2307 			} else {
2308 				dst_use(&rth->dst, jiffies);
2309 				skb_dst_set(skb, &rth->dst);
2310 			}
2311 			RT_CACHE_STAT_INC(in_hit);
2312 			rcu_read_unlock();
2313 			return 0;
2314 		}
2315 		RT_CACHE_STAT_INC(in_hlist_search);
2316 	}
2317 
2318 skip_cache:
2319 	/* Multicast recognition logic was moved from the route cache to here.
2320 	   The problem was that too many Ethernet cards have broken/missing
2321 	   hardware multicast filters :-( As a result, a host on a multicast
2322 	   network acquires a lot of useless route cache entries, e.g. from
2323 	   SDR messages from all over the world. Now we try to get rid of them.
2324 	   Really, provided the software IP multicast filter is organized
2325 	   reasonably (at least hashed), it does not cause a slowdown
2326 	   compared with route cache reject entries.
2327 	   Note that multicast routers are not affected, because a
2328 	   route cache entry is created for them eventually.
2329 	 */
2330 	if (ipv4_is_multicast(daddr)) {
2331 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2332 
2333 		if (in_dev) {
2334 			int our = ip_check_mc(in_dev, daddr, saddr,
2335 					      ip_hdr(skb)->protocol);
2336 			if (our
2337 #ifdef CONFIG_IP_MROUTE
2338 				||
2339 			    (!ipv4_is_local_multicast(daddr) &&
2340 			     IN_DEV_MFORWARD(in_dev))
2341 #endif
2342 			   ) {
2343 				int res = ip_route_input_mc(skb, daddr, saddr,
2344 							    tos, dev, our);
2345 				rcu_read_unlock();
2346 				return res;
2347 			}
2348 		}
2349 		rcu_read_unlock();
2350 		return -EINVAL;
2351 	}
2352 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2353 	rcu_read_unlock();
2354 	return res;
2355 }
2356 EXPORT_SYMBOL(ip_route_input_common);
2357 
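/*
 * Allocate and fill a route cache entry for an output route described by
 * the FIB result and the flow keys; broadcast and multicast destinations
 * get their special-cased output handlers here.
 */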
2358 static int __mkroute_output(struct rtable **result,
2359 			    struct fib_result *res,
2360 			    const struct flowi *fl,
2361 			    const struct flowi *oldflp,
2362 			    struct net_device *dev_out,
2363 			    unsigned flags)
2364 {
2365 	struct rtable *rth;
2366 	struct in_device *in_dev;
2367 	u32 tos = RT_FL_TOS(oldflp);
2368 	int err = 0;
2369 
2370 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2371 		return -EINVAL;
2372 
2373 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2374 		res->type = RTN_BROADCAST;
2375 	else if (ipv4_is_multicast(fl->fl4_dst))
2376 		res->type = RTN_MULTICAST;
2377 	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2378 		return -EINVAL;
2379 
2380 	if (dev_out->flags & IFF_LOOPBACK)
2381 		flags |= RTCF_LOCAL;
2382 
2383 	/* get a working reference to the inet device */
2384 	in_dev = in_dev_get(dev_out);
2385 	if (!in_dev)
2386 		return -EINVAL;
2387 
2388 	if (res->type == RTN_BROADCAST) {
2389 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2390 		if (res->fi) {
2391 			fib_info_put(res->fi);
2392 			res->fi = NULL;
2393 		}
2394 	} else if (res->type == RTN_MULTICAST) {
2395 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2396 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2397 				 oldflp->proto))
2398 			flags &= ~RTCF_LOCAL;
2399 		/* If a multicast route does not exist, use the
2400 		   default one, but do not use a gateway in this case.
2401 		   Yes, it is a hack.
2402 		 */
2403 		if (res->fi && res->prefixlen < 4) {
2404 			fib_info_put(res->fi);
2405 			res->fi = NULL;
2406 		}
2407 	}
2408 
2409 
2410 	rth = dst_alloc(&ipv4_dst_ops);
2411 	if (!rth) {
2412 		err = -ENOBUFS;
2413 		goto cleanup;
2414 	}
2415 
2416 	atomic_set(&rth->dst.__refcnt, 1);
2417 	rth->dst.flags = DST_HOST;
2418 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2419 		rth->dst.flags |= DST_NOXFRM;
2420 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2421 		rth->dst.flags |= DST_NOPOLICY;
2422 
2423 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2424 	rth->fl.fl4_tos	= tos;
2425 	rth->fl.fl4_src	= oldflp->fl4_src;
2426 	rth->fl.oif	= oldflp->oif;
2427 	rth->fl.mark    = oldflp->mark;
2428 	rth->rt_dst	= fl->fl4_dst;
2429 	rth->rt_src	= fl->fl4_src;
2430 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2431 	/* get references to the devices that are to be held by the routing
2432 	   cache entry */
2433 	rth->dst.dev	= dev_out;
2434 	dev_hold(dev_out);
2435 	rth->idev	= in_dev_get(dev_out);
2436 	rth->rt_gateway = fl->fl4_dst;
2437 	rth->rt_spec_dst = fl->fl4_src;
2438 
2439 	rth->dst.output = ip_output;
2440 	rth->dst.obsolete = -1;
2441 	rth->rt_genid = rt_genid(dev_net(dev_out));
2442 
2443 	RT_CACHE_STAT_INC(out_slow_tot);
2444 
2445 	if (flags & RTCF_LOCAL) {
2446 		rth->dst.input = ip_local_deliver;
2447 		rth->rt_spec_dst = fl->fl4_dst;
2448 	}
2449 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2450 		rth->rt_spec_dst = fl->fl4_src;
2451 		if (flags & RTCF_LOCAL &&
2452 		    !(dev_out->flags & IFF_LOOPBACK)) {
2453 			rth->dst.output = ip_mc_output;
2454 			RT_CACHE_STAT_INC(out_slow_mc);
2455 		}
2456 #ifdef CONFIG_IP_MROUTE
2457 		if (res->type == RTN_MULTICAST) {
2458 			if (IN_DEV_MFORWARD(in_dev) &&
2459 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2460 				rth->dst.input = ip_mr_input;
2461 				rth->dst.output = ip_mc_output;
2462 			}
2463 		}
2464 #endif
2465 	}
2466 
2467 	rt_set_nexthop(rth, res, 0);
2468 
2469 	rth->rt_flags = flags;
2470 
2471 	*result = rth;
2472  cleanup:
2473 	/* release the working reference to the inet device */
2474 	in_dev_put(in_dev);
2475 
2476 	return err;
2477 }
2478 
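/*
 * Create an output route via __mkroute_output() and insert it into the
 * route cache hash under the original flow's keys.
 */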
2479 static int ip_mkroute_output(struct rtable **rp,
2480 			     struct fib_result *res,
2481 			     const struct flowi *fl,
2482 			     const struct flowi *oldflp,
2483 			     struct net_device *dev_out,
2484 			     unsigned flags)
2485 {
2486 	struct rtable *rth = NULL;
2487 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2488 	unsigned hash;
2489 	if (err == 0) {
2490 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2491 			       rt_genid(dev_net(dev_out)));
2492 		err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2493 	}
2494 
2495 	return err;
2496 }
2497 
2498 /*
2499  * Major route resolver routine.
2500  */
2501 
2502 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2503 				const struct flowi *oldflp)
2504 {
2505 	u32 tos	= RT_FL_TOS(oldflp);
2506 	struct flowi fl = { .nl_u = { .ip4_u =
2507 				      { .daddr = oldflp->fl4_dst,
2508 					.saddr = oldflp->fl4_src,
2509 					.tos = tos & IPTOS_RT_MASK,
2510 					.scope = ((tos & RTO_ONLINK) ?
2511 						  RT_SCOPE_LINK :
2512 						  RT_SCOPE_UNIVERSE),
2513 				      } },
2514 			    .mark = oldflp->mark,
2515 			    .iif = net->loopback_dev->ifindex,
2516 			    .oif = oldflp->oif };
2517 	struct fib_result res;
2518 	unsigned flags = 0;
2519 	struct net_device *dev_out = NULL;
2520 	int free_res = 0;
2521 	int err;
2522 
2523 
2524 	res.fi		= NULL;
2525 #ifdef CONFIG_IP_MULTIPLE_TABLES
2526 	res.r		= NULL;
2527 #endif
2528 
2529 	if (oldflp->fl4_src) {
2530 		err = -EINVAL;
2531 		if (ipv4_is_multicast(oldflp->fl4_src) ||
2532 		    ipv4_is_lbcast(oldflp->fl4_src) ||
2533 		    ipv4_is_zeronet(oldflp->fl4_src))
2534 			goto out;
2535 
2536 		/* I removed the check for oif == dev_out->oif here.
2537 		   It was wrong for two reasons:
2538 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2539 		      is assigned to multiple interfaces.
2540 		   2. Moreover, we are allowed to send packets with the saddr
2541 		      of another iface. --ANK
2542 		 */
2543 
2544 		if (oldflp->oif == 0 &&
2545 		    (ipv4_is_multicast(oldflp->fl4_dst) ||
2546 		     oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2547 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2548 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2549 			if (dev_out == NULL)
2550 				goto out;
2551 			/* Special hack: the user can direct multicasts
2552 			   and limited broadcast via the necessary interface
2553 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2554 			   This hack is not just for fun, it allows
2555 			   vic, vat and friends to work.
2556 			   They bind a socket to loopback, set the ttl to zero
2557 			   and expect that it will work.
2558 			   From the viewpoint of the routing cache they are broken,
2559 			   because we are not allowed to build a multicast path
2560 			   with a loopback source address (look, the routing cache
2561 			   cannot know that the ttl is zero, so the packet
2562 			   will not leave this host and the route is valid).
2563 			   Luckily, this hack is a good workaround.
2564 			 */
2566 
2567 			fl.oif = dev_out->ifindex;
2568 			goto make_route;
2569 		}
2570 
2571 		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2572 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2573 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2574 			if (dev_out == NULL)
2575 				goto out;
2576 			dev_put(dev_out);
2577 			dev_out = NULL;
2578 		}
2579 	}
2580 
2581 
2582 	if (oldflp->oif) {
2583 		dev_out = dev_get_by_index(net, oldflp->oif);
2584 		err = -ENODEV;
2585 		if (dev_out == NULL)
2586 			goto out;
2587 
2588 		/* RACE: Check return value of inet_select_addr instead. */
2589 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2590 			dev_put(dev_out);
2591 			goto out;	/* Wrong error code */
2592 		}
2593 
2594 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2595 		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2596 			if (!fl.fl4_src)
2597 				fl.fl4_src = inet_select_addr(dev_out, 0,
2598 							      RT_SCOPE_LINK);
2599 			goto make_route;
2600 		}
2601 		if (!fl.fl4_src) {
2602 			if (ipv4_is_multicast(oldflp->fl4_dst))
2603 				fl.fl4_src = inet_select_addr(dev_out, 0,
2604 							      fl.fl4_scope);
2605 			else if (!oldflp->fl4_dst)
2606 				fl.fl4_src = inet_select_addr(dev_out, 0,
2607 							      RT_SCOPE_HOST);
2608 		}
2609 	}
2610 
2611 	if (!fl.fl4_dst) {
2612 		fl.fl4_dst = fl.fl4_src;
2613 		if (!fl.fl4_dst)
2614 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2615 		if (dev_out)
2616 			dev_put(dev_out);
2617 		dev_out = net->loopback_dev;
2618 		dev_hold(dev_out);
2619 		fl.oif = net->loopback_dev->ifindex;
2620 		res.type = RTN_LOCAL;
2621 		flags |= RTCF_LOCAL;
2622 		goto make_route;
2623 	}
2624 
2625 	if (fib_lookup(net, &fl, &res)) {
2626 		res.fi = NULL;
2627 		if (oldflp->oif) {
2628 			/* Apparently, the routing tables are wrong. Assume
2629 			   that the destination is on-link.
2630 
2631 			   WHY? DW.
2632 			   Because we are allowed to send to an iface
2633 			   even if it has NO routes and NO assigned
2634 			   addresses. When oif is specified, the routing
2635 			   tables are looked up with only one purpose:
2636 			   to determine whether the destination is gatewayed
2637 			   rather than directly connected. Moreover, if MSG_DONTROUTE
2638 			   is set, we send the packet ignoring both the routing
2639 			   tables and the ifaddr state. --ANK
2640 
2641 
2642 			   We could do this even when oif is unknown
2643 			   (as IPv6 likely does), but we do not.
2644 			 */
2645 
2646 			if (fl.fl4_src == 0)
2647 				fl.fl4_src = inet_select_addr(dev_out, 0,
2648 							      RT_SCOPE_LINK);
2649 			res.type = RTN_UNICAST;
2650 			goto make_route;
2651 		}
2652 		if (dev_out)
2653 			dev_put(dev_out);
2654 		err = -ENETUNREACH;
2655 		goto out;
2656 	}
2657 	free_res = 1;
2658 
2659 	if (res.type == RTN_LOCAL) {
2660 		if (!fl.fl4_src)
2661 			fl.fl4_src = fl.fl4_dst;
2662 		if (dev_out)
2663 			dev_put(dev_out);
2664 		dev_out = net->loopback_dev;
2665 		dev_hold(dev_out);
2666 		fl.oif = dev_out->ifindex;
2667 		if (res.fi)
2668 			fib_info_put(res.fi);
2669 		res.fi = NULL;
2670 		flags |= RTCF_LOCAL;
2671 		goto make_route;
2672 	}
2673 
2674 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2675 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2676 		fib_select_multipath(&fl, &res);
2677 	else
2678 #endif
2679 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2680 		fib_select_default(net, &fl, &res);
2681 
2682 	if (!fl.fl4_src)
2683 		fl.fl4_src = FIB_RES_PREFSRC(res);
2684 
2685 	if (dev_out)
2686 		dev_put(dev_out);
2687 	dev_out = FIB_RES_DEV(res);
2688 	dev_hold(dev_out);
2689 	fl.oif = dev_out->ifindex;
2690 
2691 
2692 make_route:
2693 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2694 
2695 
2696 	if (free_res)
2697 		fib_res_put(&res);
2698 	if (dev_out)
2699 		dev_put(dev_out);
2700 out:	return err;
2701 }
2702 
2703 int __ip_route_output_key(struct net *net, struct rtable **rp,
2704 			  const struct flowi *flp)
2705 {
2706 	unsigned hash;
2707 	struct rtable *rth;
2708 
2709 	if (!rt_caching(net))
2710 		goto slow_output;
2711 
2712 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2713 
2714 	rcu_read_lock_bh();
2715 	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2716 		rth = rcu_dereference_bh(rth->dst.rt_next)) {
2717 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2718 		    rth->fl.fl4_src == flp->fl4_src &&
2719 		    rth->fl.iif == 0 &&
2720 		    rth->fl.oif == flp->oif &&
2721 		    rth->fl.mark == flp->mark &&
2722 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2723 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2724 		    net_eq(dev_net(rth->dst.dev), net) &&
2725 		    !rt_is_expired(rth)) {
2726 			dst_use(&rth->dst, jiffies);
2727 			RT_CACHE_STAT_INC(out_hit);
2728 			rcu_read_unlock_bh();
2729 			*rp = rth;
2730 			return 0;
2731 		}
2732 		RT_CACHE_STAT_INC(out_hlist_search);
2733 	}
2734 	rcu_read_unlock_bh();
2735 
2736 slow_output:
2737 	return ip_route_output_slow(net, rp, flp);
2738 }
2739 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2740 
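/*
 * Blackhole dst_ops, used by ipv4_dst_blackhole() below when __xfrm_lookup()
 * returns -EREMOTE (see ip_route_output_flow()).  A blackhole route discards
 * every packet, so PMTU updates are simply ignored.
 */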
2741 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2742 {
2743 }
2744 
2745 static struct dst_ops ipv4_dst_blackhole_ops = {
2746 	.family			=	AF_INET,
2747 	.protocol		=	cpu_to_be16(ETH_P_IP),
2748 	.destroy		=	ipv4_dst_destroy,
2749 	.check			=	ipv4_dst_check,
2750 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2751 	.entries		=	ATOMIC_INIT(0),
2752 };
2753 
2754 
2755 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2756 {
2757 	struct rtable *ort = *rp;
2758 	struct rtable *rt = (struct rtable *)
2759 		dst_alloc(&ipv4_dst_blackhole_ops);
2760 
2761 	if (rt) {
2762 		struct dst_entry *new = &rt->dst;
2763 
2764 		atomic_set(&new->__refcnt, 1);
2765 		new->__use = 1;
2766 		new->input = dst_discard;
2767 		new->output = dst_discard;
2768 		memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
2769 
2770 		new->dev = ort->dst.dev;
2771 		if (new->dev)
2772 			dev_hold(new->dev);
2773 
2774 		rt->fl = ort->fl;
2775 
2776 		rt->idev = ort->idev;
2777 		if (rt->idev)
2778 			in_dev_hold(rt->idev);
2779 		rt->rt_genid = rt_genid(net);
2780 		rt->rt_flags = ort->rt_flags;
2781 		rt->rt_type = ort->rt_type;
2782 		rt->rt_dst = ort->rt_dst;
2783 		rt->rt_src = ort->rt_src;
2784 		rt->rt_iif = ort->rt_iif;
2785 		rt->rt_gateway = ort->rt_gateway;
2786 		rt->rt_spec_dst = ort->rt_spec_dst;
2787 		rt->peer = ort->peer;
2788 		if (rt->peer)
2789 			atomic_inc(&rt->peer->refcnt);
2790 
2791 		dst_free(new);
2792 	}
2793 
2794 	dst_release(&(*rp)->dst);
2795 	*rp = rt;
2796 	return (rt ? 0 : -ENOMEM);
2797 }
2798 
2799 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2800 			 struct sock *sk, int flags)
2801 {
2802 	int err;
2803 
2804 	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2805 		return err;
2806 
2807 	if (flp->proto) {
2808 		if (!flp->fl4_src)
2809 			flp->fl4_src = (*rp)->rt_src;
2810 		if (!flp->fl4_dst)
2811 			flp->fl4_dst = (*rp)->rt_dst;
2812 		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2813 				    flags ? XFRM_LOOKUP_WAIT : 0);
2814 		if (err == -EREMOTE)
2815 			err = ipv4_dst_blackhole(net, rp, flp);
2816 
2817 		return err;
2818 	}
2819 
2820 	return 0;
2821 }
2822 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2823 
2824 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2825 {
2826 	return ip_route_output_flow(net, rp, flp, NULL, 0);
2827 }
2828 EXPORT_SYMBOL(ip_route_output_key);
2829 
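/*
 * Fill a netlink RTM_NEWROUTE message describing @rt.  Used both for
 * RTM_GETROUTE replies and for dumping the route cache.
 */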
2830 static int rt_fill_info(struct net *net,
2831 			struct sk_buff *skb, u32 pid, u32 seq, int event,
2832 			int nowait, unsigned int flags)
2833 {
2834 	struct rtable *rt = skb_rtable(skb);
2835 	struct rtmsg *r;
2836 	struct nlmsghdr *nlh;
2837 	long expires;
2838 	u32 id = 0, ts = 0, tsage = 0, error;
2839 
2840 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2841 	if (nlh == NULL)
2842 		return -EMSGSIZE;
2843 
2844 	r = nlmsg_data(nlh);
2845 	r->rtm_family	 = AF_INET;
2846 	r->rtm_dst_len	= 32;
2847 	r->rtm_src_len	= 0;
2848 	r->rtm_tos	= rt->fl.fl4_tos;
2849 	r->rtm_table	= RT_TABLE_MAIN;
2850 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2851 	r->rtm_type	= rt->rt_type;
2852 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2853 	r->rtm_protocol = RTPROT_UNSPEC;
2854 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2855 	if (rt->rt_flags & RTCF_NOTIFY)
2856 		r->rtm_flags |= RTM_F_NOTIFY;
2857 
2858 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2859 
2860 	if (rt->fl.fl4_src) {
2861 		r->rtm_src_len = 32;
2862 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2863 	}
2864 	if (rt->dst.dev)
2865 		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2866 #ifdef CONFIG_NET_CLS_ROUTE
2867 	if (rt->dst.tclassid)
2868 		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2869 #endif
2870 	if (rt->fl.iif)
2871 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2872 	else if (rt->rt_src != rt->fl.fl4_src)
2873 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2874 
2875 	if (rt->rt_dst != rt->rt_gateway)
2876 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2877 
2878 	if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
2879 		goto nla_put_failure;
2880 
2881 	if (rt->fl.mark)
2882 		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
2883 
2884 	error = rt->dst.error;
2885 	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
2886 	if (rt->peer) {
2887 		inet_peer_refcheck(rt->peer);
2888 		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2889 		if (rt->peer->tcp_ts_stamp) {
2890 			ts = rt->peer->tcp_ts;
2891 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2892 		}
2893 	}
2894 
2895 	if (rt->fl.iif) {
2896 #ifdef CONFIG_IP_MROUTE
2897 		__be32 dst = rt->rt_dst;
2898 
2899 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2900 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2901 			int err = ipmr_get_route(net, skb, r, nowait);
2902 			if (err <= 0) {
2903 				if (!nowait) {
2904 					if (err == 0)
2905 						return 0;
2906 					goto nla_put_failure;
2907 				} else {
2908 					if (err == -EMSGSIZE)
2909 						goto nla_put_failure;
2910 					error = err;
2911 				}
2912 			}
2913 		} else
2914 #endif
2915 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2916 	}
2917 
2918 	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2919 			       expires, error) < 0)
2920 		goto nla_put_failure;
2921 
2922 	return nlmsg_end(skb, nlh);
2923 
2924 nla_put_failure:
2925 	nlmsg_cancel(skb, nlh);
2926 	return -EMSGSIZE;
2927 }
2928 
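/*
 * RTM_GETROUTE handler: build a dummy skb, resolve the route either as an
 * input route via ip_route_input() (when RTA_IIF is given) or as an output
 * route via ip_route_output_key(), and report the result back to the
 * requesting socket with rt_fill_info().
 */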
2929 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2930 {
2931 	struct net *net = sock_net(in_skb->sk);
2932 	struct rtmsg *rtm;
2933 	struct nlattr *tb[RTA_MAX+1];
2934 	struct rtable *rt = NULL;
2935 	__be32 dst = 0;
2936 	__be32 src = 0;
2937 	u32 iif;
2938 	int err;
2939 	int mark;
2940 	struct sk_buff *skb;
2941 
2942 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2943 	if (err < 0)
2944 		goto errout;
2945 
2946 	rtm = nlmsg_data(nlh);
2947 
2948 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2949 	if (skb == NULL) {
2950 		err = -ENOBUFS;
2951 		goto errout;
2952 	}
2953 
2954 	/* Reserve room for dummy headers; this skb can pass
2955 	   through a good chunk of the routing engine.
2956 	 */
2957 	skb_reset_mac_header(skb);
2958 	skb_reset_network_header(skb);
2959 
2960 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2961 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2962 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2963 
2964 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2965 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2966 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2967 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2968 
2969 	if (iif) {
2970 		struct net_device *dev;
2971 
2972 		dev = __dev_get_by_index(net, iif);
2973 		if (dev == NULL) {
2974 			err = -ENODEV;
2975 			goto errout_free;
2976 		}
2977 
2978 		skb->protocol	= htons(ETH_P_IP);
2979 		skb->dev	= dev;
2980 		skb->mark	= mark;
2981 		local_bh_disable();
2982 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2983 		local_bh_enable();
2984 
2985 		rt = skb_rtable(skb);
2986 		if (err == 0 && rt->dst.error)
2987 			err = -rt->dst.error;
2988 	} else {
2989 		struct flowi fl = {
2990 			.nl_u = {
2991 				.ip4_u = {
2992 					.daddr = dst,
2993 					.saddr = src,
2994 					.tos = rtm->rtm_tos,
2995 				},
2996 			},
2997 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2998 			.mark = mark,
2999 		};
3000 		err = ip_route_output_key(net, &rt, &fl);
3001 	}
3002 
3003 	if (err)
3004 		goto errout_free;
3005 
3006 	skb_dst_set(skb, &rt->dst);
3007 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3008 		rt->rt_flags |= RTCF_NOTIFY;
3009 
3010 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3011 			   RTM_NEWROUTE, 0, 0);
3012 	if (err <= 0)
3013 		goto errout_free;
3014 
3015 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3016 errout:
3017 	return err;
3018 
3019 errout_free:
3020 	kfree_skb(skb);
3021 	goto errout;
3022 }
3023 
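/*
 * Dump the whole route cache over netlink, resuming from the hash chain
 * and index saved in cb->args[] on the previous call.
 */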
3024 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3025 {
3026 	struct rtable *rt;
3027 	int h, s_h;
3028 	int idx, s_idx;
3029 	struct net *net;
3030 
3031 	net = sock_net(skb->sk);
3032 
3033 	s_h = cb->args[0];
3034 	if (s_h < 0)
3035 		s_h = 0;
3036 	s_idx = idx = cb->args[1];
3037 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3038 		if (!rt_hash_table[h].chain)
3039 			continue;
3040 		rcu_read_lock_bh();
3041 		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3042 		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3043 			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3044 				continue;
3045 			if (rt_is_expired(rt))
3046 				continue;
3047 			skb_dst_set_noref(skb, &rt->dst);
3048 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3049 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3050 					 1, NLM_F_MULTI) <= 0) {
3051 				skb_dst_drop(skb);
3052 				rcu_read_unlock_bh();
3053 				goto done;
3054 			}
3055 			skb_dst_drop(skb);
3056 		}
3057 		rcu_read_unlock_bh();
3058 	}
3059 
3060 done:
3061 	cb->args[0] = h;
3062 	cb->args[1] = idx;
3063 	return skb->len;
3064 }
3065 
3066 void ip_rt_multicast_event(struct in_device *in_dev)
3067 {
3068 	rt_cache_flush(dev_net(in_dev->dev), 0);
3069 }
3070 
3071 #ifdef CONFIG_SYSCTL
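/*
 * Writing an integer to the "flush" sysctl flushes the route cache of the
 * writing process' network namespace; the value written is passed to
 * rt_cache_flush() as the flush delay.
 */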
3072 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3073 					void __user *buffer,
3074 					size_t *lenp, loff_t *ppos)
3075 {
3076 	if (write) {
3077 		int flush_delay;
3078 		ctl_table ctl;
3079 		struct net *net;
3080 
3081 		memcpy(&ctl, __ctl, sizeof(ctl));
3082 		ctl.data = &flush_delay;
3083 		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3084 
3085 		net = (struct net *)__ctl->extra1;
3086 		rt_cache_flush(net, flush_delay);
3087 		return 0;
3088 	}
3089 
3090 	return -EINVAL;
3091 }
3092 
3093 static ctl_table ipv4_route_table[] = {
3094 	{
3095 		.procname	= "gc_thresh",
3096 		.data		= &ipv4_dst_ops.gc_thresh,
3097 		.maxlen		= sizeof(int),
3098 		.mode		= 0644,
3099 		.proc_handler	= proc_dointvec,
3100 	},
3101 	{
3102 		.procname	= "max_size",
3103 		.data		= &ip_rt_max_size,
3104 		.maxlen		= sizeof(int),
3105 		.mode		= 0644,
3106 		.proc_handler	= proc_dointvec,
3107 	},
3108 	{
3109 		/* Deprecated. Use gc_min_interval_ms */
3110 
3111 		.procname	= "gc_min_interval",
3112 		.data		= &ip_rt_gc_min_interval,
3113 		.maxlen		= sizeof(int),
3114 		.mode		= 0644,
3115 		.proc_handler	= proc_dointvec_jiffies,
3116 	},
3117 	{
3118 		.procname	= "gc_min_interval_ms",
3119 		.data		= &ip_rt_gc_min_interval,
3120 		.maxlen		= sizeof(int),
3121 		.mode		= 0644,
3122 		.proc_handler	= proc_dointvec_ms_jiffies,
3123 	},
3124 	{
3125 		.procname	= "gc_timeout",
3126 		.data		= &ip_rt_gc_timeout,
3127 		.maxlen		= sizeof(int),
3128 		.mode		= 0644,
3129 		.proc_handler	= proc_dointvec_jiffies,
3130 	},
3131 	{
3132 		.procname	= "gc_interval",
3133 		.data		= &ip_rt_gc_interval,
3134 		.maxlen		= sizeof(int),
3135 		.mode		= 0644,
3136 		.proc_handler	= proc_dointvec_jiffies,
3137 	},
3138 	{
3139 		.procname	= "redirect_load",
3140 		.data		= &ip_rt_redirect_load,
3141 		.maxlen		= sizeof(int),
3142 		.mode		= 0644,
3143 		.proc_handler	= proc_dointvec,
3144 	},
3145 	{
3146 		.procname	= "redirect_number",
3147 		.data		= &ip_rt_redirect_number,
3148 		.maxlen		= sizeof(int),
3149 		.mode		= 0644,
3150 		.proc_handler	= proc_dointvec,
3151 	},
3152 	{
3153 		.procname	= "redirect_silence",
3154 		.data		= &ip_rt_redirect_silence,
3155 		.maxlen		= sizeof(int),
3156 		.mode		= 0644,
3157 		.proc_handler	= proc_dointvec,
3158 	},
3159 	{
3160 		.procname	= "error_cost",
3161 		.data		= &ip_rt_error_cost,
3162 		.maxlen		= sizeof(int),
3163 		.mode		= 0644,
3164 		.proc_handler	= proc_dointvec,
3165 	},
3166 	{
3167 		.procname	= "error_burst",
3168 		.data		= &ip_rt_error_burst,
3169 		.maxlen		= sizeof(int),
3170 		.mode		= 0644,
3171 		.proc_handler	= proc_dointvec,
3172 	},
3173 	{
3174 		.procname	= "gc_elasticity",
3175 		.data		= &ip_rt_gc_elasticity,
3176 		.maxlen		= sizeof(int),
3177 		.mode		= 0644,
3178 		.proc_handler	= proc_dointvec,
3179 	},
3180 	{
3181 		.procname	= "mtu_expires",
3182 		.data		= &ip_rt_mtu_expires,
3183 		.maxlen		= sizeof(int),
3184 		.mode		= 0644,
3185 		.proc_handler	= proc_dointvec_jiffies,
3186 	},
3187 	{
3188 		.procname	= "min_pmtu",
3189 		.data		= &ip_rt_min_pmtu,
3190 		.maxlen		= sizeof(int),
3191 		.mode		= 0644,
3192 		.proc_handler	= proc_dointvec,
3193 	},
3194 	{
3195 		.procname	= "min_adv_mss",
3196 		.data		= &ip_rt_min_advmss,
3197 		.maxlen		= sizeof(int),
3198 		.mode		= 0644,
3199 		.proc_handler	= proc_dointvec,
3200 	},
3201 	{ }
3202 };
3203 
3204 static struct ctl_table empty[1];
3205 
3206 static struct ctl_table ipv4_skeleton[] =
3207 {
3208 	{ .procname = "route",
3209 	  .mode = 0555, .child = ipv4_route_table},
3210 	{ .procname = "neigh",
3211 	  .mode = 0555, .child = empty},
3212 	{ }
3213 };
3214 
3215 static __net_initdata struct ctl_path ipv4_path[] = {
3216 	{ .procname = "net", },
3217 	{ .procname = "ipv4", },
3218 	{ },
3219 };
3220 
3221 static struct ctl_table ipv4_route_flush_table[] = {
3222 	{
3223 		.procname	= "flush",
3224 		.maxlen		= sizeof(int),
3225 		.mode		= 0200,
3226 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3227 	},
3228 	{ },
3229 };
3230 
3231 static __net_initdata struct ctl_path ipv4_route_path[] = {
3232 	{ .procname = "net", },
3233 	{ .procname = "ipv4", },
3234 	{ .procname = "route", },
3235 	{ },
3236 };
3237 
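/*
 * Per-namespace sysctl setup: namespaces other than init_net get their own
 * copy of the "flush" table so that each table's extra1 can point at the
 * right struct net.
 */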
3238 static __net_init int sysctl_route_net_init(struct net *net)
3239 {
3240 	struct ctl_table *tbl;
3241 
3242 	tbl = ipv4_route_flush_table;
3243 	if (!net_eq(net, &init_net)) {
3244 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3245 		if (tbl == NULL)
3246 			goto err_dup;
3247 	}
3248 	tbl[0].extra1 = net;
3249 
3250 	net->ipv4.route_hdr =
3251 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3252 	if (net->ipv4.route_hdr == NULL)
3253 		goto err_reg;
3254 	return 0;
3255 
3256 err_reg:
3257 	if (tbl != ipv4_route_flush_table)
3258 		kfree(tbl);
3259 err_dup:
3260 	return -ENOMEM;
3261 }
3262 
3263 static __net_exit void sysctl_route_net_exit(struct net *net)
3264 {
3265 	struct ctl_table *tbl;
3266 
3267 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3268 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3269 	BUG_ON(tbl == ipv4_route_flush_table);
3270 	kfree(tbl);
3271 }
3272 
3273 static __net_initdata struct pernet_operations sysctl_route_ops = {
3274 	.init = sysctl_route_net_init,
3275 	.exit = sysctl_route_net_exit,
3276 };
3277 #endif
3278 
3279 static __net_init int rt_genid_init(struct net *net)
3280 {
3281 	get_random_bytes(&net->ipv4.rt_genid,
3282 			 sizeof(net->ipv4.rt_genid));
3283 	return 0;
3284 }
3285 
3286 static __net_initdata struct pernet_operations rt_genid_ops = {
3287 	.init = rt_genid_init,
3288 };
3289 
3290 
3291 #ifdef CONFIG_NET_CLS_ROUTE
3292 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3293 #endif /* CONFIG_NET_CLS_ROUTE */
3294 
3295 static __initdata unsigned long rhash_entries;
3296 static int __init set_rhash_entries(char *str)
3297 {
3298 	if (!str)
3299 		return 0;
3300 	rhash_entries = simple_strtoul(str, &str, 0);
3301 	return 1;
3302 }
3303 __setup("rhash_entries=", set_rhash_entries);
3304 
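/*
 * Set up the routing subsystem at boot: create the dst slab cache and the
 * route cache hash table (sized from rhash_entries= if given), start the
 * deferred cache expiry work, and register the /proc, netlink and sysctl
 * interfaces.
 */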
3305 int __init ip_rt_init(void)
3306 {
3307 	int rc = 0;
3308 
3309 #ifdef CONFIG_NET_CLS_ROUTE
3310 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3311 	if (!ip_rt_acct)
3312 		panic("IP: failed to allocate ip_rt_acct\n");
3313 #endif
3314 
3315 	ipv4_dst_ops.kmem_cachep =
3316 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3317 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3318 
3319 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3320 
3321 	rt_hash_table = (struct rt_hash_bucket *)
3322 		alloc_large_system_hash("IP route cache",
3323 					sizeof(struct rt_hash_bucket),
3324 					rhash_entries,
3325 					(totalram_pages >= 128 * 1024) ?
3326 					15 : 17,
3327 					0,
3328 					&rt_hash_log,
3329 					&rt_hash_mask,
3330 					rhash_entries ? 0 : 512 * 1024);
3331 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3332 	rt_hash_lock_init();
3333 
3334 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3335 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3336 
3337 	devinet_init();
3338 	ip_fib_init();
3339 
3340 	/* All the timers started at system startup tend
3341 	   to synchronize. Perturb this a bit.
3342 	 */
3343 	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3344 	expires_ljiffies = jiffies;
3345 	schedule_delayed_work(&expires_work,
3346 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3347 
3348 	if (ip_rt_proc_init())
3349 		printk(KERN_ERR "Unable to create route proc files\n");
3350 #ifdef CONFIG_XFRM
3351 	xfrm_init();
3352 	xfrm4_init(ip_rt_max_size);
3353 #endif
3354 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3355 
3356 #ifdef CONFIG_SYSCTL
3357 	register_pernet_subsys(&sysctl_route_ops);
3358 #endif
3359 	register_pernet_subsys(&rt_genid_ops);
3360 	return rc;
3361 }
3362 
3363 #ifdef CONFIG_SYSCTL
3364 /*
3365  * We really need to sanitize the damn ipv4 init order, then all
3366  * this nonsense will go away.
3367  */
3368 void __init ip_static_sysctl_init(void)
3369 {
3370 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3371 }
3372 #endif
3373