xref: /linux/net/ipv4/route.c (revision 5bdef865eb358b6f3760e25e591ae115e9eeddef)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD;
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 
111 #define RT_FL_TOS(oldflp) \
112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113 
114 #define IP_MAX_MTU	0xFFF0
115 
116 #define RT_GC_TIMEOUT (300*HZ)
117 
118 static int ip_rt_max_size;
119 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
120 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
121 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
122 static int ip_rt_redirect_number __read_mostly	= 9;
123 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly	= HZ;
126 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
127 static int ip_rt_gc_elasticity __read_mostly	= 8;
128 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly	= 256;
131 static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
132 static int rt_chain_length_max __read_mostly	= 20;
133 
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136 
137 /*
138  *	Interface to generic destination cache.
139  */
140 
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void		 ipv4_dst_destroy(struct dst_entry *dst);
143 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
144 					 struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void		 ipv4_link_failure(struct sk_buff *skb);
147 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149 static void rt_emergency_hash_rebuild(struct net *net);
150 
151 
152 static struct dst_ops ipv4_dst_ops = {
153 	.family =		AF_INET,
154 	.protocol =		cpu_to_be16(ETH_P_IP),
155 	.gc =			rt_garbage_collect,
156 	.check =		ipv4_dst_check,
157 	.destroy =		ipv4_dst_destroy,
158 	.ifdown =		ipv4_dst_ifdown,
159 	.negative_advice =	ipv4_negative_advice,
160 	.link_failure =		ipv4_link_failure,
161 	.update_pmtu =		ip_rt_update_pmtu,
162 	.local_out =		__ip_local_out,
163 	.entries =		ATOMIC_INIT(0),
164 };
165 
166 #define ECN_OR_COST(class)	TC_PRIO_##class
167 
168 const __u8 ip_tos2prio[16] = {
169 	TC_PRIO_BESTEFFORT,
170 	ECN_OR_COST(FILLER),
171 	TC_PRIO_BESTEFFORT,
172 	ECN_OR_COST(BESTEFFORT),
173 	TC_PRIO_BULK,
174 	ECN_OR_COST(BULK),
175 	TC_PRIO_BULK,
176 	ECN_OR_COST(BULK),
177 	TC_PRIO_INTERACTIVE,
178 	ECN_OR_COST(INTERACTIVE),
179 	TC_PRIO_INTERACTIVE,
180 	ECN_OR_COST(INTERACTIVE),
181 	TC_PRIO_INTERACTIVE_BULK,
182 	ECN_OR_COST(INTERACTIVE_BULK),
183 	TC_PRIO_INTERACTIVE_BULK,
184 	ECN_OR_COST(INTERACTIVE_BULK)
185 };
186 
187 
188 /*
189  * Route cache.
190  */
191 
192 /* The locking scheme is rather straightforward:
193  *
194  * 1) Read-Copy Update protects the buckets of the central route hash.
195  * 2) Only writers remove entries, and they hold the lock
196  *    as they look at rtable reference counts.
197  * 3) Only readers acquire references to rtable entries;
198  *    they do so with atomic increments and with the
199  *    lock held.
200  */
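/*
 * Concretely (illustrative note; see rt_cache_get_first() and
 * rt_intern_hash() below): readers walk a bucket chain under RCU
 * (rcu_read_lock_bh() plus rcu_dereference()), while writers take the
 * per-bucket spin lock via rt_hash_lock_addr() and publish their
 * updates with rcu_assign_pointer().
 */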
201 
202 struct rt_hash_bucket {
203 	struct rtable	*chain;
204 };
205 
206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
207 	defined(CONFIG_PROVE_LOCKING)
208 /*
209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210  * The size of this table is a power of two and depends on the number of CPUs.
211  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
212  */
213 #ifdef CONFIG_LOCKDEP
214 # define RT_HASH_LOCK_SZ	256
215 #else
216 # if NR_CPUS >= 32
217 #  define RT_HASH_LOCK_SZ	4096
218 # elif NR_CPUS >= 16
219 #  define RT_HASH_LOCK_SZ	2048
220 # elif NR_CPUS >= 8
221 #  define RT_HASH_LOCK_SZ	1024
222 # elif NR_CPUS >= 4
223 #  define RT_HASH_LOCK_SZ	512
224 # else
225 #  define RT_HASH_LOCK_SZ	256
226 # endif
227 #endif
228 
229 static spinlock_t	*rt_hash_locks;
230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
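/*
 * Note: with fewer locks than hash buckets, rt_hash_lock_addr() simply
 * masks the bucket index, so several buckets may share one spin lock;
 * this trades a little contention for a much smaller lock table.
 */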
231 
232 static __init void rt_hash_lock_init(void)
233 {
234 	int i;
235 
236 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
237 			GFP_KERNEL);
238 	if (!rt_hash_locks)
239 		panic("IP: failed to allocate rt_hash_locks\n");
240 
241 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
242 		spin_lock_init(&rt_hash_locks[i]);
243 }
244 #else
245 # define rt_hash_lock_addr(slot) NULL
246 
247 static inline void rt_hash_lock_init(void)
248 {
249 }
250 #endif
251 
252 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
253 static unsigned			rt_hash_mask __read_mostly;
254 static unsigned int		rt_hash_log  __read_mostly;
255 
256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257 #define RT_CACHE_STAT_INC(field) \
258 	(__raw_get_cpu_var(rt_cache_stat).field++)
259 
260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 		int genid)
262 {
263 	return jhash_3words((__force u32)(__be32)(daddr),
264 			    (__force u32)(__be32)(saddr),
265 			    idx, genid)
266 		& rt_hash_mask;
267 }
268 
269 static inline int rt_genid(struct net *net)
270 {
271 	return atomic_read(&net->ipv4.rt_genid);
272 }
273 
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276 	struct seq_net_private p;
277 	int bucket;
278 	int genid;
279 };
280 
281 static struct rtable *rt_cache_get_first(struct seq_file *seq)
282 {
283 	struct rt_cache_iter_state *st = seq->private;
284 	struct rtable *r = NULL;
285 
286 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
287 		if (!rt_hash_table[st->bucket].chain)
288 			continue;
289 		rcu_read_lock_bh();
290 		r = rcu_dereference(rt_hash_table[st->bucket].chain);
291 		while (r) {
292 			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
293 			    r->rt_genid == st->genid)
294 				return r;
295 			r = rcu_dereference(r->u.dst.rt_next);
296 		}
297 		rcu_read_unlock_bh();
298 	}
299 	return r;
300 }
301 
302 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
303 					  struct rtable *r)
304 {
305 	struct rt_cache_iter_state *st = seq->private;
306 
307 	r = r->u.dst.rt_next;
308 	while (!r) {
309 		rcu_read_unlock_bh();
310 		do {
311 			if (--st->bucket < 0)
312 				return NULL;
313 		} while (!rt_hash_table[st->bucket].chain);
314 		rcu_read_lock_bh();
315 		r = rt_hash_table[st->bucket].chain;
316 	}
317 	return rcu_dereference(r);
318 }
319 
320 static struct rtable *rt_cache_get_next(struct seq_file *seq,
321 					struct rtable *r)
322 {
323 	struct rt_cache_iter_state *st = seq->private;
324 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
325 		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
326 			continue;
327 		if (r->rt_genid == st->genid)
328 			break;
329 	}
330 	return r;
331 }
332 
333 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
334 {
335 	struct rtable *r = rt_cache_get_first(seq);
336 
337 	if (r)
338 		while (pos && (r = rt_cache_get_next(seq, r)))
339 			--pos;
340 	return pos ? NULL : r;
341 }
342 
343 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
344 {
345 	struct rt_cache_iter_state *st = seq->private;
346 	if (*pos)
347 		return rt_cache_get_idx(seq, *pos - 1);
348 	st->genid = rt_genid(seq_file_net(seq));
349 	return SEQ_START_TOKEN;
350 }
351 
352 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
353 {
354 	struct rtable *r;
355 
356 	if (v == SEQ_START_TOKEN)
357 		r = rt_cache_get_first(seq);
358 	else
359 		r = rt_cache_get_next(seq, v);
360 	++*pos;
361 	return r;
362 }
363 
364 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
365 {
366 	if (v && v != SEQ_START_TOKEN)
367 		rcu_read_unlock_bh();
368 }
369 
370 static int rt_cache_seq_show(struct seq_file *seq, void *v)
371 {
372 	if (v == SEQ_START_TOKEN)
373 		seq_printf(seq, "%-127s\n",
374 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
375 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
376 			   "HHUptod\tSpecDst");
377 	else {
378 		struct rtable *r = v;
379 		int len;
380 
381 		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
382 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383 			r->u.dst.dev ? r->u.dst.dev->name : "*",
384 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
385 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
386 			r->u.dst.__use, 0, (unsigned long)r->rt_src,
387 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389 			dst_metric(&r->u.dst, RTAX_WINDOW),
390 			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391 			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
392 			r->fl.fl4_tos,
393 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394 			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395 				       dev_queue_xmit) : 0,
396 			r->rt_spec_dst, &len);
397 
398 		seq_printf(seq, "%*s\n", 127 - len, "");
399 	}
400 	return 0;
401 }
402 
403 static const struct seq_operations rt_cache_seq_ops = {
404 	.start  = rt_cache_seq_start,
405 	.next   = rt_cache_seq_next,
406 	.stop   = rt_cache_seq_stop,
407 	.show   = rt_cache_seq_show,
408 };
409 
410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
411 {
412 	return seq_open_net(inode, file, &rt_cache_seq_ops,
413 			sizeof(struct rt_cache_iter_state));
414 }
415 
416 static const struct file_operations rt_cache_seq_fops = {
417 	.owner	 = THIS_MODULE,
418 	.open	 = rt_cache_seq_open,
419 	.read	 = seq_read,
420 	.llseek	 = seq_lseek,
421 	.release = seq_release_net,
422 };
423 
424 
425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
426 {
427 	int cpu;
428 
429 	if (*pos == 0)
430 		return SEQ_START_TOKEN;
431 
432 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
433 		if (!cpu_possible(cpu))
434 			continue;
435 		*pos = cpu+1;
436 		return &per_cpu(rt_cache_stat, cpu);
437 	}
438 	return NULL;
439 }
440 
441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
442 {
443 	int cpu;
444 
445 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
446 		if (!cpu_possible(cpu))
447 			continue;
448 		*pos = cpu+1;
449 		return &per_cpu(rt_cache_stat, cpu);
450 	}
451 	return NULL;
452 
453 }
454 
455 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
456 {
457 
458 }
459 
460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
461 {
462 	struct rt_cache_stat *st = v;
463 
464 	if (v == SEQ_START_TOKEN) {
465 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
466 		return 0;
467 	}
468 
469 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
470 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 		   atomic_read(&ipv4_dst_ops.entries),
472 		   st->in_hit,
473 		   st->in_slow_tot,
474 		   st->in_slow_mc,
475 		   st->in_no_route,
476 		   st->in_brd,
477 		   st->in_martian_dst,
478 		   st->in_martian_src,
479 
480 		   st->out_hit,
481 		   st->out_slow_tot,
482 		   st->out_slow_mc,
483 
484 		   st->gc_total,
485 		   st->gc_ignored,
486 		   st->gc_goal_miss,
487 		   st->gc_dst_overflow,
488 		   st->in_hlist_search,
489 		   st->out_hlist_search
490 		);
491 	return 0;
492 }
493 
494 static const struct seq_operations rt_cpu_seq_ops = {
495 	.start  = rt_cpu_seq_start,
496 	.next   = rt_cpu_seq_next,
497 	.stop   = rt_cpu_seq_stop,
498 	.show   = rt_cpu_seq_show,
499 };
500 
501 
502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
503 {
504 	return seq_open(file, &rt_cpu_seq_ops);
505 }
506 
507 static const struct file_operations rt_cpu_seq_fops = {
508 	.owner	 = THIS_MODULE,
509 	.open	 = rt_cpu_seq_open,
510 	.read	 = seq_read,
511 	.llseek	 = seq_lseek,
512 	.release = seq_release,
513 };
514 
515 #ifdef CONFIG_NET_CLS_ROUTE
516 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
517 			   int length, int *eof, void *data)
518 {
519 	unsigned int i;
520 
521 	if ((offset & 3) || (length & 3))
522 		return -EIO;
523 
524 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
525 		*eof = 1;
526 		return 0;
527 	}
528 
529 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
530 		length = sizeof(struct ip_rt_acct) * 256 - offset;
531 		*eof = 1;
532 	}
533 
534 	offset /= sizeof(u32);
535 
536 	if (length > 0) {
537 		u32 *dst = (u32 *) buffer;
538 
539 		*start = buffer;
540 		memset(dst, 0, length);
541 
542 		for_each_possible_cpu(i) {
543 			unsigned int j;
544 			u32 *src;
545 
546 			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
547 			for (j = 0; j < length/4; j++)
548 				dst[j] += src[j];
549 		}
550 	}
551 	return length;
552 }
553 #endif
554 
555 static int __net_init ip_rt_do_proc_init(struct net *net)
556 {
557 	struct proc_dir_entry *pde;
558 
559 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
560 			&rt_cache_seq_fops);
561 	if (!pde)
562 		goto err1;
563 
564 	pde = proc_create("rt_cache", S_IRUGO,
565 			  net->proc_net_stat, &rt_cpu_seq_fops);
566 	if (!pde)
567 		goto err2;
568 
569 #ifdef CONFIG_NET_CLS_ROUTE
570 	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
571 			ip_rt_acct_read, NULL);
572 	if (!pde)
573 		goto err3;
574 #endif
575 	return 0;
576 
577 #ifdef CONFIG_NET_CLS_ROUTE
578 err3:
579 	remove_proc_entry("rt_cache", net->proc_net_stat);
580 #endif
581 err2:
582 	remove_proc_entry("rt_cache", net->proc_net);
583 err1:
584 	return -ENOMEM;
585 }
586 
587 static void __net_exit ip_rt_do_proc_exit(struct net *net)
588 {
589 	remove_proc_entry("rt_cache", net->proc_net_stat);
590 	remove_proc_entry("rt_cache", net->proc_net);
591 	remove_proc_entry("rt_acct", net->proc_net);
592 }
593 
594 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
595 	.init = ip_rt_do_proc_init,
596 	.exit = ip_rt_do_proc_exit,
597 };
598 
599 static int __init ip_rt_proc_init(void)
600 {
601 	return register_pernet_subsys(&ip_rt_proc_ops);
602 }
603 
604 #else
605 static inline int ip_rt_proc_init(void)
606 {
607 	return 0;
608 }
609 #endif /* CONFIG_PROC_FS */
610 
611 static inline void rt_free(struct rtable *rt)
612 {
613 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
614 }
615 
616 static inline void rt_drop(struct rtable *rt)
617 {
618 	ip_rt_put(rt);
619 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
620 }
621 
622 static inline int rt_fast_clean(struct rtable *rth)
623 {
624 	/* Kill broadcast/multicast entries very aggressively, if they
625 	   collide in the hash table with more useful entries */
626 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
627 		rth->fl.iif && rth->u.dst.rt_next;
628 }
629 
630 static inline int rt_valuable(struct rtable *rth)
631 {
632 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
633 		rth->u.dst.expires;
634 }
635 
636 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
637 {
638 	unsigned long age;
639 	int ret = 0;
640 
641 	if (atomic_read(&rth->u.dst.__refcnt))
642 		goto out;
643 
644 	ret = 1;
645 	if (rth->u.dst.expires &&
646 	    time_after_eq(jiffies, rth->u.dst.expires))
647 		goto out;
648 
649 	age = jiffies - rth->u.dst.lastuse;
650 	ret = 0;
651 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652 	    (age <= tmo2 && rt_valuable(rth)))
653 		goto out;
654 	ret = 1;
655 out:	return ret;
656 }
657 
658 /* Bits of score are:
659  * 31: very valuable
660  * 30: not quite useless
661  * 29..0: usage counter
662  */
663 static inline u32 rt_score(struct rtable *rt)
664 {
665 	u32 score = jiffies - rt->u.dst.lastuse;
666 
667 	score = ~score & ~(3<<30);
668 
669 	if (rt_valuable(rt))
670 		score |= (1<<31);
671 
672 	if (!rt->fl.iif ||
673 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
674 		score |= (1<<30);
675 
676 	return score;
677 }
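/*
 * Illustrative note: rt_intern_hash() below evicts the unreferenced entry
 * with the lowest score when a chain grows too long, so old input
 * broadcast/multicast entries tend to go first, while recently used,
 * "valuable" output routes survive.
 */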
678 
679 static inline bool rt_caching(const struct net *net)
680 {
681 	return net->ipv4.current_rt_cache_rebuild_count <=
682 		net->ipv4.sysctl_rt_cache_rebuild_count;
683 }
684 
685 static inline bool compare_hash_inputs(const struct flowi *fl1,
686 					const struct flowi *fl2)
687 {
688 	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
689 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
690 		(fl1->iif ^ fl2->iif)) == 0);
691 }
692 
693 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
694 {
695 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
696 		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
697 		(fl1->mark ^ fl2->mark) |
698 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
699 		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
700 		(fl1->oif ^ fl2->oif) |
701 		(fl1->iif ^ fl2->iif)) == 0;
702 }
703 
704 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
705 {
706 	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
707 }
708 
709 static inline int rt_is_expired(struct rtable *rth)
710 {
711 	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
712 }
713 
714 /*
715  * Perform a full scan of the hash table and free all entries.
716  * Can be called from a softirq or from process context.
717  * In the latter case, we want to reschedule if necessary.
718  */
719 static void rt_do_flush(int process_context)
720 {
721 	unsigned int i;
722 	struct rtable *rth, *next;
723 	struct rtable * tail;
724 
725 	for (i = 0; i <= rt_hash_mask; i++) {
726 		if (process_context && need_resched())
727 			cond_resched();
728 		rth = rt_hash_table[i].chain;
729 		if (!rth)
730 			continue;
731 
732 		spin_lock_bh(rt_hash_lock_addr(i));
733 #ifdef CONFIG_NET_NS
734 		{
735 		struct rtable ** prev, * p;
736 
737 		rth = rt_hash_table[i].chain;
738 
739 		/* defer releasing the head of the list until after spin_unlock */
740 		for (tail = rth; tail; tail = tail->u.dst.rt_next)
741 			if (!rt_is_expired(tail))
742 				break;
743 		if (rth != tail)
744 			rt_hash_table[i].chain = tail;
745 
746 		/* call rt_free on entries after the tail requiring flush */
747 		prev = &rt_hash_table[i].chain;
748 		for (p = *prev; p; p = next) {
749 			next = p->u.dst.rt_next;
750 			if (!rt_is_expired(p)) {
751 				prev = &p->u.dst.rt_next;
752 			} else {
753 				*prev = next;
754 				rt_free(p);
755 			}
756 		}
757 		}
758 #else
759 		rth = rt_hash_table[i].chain;
760 		rt_hash_table[i].chain = NULL;
761 		tail = NULL;
762 #endif
763 		spin_unlock_bh(rt_hash_lock_addr(i));
764 
765 		for (; rth != tail; rth = next) {
766 			next = rth->u.dst.rt_next;
767 			rt_free(rth);
768 		}
769 	}
770 }
771 
772 /*
773  * While freeing expired entries, we compute average chain length
774  * and standard deviation, using fixed-point arithmetic.
775  * This gives an estimate of rt_chain_length_max:
776  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
777  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
778  */
779 
780 #define FRACT_BITS 3
781 #define ONE (1UL << FRACT_BITS)
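/*
 * Worked example (using the defaults above): with FRACT_BITS = 3 we have
 * ONE = 8, so a bucket holding five distinct-key entries contributes
 * length = 5 * ONE = 40.  If a scan measures avg = 40 (5.0) and sd = 16
 * (2.0) in these units, then
 *   rt_chain_length_max = max(ip_rt_gc_elasticity, (40 + 4*16) >> 3) = 13.
 */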
782 
783 static void rt_check_expire(void)
784 {
785 	static unsigned int rover;
786 	unsigned int i = rover, goal;
787 	struct rtable *rth, *aux, **rthp;
788 	unsigned long samples = 0;
789 	unsigned long sum = 0, sum2 = 0;
790 	unsigned long delta;
791 	u64 mult;
792 
793 	delta = jiffies - expires_ljiffies;
794 	expires_ljiffies = jiffies;
795 	mult = ((u64)delta) << rt_hash_log;
796 	if (ip_rt_gc_timeout > 1)
797 		do_div(mult, ip_rt_gc_timeout);
798 	goal = (unsigned int)mult;
799 	if (goal > rt_hash_mask)
800 		goal = rt_hash_mask + 1;
801 	for (; goal > 0; goal--) {
802 		unsigned long tmo = ip_rt_gc_timeout;
803 		unsigned long length;
804 
805 		i = (i + 1) & rt_hash_mask;
806 		rthp = &rt_hash_table[i].chain;
807 
808 		if (need_resched())
809 			cond_resched();
810 
811 		samples++;
812 
813 		if (*rthp == NULL)
814 			continue;
815 		length = 0;
816 		spin_lock_bh(rt_hash_lock_addr(i));
817 		while ((rth = *rthp) != NULL) {
818 			prefetch(rth->u.dst.rt_next);
819 			if (rt_is_expired(rth)) {
820 				*rthp = rth->u.dst.rt_next;
821 				rt_free(rth);
822 				continue;
823 			}
824 			if (rth->u.dst.expires) {
825 				/* Entry is expired even if it is in use */
826 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
827 nofree:
828 					tmo >>= 1;
829 					rthp = &rth->u.dst.rt_next;
830 					/*
831 					 * We only count entries on
832 					 * a chain with equal hash inputs once
833 					 * so that entries for different QOS
834 					 * levels, and other non-hash input
835 					 * attributes don't unfairly skew
836 					 * the length computation
837 					 */
838 					for (aux = rt_hash_table[i].chain;;) {
839 						if (aux == rth) {
840 							length += ONE;
841 							break;
842 						}
843 						if (compare_hash_inputs(&aux->fl, &rth->fl))
844 							break;
845 						aux = aux->u.dst.rt_next;
846 					}
847 					continue;
848 				}
849 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
850 				goto nofree;
851 
852 			/* Cleanup aged off entries. */
853 			*rthp = rth->u.dst.rt_next;
854 			rt_free(rth);
855 		}
856 		spin_unlock_bh(rt_hash_lock_addr(i));
857 		sum += length;
858 		sum2 += length*length;
859 	}
860 	if (samples) {
861 		unsigned long avg = sum / samples;
862 		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
863 		rt_chain_length_max = max_t(unsigned long,
864 					ip_rt_gc_elasticity,
865 					(avg + 4*sd) >> FRACT_BITS);
866 	}
867 	rover = i;
868 }
869 
870 /*
871  * rt_worker_func() is run in process context.
872  * We call rt_check_expire() to scan part of the hash table.
873  */
874 static void rt_worker_func(struct work_struct *work)
875 {
876 	rt_check_expire();
877 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
878 }
879 
880 /*
881  * Perturbation of rt_genid by a small quantity [1..256].
882  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
883  * many times (2^24) without reusing a recent rt_genid.
884  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
885  */
886 static void rt_cache_invalidate(struct net *net)
887 {
888 	unsigned char shuffle;
889 
890 	get_random_bytes(&shuffle, sizeof(shuffle));
891 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
892 }
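/*
 * Illustrative note: every cached rtable stores the rt_genid that was
 * current when it was created.  Lookups made after an invalidation hash
 * with the new genid and skip entries for which rt_is_expired() is true,
 * so stale routes stop being returned immediately; the entries themselves
 * are reaped lazily by rt_check_expire(), rt_intern_hash() and the
 * garbage collector.
 */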
893 
894 /*
895  * delay < 0  : invalidate cache (fast : entries will be deleted later)
896  * delay >= 0 : invalidate & flush cache (can be long)
897  */
898 void rt_cache_flush(struct net *net, int delay)
899 {
900 	rt_cache_invalidate(net);
901 	if (delay >= 0)
902 		rt_do_flush(!in_softirq());
903 }
904 
905 /*
906  * We change rt_genid and let gc do the cleanup
907  */
908 static void rt_secret_rebuild(unsigned long __net)
909 {
910 	struct net *net = (struct net *)__net;
911 	rt_cache_invalidate(net);
912 	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
913 }
914 
915 static void rt_secret_rebuild_oneshot(struct net *net)
916 {
917 	del_timer_sync(&net->ipv4.rt_secret_timer);
918 	rt_cache_invalidate(net);
919 	if (ip_rt_secret_interval) {
920 		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
921 		add_timer(&net->ipv4.rt_secret_timer);
922 	}
923 }
924 
925 static void rt_emergency_hash_rebuild(struct net *net)
926 {
927 	if (net_ratelimit()) {
928 		printk(KERN_WARNING "Route hash chain too long!\n");
929 		printk(KERN_WARNING "Adjust your secret_interval!\n");
930 	}
931 
932 	rt_secret_rebuild_oneshot(net);
933 }
934 
935 /*
936    Short description of GC goals.
937 
938    We want to build an algorithm which keeps the routing cache
939    at some equilibrium point, where the number of aged-off entries
940    stays approximately equal to the number of newly generated ones.
941 
942    The current expiration strength is the variable "expire".
943    We try to adjust it dynamically, so that when the network
944    is idle, expire is large enough to keep enough warm entries,
945    and when load increases it shrinks to limit the cache size.
946  */
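/*
 * For instance, with the default ip_rt_gc_elasticity of 8 the first goal
 * computation below aims at an average of roughly 8 cached routes per
 * hash bucket (8 << rt_hash_log entries in total); only the excess above
 * that is considered for immediate expiry.
 */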
947 
948 static int rt_garbage_collect(struct dst_ops *ops)
949 {
950 	static unsigned long expire = RT_GC_TIMEOUT;
951 	static unsigned long last_gc;
952 	static int rover;
953 	static int equilibrium;
954 	struct rtable *rth, **rthp;
955 	unsigned long now = jiffies;
956 	int goal;
957 
958 	/*
959 	 * Garbage collection is pretty expensive,
960 	 * do not run it too frequently.
961 	 */
962 
963 	RT_CACHE_STAT_INC(gc_total);
964 
965 	if (now - last_gc < ip_rt_gc_min_interval &&
966 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
967 		RT_CACHE_STAT_INC(gc_ignored);
968 		goto out;
969 	}
970 
971 	/* Calculate the number of entries which we want to expire now. */
972 	goal = atomic_read(&ipv4_dst_ops.entries) -
973 		(ip_rt_gc_elasticity << rt_hash_log);
974 	if (goal <= 0) {
975 		if (equilibrium < ipv4_dst_ops.gc_thresh)
976 			equilibrium = ipv4_dst_ops.gc_thresh;
977 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
978 		if (goal > 0) {
979 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
980 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
981 		}
982 	} else {
983 		/* We are in a dangerous area. Try to reduce the cache really
984 		 * aggressively.
985 		 */
986 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
987 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
988 	}
989 
990 	if (now - last_gc >= ip_rt_gc_min_interval)
991 		last_gc = now;
992 
993 	if (goal <= 0) {
994 		equilibrium += goal;
995 		goto work_done;
996 	}
997 
998 	do {
999 		int i, k;
1000 
1001 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1002 			unsigned long tmo = expire;
1003 
1004 			k = (k + 1) & rt_hash_mask;
1005 			rthp = &rt_hash_table[k].chain;
1006 			spin_lock_bh(rt_hash_lock_addr(k));
1007 			while ((rth = *rthp) != NULL) {
1008 				if (!rt_is_expired(rth) &&
1009 					!rt_may_expire(rth, tmo, expire)) {
1010 					tmo >>= 1;
1011 					rthp = &rth->u.dst.rt_next;
1012 					continue;
1013 				}
1014 				*rthp = rth->u.dst.rt_next;
1015 				rt_free(rth);
1016 				goal--;
1017 			}
1018 			spin_unlock_bh(rt_hash_lock_addr(k));
1019 			if (goal <= 0)
1020 				break;
1021 		}
1022 		rover = k;
1023 
1024 		if (goal <= 0)
1025 			goto work_done;
1026 
1027 		/* Goal is not achieved. We stop the process if:
1028 
1029 		   - expire is reduced to zero (otherwise, expire is halved);
1030 		   - the table is not full;
1031 		   - we are called from interrupt context;
1032 		   - the jiffies check is just a fallback/debug loop breaker.
1033 		     We will not spin here for a long time in any case.
1034 		 */
1035 
1036 		RT_CACHE_STAT_INC(gc_goal_miss);
1037 
1038 		if (expire == 0)
1039 			break;
1040 
1041 		expire >>= 1;
1042 #if RT_CACHE_DEBUG >= 2
1043 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1044 				atomic_read(&ipv4_dst_ops.entries), goal, i);
1045 #endif
1046 
1047 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1048 			goto out;
1049 	} while (!in_softirq() && time_before_eq(jiffies, now));
1050 
1051 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1052 		goto out;
1053 	if (net_ratelimit())
1054 		printk(KERN_WARNING "dst cache overflow\n");
1055 	RT_CACHE_STAT_INC(gc_dst_overflow);
1056 	return 1;
1057 
1058 work_done:
1059 	expire += ip_rt_gc_min_interval;
1060 	if (expire > ip_rt_gc_timeout ||
1061 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1062 		expire = ip_rt_gc_timeout;
1063 #if RT_CACHE_DEBUG >= 2
1064 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1065 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
1066 #endif
1067 out:	return 0;
1068 }
1069 
1070 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1071 			  struct rtable **rp, struct sk_buff *skb)
1072 {
1073 	struct rtable	*rth, **rthp;
1074 	unsigned long	now;
1075 	struct rtable *cand, **candp;
1076 	u32 		min_score;
1077 	int		chain_length;
1078 	int attempts = !in_softirq();
1079 
1080 restart:
1081 	chain_length = 0;
1082 	min_score = ~(u32)0;
1083 	cand = NULL;
1084 	candp = NULL;
1085 	now = jiffies;
1086 
1087 	if (!rt_caching(dev_net(rt->u.dst.dev))) {
1088 		/*
1089 		 * If we're not caching, just tell the caller we
1090 		 * were successful and don't touch the route.  The
1091 		 * caller holds the sole reference to the cache entry, and
1092 		 * it will be released when the caller is done with it.
1093 		 * If we drop it here, the callers have no way to resolve routes
1094 		 * when we're not caching.  Instead, just point *rp at rt, so
1095 		 * the caller gets a single use out of the route.
1096 		 * Note that we do rt_free on this new route entry, so that
1097 		 * once its refcount hits zero, we are still able to reap it
1098 		 * (Thanks Alexey)
1099 		 * Note also that rt_free uses call_rcu.  We don't actually
1100 		 * need RCU protection here; this is just our path to get
1101 		 * on the route gc list.
1102 		 */
1103 
1104 		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1105 			int err = arp_bind_neighbour(&rt->u.dst);
1106 			if (err) {
1107 				if (net_ratelimit())
1108 					printk(KERN_WARNING
1109 					    "Neighbour table failure & not caching routes.\n");
1110 				rt_drop(rt);
1111 				return err;
1112 			}
1113 		}
1114 
1115 		rt_free(rt);
1116 		goto skip_hashing;
1117 	}
1118 
1119 	rthp = &rt_hash_table[hash].chain;
1120 
1121 	spin_lock_bh(rt_hash_lock_addr(hash));
1122 	while ((rth = *rthp) != NULL) {
1123 		if (rt_is_expired(rth)) {
1124 			*rthp = rth->u.dst.rt_next;
1125 			rt_free(rth);
1126 			continue;
1127 		}
1128 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1129 			/* Put it first */
1130 			*rthp = rth->u.dst.rt_next;
1131 			/*
1132 			 * Since lookup is lockfree, the deletion
1133 			 * must be visible to another weakly ordered CPU before
1134 			 * the insertion at the start of the hash chain.
1135 			 */
1136 			rcu_assign_pointer(rth->u.dst.rt_next,
1137 					   rt_hash_table[hash].chain);
1138 			/*
1139 			 * Since lookup is lockfree, the update writes
1140 			 * must be ordered for consistency on SMP.
1141 			 */
1142 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1143 
1144 			dst_use(&rth->u.dst, now);
1145 			spin_unlock_bh(rt_hash_lock_addr(hash));
1146 
1147 			rt_drop(rt);
1148 			if (rp)
1149 				*rp = rth;
1150 			else
1151 				skb_dst_set(skb, &rth->u.dst);
1152 			return 0;
1153 		}
1154 
1155 		if (!atomic_read(&rth->u.dst.__refcnt)) {
1156 			u32 score = rt_score(rth);
1157 
1158 			if (score <= min_score) {
1159 				cand = rth;
1160 				candp = rthp;
1161 				min_score = score;
1162 			}
1163 		}
1164 
1165 		chain_length++;
1166 
1167 		rthp = &rth->u.dst.rt_next;
1168 	}
1169 
1170 	if (cand) {
1171 		/* ip_rt_gc_elasticity used to be the average chain
1172 		 * length; when exceeded, gc becomes really aggressive.
1173 		 *
1174 		 * The second limit is less certain. At the moment it allows
1175 		 * only 2 entries per bucket. We will see.
1176 		 */
1177 		if (chain_length > ip_rt_gc_elasticity) {
1178 			*candp = cand->u.dst.rt_next;
1179 			rt_free(cand);
1180 		}
1181 	} else {
1182 		if (chain_length > rt_chain_length_max) {
1183 			struct net *net = dev_net(rt->u.dst.dev);
1184 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1185 			if (!rt_caching(dev_net(rt->u.dst.dev))) {
1186 				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1187 					rt->u.dst.dev->name, num);
1188 			}
1189 			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1190 		}
1191 	}
1192 
1193 	/* Try to bind the route to ARP only if it is an output
1194 	   route or on the unicast forwarding path.
1195 	 */
1196 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1197 		int err = arp_bind_neighbour(&rt->u.dst);
1198 		if (err) {
1199 			spin_unlock_bh(rt_hash_lock_addr(hash));
1200 
1201 			if (err != -ENOBUFS) {
1202 				rt_drop(rt);
1203 				return err;
1204 			}
1205 
1206 			/* Neighbour tables are full and nothing
1207 			   can be released. Try to shrink the route cache;
1208 			   most likely it holds some neighbour records.
1209 			 */
1210 			if (attempts-- > 0) {
1211 				int saved_elasticity = ip_rt_gc_elasticity;
1212 				int saved_int = ip_rt_gc_min_interval;
1213 				ip_rt_gc_elasticity	= 1;
1214 				ip_rt_gc_min_interval	= 0;
1215 				rt_garbage_collect(&ipv4_dst_ops);
1216 				ip_rt_gc_min_interval	= saved_int;
1217 				ip_rt_gc_elasticity	= saved_elasticity;
1218 				goto restart;
1219 			}
1220 
1221 			if (net_ratelimit())
1222 				printk(KERN_WARNING "Neighbour table overflow.\n");
1223 			rt_drop(rt);
1224 			return -ENOBUFS;
1225 		}
1226 	}
1227 
1228 	rt->u.dst.rt_next = rt_hash_table[hash].chain;
1229 
1230 #if RT_CACHE_DEBUG >= 2
1231 	if (rt->u.dst.rt_next) {
1232 		struct rtable *trt;
1233 		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1234 		       hash, &rt->rt_dst);
1235 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1236 			printk(" . %pI4", &trt->rt_dst);
1237 		printk("\n");
1238 	}
1239 #endif
1240 	/*
1241 	 * Since lookup is lockfree, we must make sure
1242 	 * previous writes to rt are committed to memory
1243 	 * before making rt visible to other CPUs.
1244 	 */
1245 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1246 
1247 	spin_unlock_bh(rt_hash_lock_addr(hash));
1248 
1249 skip_hashing:
1250 	if (rp)
1251 		*rp = rt;
1252 	else
1253 		skb_dst_set(skb, &rt->u.dst);
1254 	return 0;
1255 }
1256 
1257 void rt_bind_peer(struct rtable *rt, int create)
1258 {
1259 	static DEFINE_SPINLOCK(rt_peer_lock);
1260 	struct inet_peer *peer;
1261 
1262 	peer = inet_getpeer(rt->rt_dst, create);
1263 
1264 	spin_lock_bh(&rt_peer_lock);
1265 	if (rt->peer == NULL) {
1266 		rt->peer = peer;
1267 		peer = NULL;
1268 	}
1269 	spin_unlock_bh(&rt_peer_lock);
1270 	if (peer)
1271 		inet_putpeer(peer);
1272 }
1273 
1274 /*
1275  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1276  * we can still generate some output.
1277  * Random ID selection looks a bit dangerous because we have no chance of
1278  * selecting an ID that is unique within a reasonable period of time.
1279  * But a broken packet identifier may be better than no packet at all.
1280  */
1281 static void ip_select_fb_ident(struct iphdr *iph)
1282 {
1283 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1284 	static u32 ip_fallback_id;
1285 	u32 salt;
1286 
1287 	spin_lock_bh(&ip_fb_id_lock);
1288 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1289 	iph->id = htons(salt & 0xFFFF);
1290 	ip_fallback_id = salt;
1291 	spin_unlock_bh(&ip_fb_id_lock);
1292 }
1293 
1294 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1295 {
1296 	struct rtable *rt = (struct rtable *) dst;
1297 
1298 	if (rt) {
1299 		if (rt->peer == NULL)
1300 			rt_bind_peer(rt, 1);
1301 
1302 		/* If peer is attached to destination, it is never detached,
1303 		   so we need not grab a lock to dereference it.
1304 		 */
1305 		if (rt->peer) {
1306 			iph->id = htons(inet_getid(rt->peer, more));
1307 			return;
1308 		}
1309 	} else
1310 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1311 		       __builtin_return_address(0));
1312 
1313 	ip_select_fb_ident(iph);
1314 }
1315 
1316 static void rt_del(unsigned hash, struct rtable *rt)
1317 {
1318 	struct rtable **rthp, *aux;
1319 
1320 	rthp = &rt_hash_table[hash].chain;
1321 	spin_lock_bh(rt_hash_lock_addr(hash));
1322 	ip_rt_put(rt);
1323 	while ((aux = *rthp) != NULL) {
1324 		if (aux == rt || rt_is_expired(aux)) {
1325 			*rthp = aux->u.dst.rt_next;
1326 			rt_free(aux);
1327 			continue;
1328 		}
1329 		rthp = &aux->u.dst.rt_next;
1330 	}
1331 	spin_unlock_bh(rt_hash_lock_addr(hash));
1332 }
1333 
1334 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1335 		    __be32 saddr, struct net_device *dev)
1336 {
1337 	int i, k;
1338 	struct in_device *in_dev = in_dev_get(dev);
1339 	struct rtable *rth, **rthp;
1340 	__be32  skeys[2] = { saddr, 0 };
1341 	int  ikeys[2] = { dev->ifindex, 0 };
1342 	struct netevent_redirect netevent;
1343 	struct net *net;
1344 
1345 	if (!in_dev)
1346 		return;
1347 
1348 	net = dev_net(dev);
1349 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1350 	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1351 	    || ipv4_is_zeronet(new_gw))
1352 		goto reject_redirect;
1353 
1354 	if (!rt_caching(net))
1355 		goto reject_redirect;
1356 
1357 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1358 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1359 			goto reject_redirect;
1360 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1361 			goto reject_redirect;
1362 	} else {
1363 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1364 			goto reject_redirect;
1365 	}
1366 
1367 	for (i = 0; i < 2; i++) {
1368 		for (k = 0; k < 2; k++) {
1369 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1370 						rt_genid(net));
1371 
1372 			rthp=&rt_hash_table[hash].chain;
1373 
1374 			rcu_read_lock();
1375 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1376 				struct rtable *rt;
1377 
1378 				if (rth->fl.fl4_dst != daddr ||
1379 				    rth->fl.fl4_src != skeys[i] ||
1380 				    rth->fl.oif != ikeys[k] ||
1381 				    rth->fl.iif != 0 ||
1382 				    rt_is_expired(rth) ||
1383 				    !net_eq(dev_net(rth->u.dst.dev), net)) {
1384 					rthp = &rth->u.dst.rt_next;
1385 					continue;
1386 				}
1387 
1388 				if (rth->rt_dst != daddr ||
1389 				    rth->rt_src != saddr ||
1390 				    rth->u.dst.error ||
1391 				    rth->rt_gateway != old_gw ||
1392 				    rth->u.dst.dev != dev)
1393 					break;
1394 
1395 				dst_hold(&rth->u.dst);
1396 				rcu_read_unlock();
1397 
1398 				rt = dst_alloc(&ipv4_dst_ops);
1399 				if (rt == NULL) {
1400 					ip_rt_put(rth);
1401 					in_dev_put(in_dev);
1402 					return;
1403 				}
1404 
1405 				/* Copy all the information. */
1406 				*rt = *rth;
1407 				rt->u.dst.__use		= 1;
1408 				atomic_set(&rt->u.dst.__refcnt, 1);
1409 				rt->u.dst.child		= NULL;
1410 				if (rt->u.dst.dev)
1411 					dev_hold(rt->u.dst.dev);
1412 				if (rt->idev)
1413 					in_dev_hold(rt->idev);
1414 				rt->u.dst.obsolete	= 0;
1415 				rt->u.dst.lastuse	= jiffies;
1416 				rt->u.dst.path		= &rt->u.dst;
1417 				rt->u.dst.neighbour	= NULL;
1418 				rt->u.dst.hh		= NULL;
1419 #ifdef CONFIG_XFRM
1420 				rt->u.dst.xfrm		= NULL;
1421 #endif
1422 				rt->rt_genid		= rt_genid(net);
1423 				rt->rt_flags		|= RTCF_REDIRECTED;
1424 
1425 				/* Gateway is different ... */
1426 				rt->rt_gateway		= new_gw;
1427 
1428 				/* Redirect received -> path was valid */
1429 				dst_confirm(&rth->u.dst);
1430 
1431 				if (rt->peer)
1432 					atomic_inc(&rt->peer->refcnt);
1433 
1434 				if (arp_bind_neighbour(&rt->u.dst) ||
1435 				    !(rt->u.dst.neighbour->nud_state &
1436 					    NUD_VALID)) {
1437 					if (rt->u.dst.neighbour)
1438 						neigh_event_send(rt->u.dst.neighbour, NULL);
1439 					ip_rt_put(rth);
1440 					rt_drop(rt);
1441 					goto do_next;
1442 				}
1443 
1444 				netevent.old = &rth->u.dst;
1445 				netevent.new = &rt->u.dst;
1446 				call_netevent_notifiers(NETEVENT_REDIRECT,
1447 							&netevent);
1448 
1449 				rt_del(hash, rth);
1450 				if (!rt_intern_hash(hash, rt, &rt, NULL))
1451 					ip_rt_put(rt);
1452 				goto do_next;
1453 			}
1454 			rcu_read_unlock();
1455 		do_next:
1456 			;
1457 		}
1458 	}
1459 	in_dev_put(in_dev);
1460 	return;
1461 
1462 reject_redirect:
1463 #ifdef CONFIG_IP_ROUTE_VERBOSE
1464 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1465 		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1466 			"  Advised path = %pI4 -> %pI4\n",
1467 		       &old_gw, dev->name, &new_gw,
1468 		       &saddr, &daddr);
1469 #endif
1470 	in_dev_put(in_dev);
1471 }
1472 
1473 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1474 {
1475 	struct rtable *rt = (struct rtable *)dst;
1476 	struct dst_entry *ret = dst;
1477 
1478 	if (rt) {
1479 		if (dst->obsolete) {
1480 			ip_rt_put(rt);
1481 			ret = NULL;
1482 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1483 			   rt->u.dst.expires) {
1484 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1485 						rt->fl.oif,
1486 						rt_genid(dev_net(dst->dev)));
1487 #if RT_CACHE_DEBUG >= 1
1488 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1489 				&rt->rt_dst, rt->fl.fl4_tos);
1490 #endif
1491 			rt_del(hash, rt);
1492 			ret = NULL;
1493 		}
1494 	}
1495 	return ret;
1496 }
1497 
1498 /*
1499  * Algorithm:
1500  *	1. The first ip_rt_redirect_number redirects are sent
1501  *	   with exponential backoff, then we stop sending them at all,
1502  *	   assuming that the host ignores our redirects.
1503  *	2. If we did not see packets requiring redirects
1504  *	   during ip_rt_redirect_silence, we assume that the host
1505  *	   forgot redirected route and start to send redirects again.
1506  *	   has forgotten the redirected route, and we start sending redirects again.
1507  * This algorithm is much cheaper and more intelligent than dumb load limiting
1508  * in icmp.c.
1509  *
1510  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1511  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1512  */
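/*
 * Rough numbers with the defaults above: ip_rt_redirect_load is HZ/50
 * (20ms), so after the first redirect the next one is sent no sooner than
 * rate_last + (HZ/50 << rate_tokens), i.e. the minimum gaps double:
 * 40ms, 80ms, 160ms, ...  After ip_rt_redirect_number (9) apparently
 * ignored redirects we go quiet until no redirect-worthy packets have
 * been seen for ip_rt_redirect_silence ((HZ/50) << 10, about 20 seconds).
 */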
1513 
1514 void ip_rt_send_redirect(struct sk_buff *skb)
1515 {
1516 	struct rtable *rt = skb_rtable(skb);
1517 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1518 
1519 	if (!in_dev)
1520 		return;
1521 
1522 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1523 		goto out;
1524 
1525 	/* No redirected packets during ip_rt_redirect_silence;
1526 	 * reset the algorithm.
1527 	 */
1528 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1529 		rt->u.dst.rate_tokens = 0;
1530 
1531 	/* Too many ignored redirects; do not send anything.
1532 	 * Set u.dst.rate_last to the last seen redirected packet.
1533 	 */
1534 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1535 		rt->u.dst.rate_last = jiffies;
1536 		goto out;
1537 	}
1538 
1539 	/* Check for load limit; set rate_last to the latest sent
1540 	 * redirect.
1541 	 */
1542 	if (rt->u.dst.rate_tokens == 0 ||
1543 	    time_after(jiffies,
1544 		       (rt->u.dst.rate_last +
1545 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1546 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1547 		rt->u.dst.rate_last = jiffies;
1548 		++rt->u.dst.rate_tokens;
1549 #ifdef CONFIG_IP_ROUTE_VERBOSE
1550 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1551 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1552 		    net_ratelimit())
1553 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1554 				&rt->rt_src, rt->rt_iif,
1555 				&rt->rt_dst, &rt->rt_gateway);
1556 #endif
1557 	}
1558 out:
1559 	in_dev_put(in_dev);
1560 }
1561 
1562 static int ip_error(struct sk_buff *skb)
1563 {
1564 	struct rtable *rt = skb_rtable(skb);
1565 	unsigned long now;
1566 	int code;
1567 
1568 	switch (rt->u.dst.error) {
1569 		case EINVAL:
1570 		default:
1571 			goto out;
1572 		case EHOSTUNREACH:
1573 			code = ICMP_HOST_UNREACH;
1574 			break;
1575 		case ENETUNREACH:
1576 			code = ICMP_NET_UNREACH;
1577 			IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1578 					IPSTATS_MIB_INNOROUTES);
1579 			break;
1580 		case EACCES:
1581 			code = ICMP_PKT_FILTERED;
1582 			break;
1583 	}
1584 
1585 	now = jiffies;
1586 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1587 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1588 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1589 	rt->u.dst.rate_last = now;
1590 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1591 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1592 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1593 	}
1594 
1595 out:	kfree_skb(skb);
1596 	return 0;
1597 }
1598 
1599 /*
1600  *	The last two values are not from the RFC but
1601  *	are needed for AMPRnet AX.25 paths.
1602  */
1603 
1604 static const unsigned short mtu_plateau[] =
1605 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1606 
1607 static inline unsigned short guess_mtu(unsigned short old_mtu)
1608 {
1609 	int i;
1610 
1611 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1612 		if (old_mtu > mtu_plateau[i])
1613 			return mtu_plateau[i];
1614 	return 68;
1615 }
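/*
 * For example, guess_mtu(1500) falls back to the 1492 plateau,
 * guess_mtu(576) falls back to 296, and anything at or below the last
 * plateau (128) returns the 68-byte minimum (illustrative values only).
 */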
1616 
1617 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1618 				 unsigned short new_mtu,
1619 				 struct net_device *dev)
1620 {
1621 	int i, k;
1622 	unsigned short old_mtu = ntohs(iph->tot_len);
1623 	struct rtable *rth;
1624 	int  ikeys[2] = { dev->ifindex, 0 };
1625 	__be32  skeys[2] = { iph->saddr, 0, };
1626 	__be32  daddr = iph->daddr;
1627 	unsigned short est_mtu = 0;
1628 
1629 	if (ipv4_config.no_pmtu_disc)
1630 		return 0;
1631 
1632 	for (k = 0; k < 2; k++) {
1633 		for (i = 0; i < 2; i++) {
1634 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1635 						rt_genid(net));
1636 
1637 			rcu_read_lock();
1638 			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1639 			     rth = rcu_dereference(rth->u.dst.rt_next)) {
1640 				unsigned short mtu = new_mtu;
1641 
1642 				if (rth->fl.fl4_dst != daddr ||
1643 				    rth->fl.fl4_src != skeys[i] ||
1644 				    rth->rt_dst != daddr ||
1645 				    rth->rt_src != iph->saddr ||
1646 				    rth->fl.oif != ikeys[k] ||
1647 				    rth->fl.iif != 0 ||
1648 				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1649 				    !net_eq(dev_net(rth->u.dst.dev), net) ||
1650 				    rt_is_expired(rth))
1651 					continue;
1652 
1653 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1654 
1655 					/* BSD 4.2 compatibility hack :-( */
1656 					if (mtu == 0 &&
1657 					    old_mtu >= dst_mtu(&rth->u.dst) &&
1658 					    old_mtu >= 68 + (iph->ihl << 2))
1659 						old_mtu -= iph->ihl << 2;
1660 
1661 					mtu = guess_mtu(old_mtu);
1662 				}
1663 				if (mtu <= dst_mtu(&rth->u.dst)) {
1664 					if (mtu < dst_mtu(&rth->u.dst)) {
1665 						dst_confirm(&rth->u.dst);
1666 						if (mtu < ip_rt_min_pmtu) {
1667 							mtu = ip_rt_min_pmtu;
1668 							rth->u.dst.metrics[RTAX_LOCK-1] |=
1669 								(1 << RTAX_MTU);
1670 						}
1671 						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1672 						dst_set_expires(&rth->u.dst,
1673 							ip_rt_mtu_expires);
1674 					}
1675 					est_mtu = mtu;
1676 				}
1677 			}
1678 			rcu_read_unlock();
1679 		}
1680 	}
1681 	return est_mtu ? : new_mtu;
1682 }
1683 
1684 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1685 {
1686 	if (dst_mtu(dst) > mtu && mtu >= 68 &&
1687 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1688 		if (mtu < ip_rt_min_pmtu) {
1689 			mtu = ip_rt_min_pmtu;
1690 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1691 		}
1692 		dst->metrics[RTAX_MTU-1] = mtu;
1693 		dst_set_expires(dst, ip_rt_mtu_expires);
1694 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1695 	}
1696 }
1697 
1698 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1699 {
1700 	return NULL;
1701 }
1702 
1703 static void ipv4_dst_destroy(struct dst_entry *dst)
1704 {
1705 	struct rtable *rt = (struct rtable *) dst;
1706 	struct inet_peer *peer = rt->peer;
1707 	struct in_device *idev = rt->idev;
1708 
1709 	if (peer) {
1710 		rt->peer = NULL;
1711 		inet_putpeer(peer);
1712 	}
1713 
1714 	if (idev) {
1715 		rt->idev = NULL;
1716 		in_dev_put(idev);
1717 	}
1718 }
1719 
1720 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1721 			    int how)
1722 {
1723 	struct rtable *rt = (struct rtable *) dst;
1724 	struct in_device *idev = rt->idev;
1725 	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1726 		struct in_device *loopback_idev =
1727 			in_dev_get(dev_net(dev)->loopback_dev);
1728 		if (loopback_idev) {
1729 			rt->idev = loopback_idev;
1730 			in_dev_put(idev);
1731 		}
1732 	}
1733 }
1734 
1735 static void ipv4_link_failure(struct sk_buff *skb)
1736 {
1737 	struct rtable *rt;
1738 
1739 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1740 
1741 	rt = skb_rtable(skb);
1742 	if (rt)
1743 		dst_set_expires(&rt->u.dst, 0);
1744 }
1745 
1746 static int ip_rt_bug(struct sk_buff *skb)
1747 {
1748 	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1749 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1750 		skb->dev ? skb->dev->name : "?");
1751 	kfree_skb(skb);
1752 	return 0;
1753 }
1754 
1755 /*
1756    We do not cache the source address of the outgoing interface,
1757    because it is used only by the IP RR, TS and SRR options,
1758    so it is out of the fast path.
1759 
1760    BTW remember: "addr" is allowed to be unaligned
1761    in IP options!
1762  */
1763 
1764 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1765 {
1766 	__be32 src;
1767 	struct fib_result res;
1768 
1769 	if (rt->fl.iif == 0)
1770 		src = rt->rt_src;
1771 	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1772 		src = FIB_RES_PREFSRC(res);
1773 		fib_res_put(&res);
1774 	} else
1775 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1776 					RT_SCOPE_UNIVERSE);
1777 	memcpy(addr, &src, 4);
1778 }
1779 
1780 #ifdef CONFIG_NET_CLS_ROUTE
1781 static void set_class_tag(struct rtable *rt, u32 tag)
1782 {
1783 	if (!(rt->u.dst.tclassid & 0xFFFF))
1784 		rt->u.dst.tclassid |= tag & 0xFFFF;
1785 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1786 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1787 }
1788 #endif
1789 
1790 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1791 {
1792 	struct fib_info *fi = res->fi;
1793 
1794 	if (fi) {
1795 		if (FIB_RES_GW(*res) &&
1796 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1797 			rt->rt_gateway = FIB_RES_GW(*res);
1798 		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1799 		       sizeof(rt->u.dst.metrics));
1800 		if (fi->fib_mtu == 0) {
1801 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1802 			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1803 			    rt->rt_gateway != rt->rt_dst &&
1804 			    rt->u.dst.dev->mtu > 576)
1805 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1806 		}
1807 #ifdef CONFIG_NET_CLS_ROUTE
1808 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1809 #endif
1810 	} else
1811 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1812 
1813 	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1814 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1815 	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1816 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1817 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1818 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1819 				       ip_rt_min_advmss);
1820 	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1821 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1822 
1823 #ifdef CONFIG_NET_CLS_ROUTE
1824 #ifdef CONFIG_IP_MULTIPLE_TABLES
1825 	set_class_tag(rt, fib_rules_tclass(res));
1826 #endif
1827 	set_class_tag(rt, itag);
1828 #endif
1829 	rt->rt_type = res->type;
1830 }
1831 
1832 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1833 				u8 tos, struct net_device *dev, int our)
1834 {
1835 	unsigned hash;
1836 	struct rtable *rth;
1837 	__be32 spec_dst;
1838 	struct in_device *in_dev = in_dev_get(dev);
1839 	u32 itag = 0;
1840 
1841 	/* Primary sanity checks. */
1842 
1843 	if (in_dev == NULL)
1844 		return -EINVAL;
1845 
1846 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1847 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1848 		goto e_inval;
1849 
1850 	if (ipv4_is_zeronet(saddr)) {
1851 		if (!ipv4_is_local_multicast(daddr))
1852 			goto e_inval;
1853 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1854 	} else if (fib_validate_source(saddr, 0, tos, 0,
1855 					dev, &spec_dst, &itag) < 0)
1856 		goto e_inval;
1857 
1858 	rth = dst_alloc(&ipv4_dst_ops);
1859 	if (!rth)
1860 		goto e_nobufs;
1861 
1862 	rth->u.dst.output= ip_rt_bug;
1863 
1864 	atomic_set(&rth->u.dst.__refcnt, 1);
1865 	rth->u.dst.flags= DST_HOST;
1866 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1867 		rth->u.dst.flags |= DST_NOPOLICY;
1868 	rth->fl.fl4_dst	= daddr;
1869 	rth->rt_dst	= daddr;
1870 	rth->fl.fl4_tos	= tos;
1871 	rth->fl.mark    = skb->mark;
1872 	rth->fl.fl4_src	= saddr;
1873 	rth->rt_src	= saddr;
1874 #ifdef CONFIG_NET_CLS_ROUTE
1875 	rth->u.dst.tclassid = itag;
1876 #endif
1877 	rth->rt_iif	=
1878 	rth->fl.iif	= dev->ifindex;
1879 	rth->u.dst.dev	= init_net.loopback_dev;
1880 	dev_hold(rth->u.dst.dev);
1881 	rth->idev	= in_dev_get(rth->u.dst.dev);
1882 	rth->fl.oif	= 0;
1883 	rth->rt_gateway	= daddr;
1884 	rth->rt_spec_dst= spec_dst;
1885 	rth->rt_genid	= rt_genid(dev_net(dev));
1886 	rth->rt_flags	= RTCF_MULTICAST;
1887 	rth->rt_type	= RTN_MULTICAST;
1888 	if (our) {
1889 		rth->u.dst.input= ip_local_deliver;
1890 		rth->rt_flags |= RTCF_LOCAL;
1891 	}
1892 
1893 #ifdef CONFIG_IP_MROUTE
1894 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1895 		rth->u.dst.input = ip_mr_input;
1896 #endif
1897 	RT_CACHE_STAT_INC(in_slow_mc);
1898 
1899 	in_dev_put(in_dev);
1900 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1901 	return rt_intern_hash(hash, rth, NULL, skb);
1902 
1903 e_nobufs:
1904 	in_dev_put(in_dev);
1905 	return -ENOBUFS;
1906 
1907 e_inval:
1908 	in_dev_put(in_dev);
1909 	return -EINVAL;
1910 }
1911 
1912 
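/*
 * Account for and (optionally) log a packet received with a martian
 * source address.  Per RFC 1812 the link-layer header is the only
 * useful hint, so it is dumped when verbose martian logging is enabled.
 */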
1913 static void ip_handle_martian_source(struct net_device *dev,
1914 				     struct in_device *in_dev,
1915 				     struct sk_buff *skb,
1916 				     __be32 daddr,
1917 				     __be32 saddr)
1918 {
1919 	RT_CACHE_STAT_INC(in_martian_src);
1920 #ifdef CONFIG_IP_ROUTE_VERBOSE
1921 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1922 		/*
1923 		 *	RFC 1812 recommendation: if the source is martian,
1924 		 *	the only hint is the MAC header.
		 *	RFC 1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
1925 		 */
1926 		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1927 			&daddr, &saddr, dev->name);
1928 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1929 			int i;
1930 			const unsigned char *p = skb_mac_header(skb);
1931 			printk(KERN_WARNING "ll header: ");
1932 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1933 				printk("%02x", *p);
1934 				if (i < (dev->hard_header_len - 1))
1935 					printk(":");
1936 			}
1937 			printk("\n");
1938 		}
1939 	}
1940 #endif
1941 }
1942 
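/*
 * Construct the routing cache entry for a forwarded packet: validate the
 * source against the FIB result, decide whether an ICMP redirect should be
 * suggested, and allocate an rtable wired up to ip_forward()/ip_output().
 * The new entry is returned via *result; it is not hashed here.
 */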
1943 static int __mkroute_input(struct sk_buff *skb,
1944 			   struct fib_result *res,
1945 			   struct in_device *in_dev,
1946 			   __be32 daddr, __be32 saddr, u32 tos,
1947 			   struct rtable **result)
1948 {
1949 
1950 	struct rtable *rth;
1951 	int err;
1952 	struct in_device *out_dev;
1953 	unsigned flags = 0;
1954 	__be32 spec_dst;
1955 	u32 itag;
1956 
1957 	/* get a working reference to the output device */
1958 	out_dev = in_dev_get(FIB_RES_DEV(*res));
1959 	if (out_dev == NULL) {
1960 		if (net_ratelimit())
1961 			printk(KERN_CRIT "Bug in ip_route_input"
1962 			       "_slow(). Please, report\n");
1963 		return -EINVAL;
1964 	}
1965 
1966 
1967 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1968 				  in_dev->dev, &spec_dst, &itag);
1969 	if (err < 0) {
1970 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1971 					 saddr);
1972 
1973 		err = -EINVAL;
1974 		goto cleanup;
1975 	}
1976 
1977 	if (err)
1978 		flags |= RTCF_DIRECTSRC;
1979 
1980 	if (out_dev == in_dev && err &&
1981 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1982 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1983 		flags |= RTCF_DOREDIRECT;
1984 
1985 	if (skb->protocol != htons(ETH_P_IP)) {
1986 		/* Not IP (i.e. ARP). Do not create a route if it is
1987 		 * invalid for proxy ARP. DNAT routes are always valid.
1988 		 */
1989 		if (out_dev == in_dev) {
1990 			err = -EINVAL;
1991 			goto cleanup;
1992 		}
1993 	}
1994 
1995 
1996 	rth = dst_alloc(&ipv4_dst_ops);
1997 	if (!rth) {
1998 		err = -ENOBUFS;
1999 		goto cleanup;
2000 	}
2001 
2002 	atomic_set(&rth->u.dst.__refcnt, 1);
2003 	rth->u.dst.flags= DST_HOST;
2004 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2005 		rth->u.dst.flags |= DST_NOPOLICY;
2006 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2007 		rth->u.dst.flags |= DST_NOXFRM;
2008 	rth->fl.fl4_dst	= daddr;
2009 	rth->rt_dst	= daddr;
2010 	rth->fl.fl4_tos	= tos;
2011 	rth->fl.mark    = skb->mark;
2012 	rth->fl.fl4_src	= saddr;
2013 	rth->rt_src	= saddr;
2014 	rth->rt_gateway	= daddr;
2015 	rth->rt_iif 	=
2016 		rth->fl.iif	= in_dev->dev->ifindex;
2017 	rth->u.dst.dev	= (out_dev)->dev;
2018 	dev_hold(rth->u.dst.dev);
2019 	rth->idev	= in_dev_get(rth->u.dst.dev);
2020 	rth->fl.oif 	= 0;
2021 	rth->rt_spec_dst= spec_dst;
2022 
2023 	rth->u.dst.input = ip_forward;
2024 	rth->u.dst.output = ip_output;
2025 	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2026 
2027 	rt_set_nexthop(rth, res, itag);
2028 
2029 	rth->rt_flags = flags;
2030 
2031 	*result = rth;
2032 	err = 0;
2033  cleanup:
2034 	/* release the working reference to the output device */
2035 	in_dev_put(out_dev);
2036 	return err;
2037 }
2038 
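/*
 * Wrapper around __mkroute_input(): pick a nexthop for multipath routes
 * if necessary, build the cache entry and insert it into the hash table.
 */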
2039 static int ip_mkroute_input(struct sk_buff *skb,
2040 			    struct fib_result *res,
2041 			    const struct flowi *fl,
2042 			    struct in_device *in_dev,
2043 			    __be32 daddr, __be32 saddr, u32 tos)
2044 {
2045 	struct rtable* rth = NULL;
2046 	int err;
2047 	unsigned hash;
2048 
2049 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2050 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2051 		fib_select_multipath(fl, res);
2052 #endif
2053 
2054 	/* create a routing cache entry */
2055 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2056 	if (err)
2057 		return err;
2058 
2059 	/* put it into the cache */
2060 	hash = rt_hash(daddr, saddr, fl->iif,
2061 		       rt_genid(dev_net(rth->u.dst.dev)));
2062 	return rt_intern_hash(hash, rth, NULL, skb);
2063 }
2064 
2065 /*
2066  *	NOTE. We drop all packets that have a local source
2067  *	address, because every properly looped back packet
2068  *	must have the correct destination already attached by the output routine.
2069  *
2070  *	This approach solves two big problems:
2071  *	1. Non-simplex devices are handled properly.
2072  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2073  */
2074 
2075 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2076 			       u8 tos, struct net_device *dev)
2077 {
2078 	struct fib_result res;
2079 	struct in_device *in_dev = in_dev_get(dev);
2080 	struct flowi fl = { .nl_u = { .ip4_u =
2081 				      { .daddr = daddr,
2082 					.saddr = saddr,
2083 					.tos = tos,
2084 					.scope = RT_SCOPE_UNIVERSE,
2085 				      } },
2086 			    .mark = skb->mark,
2087 			    .iif = dev->ifindex };
2088 	unsigned	flags = 0;
2089 	u32		itag = 0;
2090 	struct rtable * rth;
2091 	unsigned	hash;
2092 	__be32		spec_dst;
2093 	int		err = -EINVAL;
2094 	int		free_res = 0;
2095 	struct net    * net = dev_net(dev);
2096 
2097 	/* IP on this device is disabled. */
2098 
2099 	if (!in_dev)
2100 		goto out;
2101 
2102 	/* Check for the most unusual martians, which cannot be detected
2103 	   by fib_lookup.
2104 	 */
2105 
2106 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2107 	    ipv4_is_loopback(saddr))
2108 		goto martian_source;
2109 
2110 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2111 		goto brd_input;
2112 
2113 	/* Accept zero addresses only for the limited broadcast;
2114 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2115 	 */
2116 	if (ipv4_is_zeronet(saddr))
2117 		goto martian_source;
2118 
2119 	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2120 	    ipv4_is_loopback(daddr))
2121 		goto martian_destination;
2122 
2123 	/*
2124 	 *	Now we are ready to route the packet.
2125 	 */
2126 	if ((err = fib_lookup(net, &fl, &res)) != 0) {
2127 		if (!IN_DEV_FORWARD(in_dev))
2128 			goto e_hostunreach;
2129 		goto no_route;
2130 	}
2131 	free_res = 1;
2132 
2133 	RT_CACHE_STAT_INC(in_slow_tot);
2134 
2135 	if (res.type == RTN_BROADCAST)
2136 		goto brd_input;
2137 
2138 	if (res.type == RTN_LOCAL) {
2139 		int result;
2140 		result = fib_validate_source(saddr, daddr, tos,
2141 					     net->loopback_dev->ifindex,
2142 					     dev, &spec_dst, &itag);
2143 		if (result < 0)
2144 			goto martian_source;
2145 		if (result)
2146 			flags |= RTCF_DIRECTSRC;
2147 		spec_dst = daddr;
2148 		goto local_input;
2149 	}
2150 
2151 	if (!IN_DEV_FORWARD(in_dev))
2152 		goto e_hostunreach;
2153 	if (res.type != RTN_UNICAST)
2154 		goto martian_destination;
2155 
2156 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2157 done:
2158 	in_dev_put(in_dev);
2159 	if (free_res)
2160 		fib_res_put(&res);
2161 out:	return err;
2162 
2163 brd_input:
2164 	if (skb->protocol != htons(ETH_P_IP))
2165 		goto e_inval;
2166 
2167 	if (ipv4_is_zeronet(saddr))
2168 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2169 	else {
2170 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2171 					  &itag);
2172 		if (err < 0)
2173 			goto martian_source;
2174 		if (err)
2175 			flags |= RTCF_DIRECTSRC;
2176 	}
2177 	flags |= RTCF_BROADCAST;
2178 	res.type = RTN_BROADCAST;
2179 	RT_CACHE_STAT_INC(in_brd);
2180 
2181 local_input:
2182 	rth = dst_alloc(&ipv4_dst_ops);
2183 	if (!rth)
2184 		goto e_nobufs;
2185 
2186 	rth->u.dst.output= ip_rt_bug;
2187 	rth->rt_genid = rt_genid(net);
2188 
2189 	atomic_set(&rth->u.dst.__refcnt, 1);
2190 	rth->u.dst.flags= DST_HOST;
2191 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2192 		rth->u.dst.flags |= DST_NOPOLICY;
2193 	rth->fl.fl4_dst	= daddr;
2194 	rth->rt_dst	= daddr;
2195 	rth->fl.fl4_tos	= tos;
2196 	rth->fl.mark    = skb->mark;
2197 	rth->fl.fl4_src	= saddr;
2198 	rth->rt_src	= saddr;
2199 #ifdef CONFIG_NET_CLS_ROUTE
2200 	rth->u.dst.tclassid = itag;
2201 #endif
2202 	rth->rt_iif	=
2203 	rth->fl.iif	= dev->ifindex;
2204 	rth->u.dst.dev	= net->loopback_dev;
2205 	dev_hold(rth->u.dst.dev);
2206 	rth->idev	= in_dev_get(rth->u.dst.dev);
2207 	rth->rt_gateway	= daddr;
2208 	rth->rt_spec_dst= spec_dst;
2209 	rth->u.dst.input= ip_local_deliver;
2210 	rth->rt_flags 	= flags|RTCF_LOCAL;
2211 	if (res.type == RTN_UNREACHABLE) {
2212 		rth->u.dst.input= ip_error;
2213 		rth->u.dst.error= -err;
2214 		rth->rt_flags 	&= ~RTCF_LOCAL;
2215 	}
2216 	rth->rt_type	= res.type;
2217 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2218 	err = rt_intern_hash(hash, rth, NULL, skb);
2219 	goto done;
2220 
2221 no_route:
2222 	RT_CACHE_STAT_INC(in_no_route);
2223 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2224 	res.type = RTN_UNREACHABLE;
2225 	if (err == -ESRCH)
2226 		err = -ENETUNREACH;
2227 	goto local_input;
2228 
2229 	/*
2230 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2231 	 */
2232 martian_destination:
2233 	RT_CACHE_STAT_INC(in_martian_dst);
2234 #ifdef CONFIG_IP_ROUTE_VERBOSE
2235 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2236 		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2237 			&daddr, &saddr, dev->name);
2238 #endif
2239 
2240 e_hostunreach:
2241 	err = -EHOSTUNREACH;
2242 	goto done;
2243 
2244 e_inval:
2245 	err = -EINVAL;
2246 	goto done;
2247 
2248 e_nobufs:
2249 	err = -ENOBUFS;
2250 	goto done;
2251 
2252 martian_source:
2253 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2254 	goto e_inval;
2255 }
2256 
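/*
 * Main entry point for routing an incoming packet.  Try the route cache
 * first; on a miss, handle multicast destinations here (to avoid filling
 * the cache with reject entries on noisy multicast LANs) and fall back to
 * the slow path for everything else.  On success the resulting dst is
 * attached to the skb.
 */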
2257 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2258 		   u8 tos, struct net_device *dev)
2259 {
2260 	struct rtable * rth;
2261 	unsigned	hash;
2262 	int iif = dev->ifindex;
2263 	struct net *net;
2264 
2265 	net = dev_net(dev);
2266 
2267 	if (!rt_caching(net))
2268 		goto skip_cache;
2269 
2270 	tos &= IPTOS_RT_MASK;
2271 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2272 
2273 	rcu_read_lock();
2274 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2275 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
2276 		if (((rth->fl.fl4_dst ^ daddr) |
2277 		     (rth->fl.fl4_src ^ saddr) |
2278 		     (rth->fl.iif ^ iif) |
2279 		     rth->fl.oif |
2280 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
2281 		    rth->fl.mark == skb->mark &&
2282 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2283 		    !rt_is_expired(rth)) {
2284 			dst_use(&rth->u.dst, jiffies);
2285 			RT_CACHE_STAT_INC(in_hit);
2286 			rcu_read_unlock();
2287 			skb_dst_set(skb, &rth->u.dst);
2288 			return 0;
2289 		}
2290 		RT_CACHE_STAT_INC(in_hlist_search);
2291 	}
2292 	rcu_read_unlock();
2293 
2294 skip_cache:
2295 	/* Multicast recognition logic is moved from the route cache to here.
2296 	   The problem was that too many Ethernet cards have broken/missing
2297 	   hardware multicast filters :-( As a result, a host on a multicast
2298 	   network acquires a lot of useless route cache entries, e.g. for
2299 	   SDR messages from all over the world. Now we try to get rid of them.
2300 	   Really, provided the software IP multicast filter is organized
2301 	   reasonably (at least, hashed), this does not result in a slowdown
2302 	   compared with route cache reject entries.
2303 	   Note that multicast routers are not affected, because a
2304 	   route cache entry is created eventually.
2305 	 */
2306 	if (ipv4_is_multicast(daddr)) {
2307 		struct in_device *in_dev;
2308 
2309 		rcu_read_lock();
2310 		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2311 			int our = ip_check_mc(in_dev, daddr, saddr,
2312 				ip_hdr(skb)->protocol);
2313 			if (our
2314 #ifdef CONFIG_IP_MROUTE
2315 			    || (!ipv4_is_local_multicast(daddr) &&
2316 				IN_DEV_MFORWARD(in_dev))
2317 #endif
2318 			    ) {
2319 				rcu_read_unlock();
2320 				return ip_route_input_mc(skb, daddr, saddr,
2321 							 tos, dev, our);
2322 			}
2323 		}
2324 		rcu_read_unlock();
2325 		return -EINVAL;
2326 	}
2327 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2328 }
2329 
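/*
 * Build the routing cache entry for an outgoing packet once the FIB
 * lookup (or a shortcut) has chosen the output device: classify the
 * destination (broadcast/multicast/local), set up the input and output
 * handlers accordingly and fill in the nexthop data.  The entry is
 * returned via *result without being hashed.
 */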
2330 static int __mkroute_output(struct rtable **result,
2331 			    struct fib_result *res,
2332 			    const struct flowi *fl,
2333 			    const struct flowi *oldflp,
2334 			    struct net_device *dev_out,
2335 			    unsigned flags)
2336 {
2337 	struct rtable *rth;
2338 	struct in_device *in_dev;
2339 	u32 tos = RT_FL_TOS(oldflp);
2340 	int err = 0;
2341 
2342 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2343 		return -EINVAL;
2344 
2345 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2346 		res->type = RTN_BROADCAST;
2347 	else if (ipv4_is_multicast(fl->fl4_dst))
2348 		res->type = RTN_MULTICAST;
2349 	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2350 		return -EINVAL;
2351 
2352 	if (dev_out->flags & IFF_LOOPBACK)
2353 		flags |= RTCF_LOCAL;
2354 
2355 	/* get a working reference to the inet device */
2356 	in_dev = in_dev_get(dev_out);
2357 	if (!in_dev)
2358 		return -EINVAL;
2359 
2360 	if (res->type == RTN_BROADCAST) {
2361 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2362 		if (res->fi) {
2363 			fib_info_put(res->fi);
2364 			res->fi = NULL;
2365 		}
2366 	} else if (res->type == RTN_MULTICAST) {
2367 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
2368 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2369 				 oldflp->proto))
2370 			flags &= ~RTCF_LOCAL;
2371 		/* If the multicast route does not exist, use the
2372 		   default one, but do not use a gateway in this case.
2373 		   Yes, it is a hack.
2374 		 */
2375 		if (res->fi && res->prefixlen < 4) {
2376 			fib_info_put(res->fi);
2377 			res->fi = NULL;
2378 		}
2379 	}
2380 
2381 
2382 	rth = dst_alloc(&ipv4_dst_ops);
2383 	if (!rth) {
2384 		err = -ENOBUFS;
2385 		goto cleanup;
2386 	}
2387 
2388 	atomic_set(&rth->u.dst.__refcnt, 1);
2389 	rth->u.dst.flags= DST_HOST;
2390 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2391 		rth->u.dst.flags |= DST_NOXFRM;
2392 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2393 		rth->u.dst.flags |= DST_NOPOLICY;
2394 
2395 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2396 	rth->fl.fl4_tos	= tos;
2397 	rth->fl.fl4_src	= oldflp->fl4_src;
2398 	rth->fl.oif	= oldflp->oif;
2399 	rth->fl.mark    = oldflp->mark;
2400 	rth->rt_dst	= fl->fl4_dst;
2401 	rth->rt_src	= fl->fl4_src;
2402 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2403 	/* get references to the devices that are to be held by the routing
2404 	   cache entry */
2405 	rth->u.dst.dev	= dev_out;
2406 	dev_hold(dev_out);
2407 	rth->idev	= in_dev_get(dev_out);
2408 	rth->rt_gateway = fl->fl4_dst;
2409 	rth->rt_spec_dst= fl->fl4_src;
2410 
2411 	rth->u.dst.output=ip_output;
2412 	rth->rt_genid = rt_genid(dev_net(dev_out));
2413 
2414 	RT_CACHE_STAT_INC(out_slow_tot);
2415 
2416 	if (flags & RTCF_LOCAL) {
2417 		rth->u.dst.input = ip_local_deliver;
2418 		rth->rt_spec_dst = fl->fl4_dst;
2419 	}
2420 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2421 		rth->rt_spec_dst = fl->fl4_src;
2422 		if (flags & RTCF_LOCAL &&
2423 		    !(dev_out->flags & IFF_LOOPBACK)) {
2424 			rth->u.dst.output = ip_mc_output;
2425 			RT_CACHE_STAT_INC(out_slow_mc);
2426 		}
2427 #ifdef CONFIG_IP_MROUTE
2428 		if (res->type == RTN_MULTICAST) {
2429 			if (IN_DEV_MFORWARD(in_dev) &&
2430 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2431 				rth->u.dst.input = ip_mr_input;
2432 				rth->u.dst.output = ip_mc_output;
2433 			}
2434 		}
2435 #endif
2436 	}
2437 
2438 	rt_set_nexthop(rth, res, 0);
2439 
2440 	rth->rt_flags = flags;
2441 
2442 	*result = rth;
2443  cleanup:
2444 	/* release the working reference to the inet device */
2445 	in_dev_put(in_dev);
2446 
2447 	return err;
2448 }
2449 
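/*
 * Build an output cache entry via __mkroute_output() and, on success,
 * insert it into the route hash, returning the cached route through *rp.
 */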
2450 static int ip_mkroute_output(struct rtable **rp,
2451 			     struct fib_result *res,
2452 			     const struct flowi *fl,
2453 			     const struct flowi *oldflp,
2454 			     struct net_device *dev_out,
2455 			     unsigned flags)
2456 {
2457 	struct rtable *rth = NULL;
2458 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2459 	unsigned hash;
2460 	if (err == 0) {
2461 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2462 			       rt_genid(dev_net(dev_out)));
2463 		err = rt_intern_hash(hash, rth, rp, NULL);
2464 	}
2465 
2466 	return err;
2467 }
2468 
2469 /*
2470  * Major route resolver routine.
2471  */
2472 
2473 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2474 				const struct flowi *oldflp)
2475 {
2476 	u32 tos	= RT_FL_TOS(oldflp);
2477 	struct flowi fl = { .nl_u = { .ip4_u =
2478 				      { .daddr = oldflp->fl4_dst,
2479 					.saddr = oldflp->fl4_src,
2480 					.tos = tos & IPTOS_RT_MASK,
2481 					.scope = ((tos & RTO_ONLINK) ?
2482 						  RT_SCOPE_LINK :
2483 						  RT_SCOPE_UNIVERSE),
2484 				      } },
2485 			    .mark = oldflp->mark,
2486 			    .iif = net->loopback_dev->ifindex,
2487 			    .oif = oldflp->oif };
2488 	struct fib_result res;
2489 	unsigned flags = 0;
2490 	struct net_device *dev_out = NULL;
2491 	int free_res = 0;
2492 	int err;
2493 
2494 
2495 	res.fi		= NULL;
2496 #ifdef CONFIG_IP_MULTIPLE_TABLES
2497 	res.r		= NULL;
2498 #endif
2499 
2500 	if (oldflp->fl4_src) {
2501 		err = -EINVAL;
2502 		if (ipv4_is_multicast(oldflp->fl4_src) ||
2503 		    ipv4_is_lbcast(oldflp->fl4_src) ||
2504 		    ipv4_is_zeronet(oldflp->fl4_src))
2505 			goto out;
2506 
2507 		/* I removed the check for oif == dev_out->oif here.
2508 		   It was wrong for two reasons:
2509 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2510 		      is assigned to multiple interfaces.
2511 		   2. Moreover, we are allowed to send packets with the saddr
2512 		      of another iface. --ANK
2513 		 */
2514 
2515 		if (oldflp->oif == 0
2516 		    && (ipv4_is_multicast(oldflp->fl4_dst) ||
2517 			oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2518 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2519 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2520 			if (dev_out == NULL)
2521 				goto out;
2522 
2523 			/* Special hack: the user can direct multicasts
2524 			   and limited broadcasts via the desired interface
2525 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2526 			   This hack is not just for fun, it allows
2527 			   vic, vat and friends to work.
2528 			   They bind a socket to loopback, set the ttl to zero
2529 			   and expect that it will work.
2530 			   From the viewpoint of the routing cache they are broken,
2531 			   because we are not allowed to build a multicast path
2532 			   with a loopback source addr (look, the routing cache
2533 			   cannot know that the ttl is zero, so that the packet
2534 			   will not leave this host and the route is valid).
2535 			   Luckily, this hack is a good workaround.
2536 			 */
2537 
2538 			fl.oif = dev_out->ifindex;
2539 			goto make_route;
2540 		}
2541 
2542 		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2543 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2544 			dev_out = ip_dev_find(net, oldflp->fl4_src);
2545 			if (dev_out == NULL)
2546 				goto out;
2547 			dev_put(dev_out);
2548 			dev_out = NULL;
2549 		}
2550 	}
2551 
2552 
2553 	if (oldflp->oif) {
2554 		dev_out = dev_get_by_index(net, oldflp->oif);
2555 		err = -ENODEV;
2556 		if (dev_out == NULL)
2557 			goto out;
2558 
2559 		/* RACE: Check return value of inet_select_addr instead. */
2560 		if (__in_dev_get_rtnl(dev_out) == NULL) {
2561 			dev_put(dev_out);
2562 			goto out;	/* Wrong error code */
2563 		}
2564 
2565 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2566 		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2567 			if (!fl.fl4_src)
2568 				fl.fl4_src = inet_select_addr(dev_out, 0,
2569 							      RT_SCOPE_LINK);
2570 			goto make_route;
2571 		}
2572 		if (!fl.fl4_src) {
2573 			if (ipv4_is_multicast(oldflp->fl4_dst))
2574 				fl.fl4_src = inet_select_addr(dev_out, 0,
2575 							      fl.fl4_scope);
2576 			else if (!oldflp->fl4_dst)
2577 				fl.fl4_src = inet_select_addr(dev_out, 0,
2578 							      RT_SCOPE_HOST);
2579 		}
2580 	}
2581 
2582 	if (!fl.fl4_dst) {
2583 		fl.fl4_dst = fl.fl4_src;
2584 		if (!fl.fl4_dst)
2585 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2586 		if (dev_out)
2587 			dev_put(dev_out);
2588 		dev_out = net->loopback_dev;
2589 		dev_hold(dev_out);
2590 		fl.oif = net->loopback_dev->ifindex;
2591 		res.type = RTN_LOCAL;
2592 		flags |= RTCF_LOCAL;
2593 		goto make_route;
2594 	}
2595 
2596 	if (fib_lookup(net, &fl, &res)) {
2597 		res.fi = NULL;
2598 		if (oldflp->oif) {
2599 			/* Apparently, the routing tables are wrong. Assume
2600 			   that the destination is on-link.
2601 
2602 			   WHY? DW.
2603 			   Because we are allowed to send to an iface
2604 			   even if it has NO routes and NO assigned
2605 			   addresses. When oif is specified, the routing
2606 			   tables are looked up with only one purpose:
2607 			   to catch whether the destination is gatewayed, rather
2608 			   than direct. Moreover, if MSG_DONTROUTE is set,
2609 			   we send the packet, ignoring both routing tables
2610 			   and ifaddr state. --ANK
2611 
2612 
2613 			   We could make it even if oif is unknown,
2614 			   likely IPv6, but we do not.
2615 			 */
2616 
2617 			if (fl.fl4_src == 0)
2618 				fl.fl4_src = inet_select_addr(dev_out, 0,
2619 							      RT_SCOPE_LINK);
2620 			res.type = RTN_UNICAST;
2621 			goto make_route;
2622 		}
2623 		if (dev_out)
2624 			dev_put(dev_out);
2625 		err = -ENETUNREACH;
2626 		goto out;
2627 	}
2628 	free_res = 1;
2629 
2630 	if (res.type == RTN_LOCAL) {
2631 		if (!fl.fl4_src)
2632 			fl.fl4_src = fl.fl4_dst;
2633 		if (dev_out)
2634 			dev_put(dev_out);
2635 		dev_out = net->loopback_dev;
2636 		dev_hold(dev_out);
2637 		fl.oif = dev_out->ifindex;
2638 		if (res.fi)
2639 			fib_info_put(res.fi);
2640 		res.fi = NULL;
2641 		flags |= RTCF_LOCAL;
2642 		goto make_route;
2643 	}
2644 
2645 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2646 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2647 		fib_select_multipath(&fl, &res);
2648 	else
2649 #endif
2650 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2651 		fib_select_default(net, &fl, &res);
2652 
2653 	if (!fl.fl4_src)
2654 		fl.fl4_src = FIB_RES_PREFSRC(res);
2655 
2656 	if (dev_out)
2657 		dev_put(dev_out);
2658 	dev_out = FIB_RES_DEV(res);
2659 	dev_hold(dev_out);
2660 	fl.oif = dev_out->ifindex;
2661 
2662 
2663 make_route:
2664 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2665 
2666 
2667 	if (free_res)
2668 		fib_res_put(&res);
2669 	if (dev_out)
2670 		dev_put(dev_out);
2671 out:	return err;
2672 }
2673 
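/*
 * Output route lookup without xfrm policy resolution: scan the route
 * cache for an entry matching the flow key and fall back to
 * ip_route_output_slow() on a miss (or when caching is disabled).
 */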
2674 int __ip_route_output_key(struct net *net, struct rtable **rp,
2675 			  const struct flowi *flp)
2676 {
2677 	unsigned hash;
2678 	struct rtable *rth;
2679 
2680 	if (!rt_caching(net))
2681 		goto slow_output;
2682 
2683 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2684 
2685 	rcu_read_lock_bh();
2686 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2687 		rth = rcu_dereference(rth->u.dst.rt_next)) {
2688 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2689 		    rth->fl.fl4_src == flp->fl4_src &&
2690 		    rth->fl.iif == 0 &&
2691 		    rth->fl.oif == flp->oif &&
2692 		    rth->fl.mark == flp->mark &&
2693 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2694 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2695 		    net_eq(dev_net(rth->u.dst.dev), net) &&
2696 		    !rt_is_expired(rth)) {
2697 			dst_use(&rth->u.dst, jiffies);
2698 			RT_CACHE_STAT_INC(out_hit);
2699 			rcu_read_unlock_bh();
2700 			*rp = rth;
2701 			return 0;
2702 		}
2703 		RT_CACHE_STAT_INC(out_hlist_search);
2704 	}
2705 	rcu_read_unlock_bh();
2706 
2707 slow_output:
2708 	return ip_route_output_slow(net, rp, flp);
2709 }
2710 
2711 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2712 
2713 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2714 {
2715 }
2716 
2717 static struct dst_ops ipv4_dst_blackhole_ops = {
2718 	.family			=	AF_INET,
2719 	.protocol		=	cpu_to_be16(ETH_P_IP),
2720 	.destroy		=	ipv4_dst_destroy,
2721 	.check			=	ipv4_dst_check,
2722 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2723 	.entries		=	ATOMIC_INIT(0),
2724 };
2725 
2726 
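/*
 * Replace *rp with a "blackhole" copy of the route: the addressing
 * information is preserved, but the input/output handlers simply
 * discard packets.  Used below when __xfrm_lookup() returns -EREMOTE,
 * so traffic is quietly dropped while the required transforms are
 * unavailable.
 */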
2727 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2728 {
2729 	struct rtable *ort = *rp;
2730 	struct rtable *rt = (struct rtable *)
2731 		dst_alloc(&ipv4_dst_blackhole_ops);
2732 
2733 	if (rt) {
2734 		struct dst_entry *new = &rt->u.dst;
2735 
2736 		atomic_set(&new->__refcnt, 1);
2737 		new->__use = 1;
2738 		new->input = dst_discard;
2739 		new->output = dst_discard;
2740 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2741 
2742 		new->dev = ort->u.dst.dev;
2743 		if (new->dev)
2744 			dev_hold(new->dev);
2745 
2746 		rt->fl = ort->fl;
2747 
2748 		rt->idev = ort->idev;
2749 		if (rt->idev)
2750 			in_dev_hold(rt->idev);
2751 		rt->rt_genid = rt_genid(net);
2752 		rt->rt_flags = ort->rt_flags;
2753 		rt->rt_type = ort->rt_type;
2754 		rt->rt_dst = ort->rt_dst;
2755 		rt->rt_src = ort->rt_src;
2756 		rt->rt_iif = ort->rt_iif;
2757 		rt->rt_gateway = ort->rt_gateway;
2758 		rt->rt_spec_dst = ort->rt_spec_dst;
2759 		rt->peer = ort->peer;
2760 		if (rt->peer)
2761 			atomic_inc(&rt->peer->refcnt);
2762 
2763 		dst_free(new);
2764 	}
2765 
2766 	dst_release(&(*rp)->u.dst);
2767 	*rp = rt;
2768 	return (rt ? 0 : -ENOMEM);
2769 }
2770 
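/*
 * Resolve an output route and, when a protocol is specified in the flow,
 * run the result through the xfrm (IPsec) policy lookup, possibly
 * substituting a blackhole route if the transforms are not yet in place.
 */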
2771 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2772 			 struct sock *sk, int flags)
2773 {
2774 	int err;
2775 
2776 	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2777 		return err;
2778 
2779 	if (flp->proto) {
2780 		if (!flp->fl4_src)
2781 			flp->fl4_src = (*rp)->rt_src;
2782 		if (!flp->fl4_dst)
2783 			flp->fl4_dst = (*rp)->rt_dst;
2784 		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2785 				    flags ? XFRM_LOOKUP_WAIT : 0);
2786 		if (err == -EREMOTE)
2787 			err = ipv4_dst_blackhole(net, rp, flp);
2788 
2789 		return err;
2790 	}
2791 
2792 	return 0;
2793 }
2794 
2795 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2796 
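/*
 * Convenience wrapper: route by flow key only, with no socket or xfrm
 * wait flags.  An illustrative call (not taken from any real caller;
 * "dip" is just a placeholder destination) looks roughly like:
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dip } } };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(net, &rt, &fl) == 0) {
 *		... use rt->u.dst ...
 *		ip_rt_put(rt);
 *	}
 */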
2797 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2798 {
2799 	return ip_route_output_flow(net, rp, flp, NULL, 0);
2800 }
2801 
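/*
 * Translate a cached route into an RTM_NEWROUTE netlink message,
 * including gateway, metrics, peer information and (for input routes)
 * the incoming interface or multicast forwarding data.
 */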
2802 static int rt_fill_info(struct net *net,
2803 			struct sk_buff *skb, u32 pid, u32 seq, int event,
2804 			int nowait, unsigned int flags)
2805 {
2806 	struct rtable *rt = skb_rtable(skb);
2807 	struct rtmsg *r;
2808 	struct nlmsghdr *nlh;
2809 	long expires;
2810 	u32 id = 0, ts = 0, tsage = 0, error;
2811 
2812 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2813 	if (nlh == NULL)
2814 		return -EMSGSIZE;
2815 
2816 	r = nlmsg_data(nlh);
2817 	r->rtm_family	 = AF_INET;
2818 	r->rtm_dst_len	= 32;
2819 	r->rtm_src_len	= 0;
2820 	r->rtm_tos	= rt->fl.fl4_tos;
2821 	r->rtm_table	= RT_TABLE_MAIN;
2822 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2823 	r->rtm_type	= rt->rt_type;
2824 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2825 	r->rtm_protocol = RTPROT_UNSPEC;
2826 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2827 	if (rt->rt_flags & RTCF_NOTIFY)
2828 		r->rtm_flags |= RTM_F_NOTIFY;
2829 
2830 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2831 
2832 	if (rt->fl.fl4_src) {
2833 		r->rtm_src_len = 32;
2834 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2835 	}
2836 	if (rt->u.dst.dev)
2837 		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2838 #ifdef CONFIG_NET_CLS_ROUTE
2839 	if (rt->u.dst.tclassid)
2840 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2841 #endif
2842 	if (rt->fl.iif)
2843 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2844 	else if (rt->rt_src != rt->fl.fl4_src)
2845 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2846 
2847 	if (rt->rt_dst != rt->rt_gateway)
2848 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2849 
2850 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2851 		goto nla_put_failure;
2852 
2853 	error = rt->u.dst.error;
2854 	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2855 	if (rt->peer) {
2856 		id = rt->peer->ip_id_count;
2857 		if (rt->peer->tcp_ts_stamp) {
2858 			ts = rt->peer->tcp_ts;
2859 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2860 		}
2861 	}
2862 
2863 	if (rt->fl.iif) {
2864 #ifdef CONFIG_IP_MROUTE
2865 		__be32 dst = rt->rt_dst;
2866 
2867 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2868 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2869 			int err = ipmr_get_route(net, skb, r, nowait);
2870 			if (err <= 0) {
2871 				if (!nowait) {
2872 					if (err == 0)
2873 						return 0;
2874 					goto nla_put_failure;
2875 				} else {
2876 					if (err == -EMSGSIZE)
2877 						goto nla_put_failure;
2878 					error = err;
2879 				}
2880 			}
2881 		} else
2882 #endif
2883 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2884 	}
2885 
2886 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2887 			       expires, error) < 0)
2888 		goto nla_put_failure;
2889 
2890 	return nlmsg_end(skb, nlh);
2891 
2892 nla_put_failure:
2893 	nlmsg_cancel(skb, nlh);
2894 	return -EMSGSIZE;
2895 }
2896 
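/*
 * RTM_GETROUTE handler: resolve the route described by the netlink
 * request (via ip_route_input() when an input interface is given,
 * otherwise via ip_route_output_key()) and unicast the result back to
 * the requester as an RTM_NEWROUTE message.
 */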
2897 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2898 {
2899 	struct net *net = sock_net(in_skb->sk);
2900 	struct rtmsg *rtm;
2901 	struct nlattr *tb[RTA_MAX+1];
2902 	struct rtable *rt = NULL;
2903 	__be32 dst = 0;
2904 	__be32 src = 0;
2905 	u32 iif;
2906 	int err;
2907 	struct sk_buff *skb;
2908 
2909 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2910 	if (err < 0)
2911 		goto errout;
2912 
2913 	rtm = nlmsg_data(nlh);
2914 
2915 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2916 	if (skb == NULL) {
2917 		err = -ENOBUFS;
2918 		goto errout;
2919 	}
2920 
2921 	/* Reserve room for dummy headers; this skb can pass
2922 	   through a good chunk of the routing engine.
2923 	 */
2924 	skb_reset_mac_header(skb);
2925 	skb_reset_network_header(skb);
2926 
2927 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2928 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2929 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2930 
2931 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2932 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2933 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2934 
2935 	if (iif) {
2936 		struct net_device *dev;
2937 
2938 		dev = __dev_get_by_index(net, iif);
2939 		if (dev == NULL) {
2940 			err = -ENODEV;
2941 			goto errout_free;
2942 		}
2943 
2944 		skb->protocol	= htons(ETH_P_IP);
2945 		skb->dev	= dev;
2946 		local_bh_disable();
2947 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2948 		local_bh_enable();
2949 
2950 		rt = skb_rtable(skb);
2951 		if (err == 0 && rt->u.dst.error)
2952 			err = -rt->u.dst.error;
2953 	} else {
2954 		struct flowi fl = {
2955 			.nl_u = {
2956 				.ip4_u = {
2957 					.daddr = dst,
2958 					.saddr = src,
2959 					.tos = rtm->rtm_tos,
2960 				},
2961 			},
2962 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2963 		};
2964 		err = ip_route_output_key(net, &rt, &fl);
2965 	}
2966 
2967 	if (err)
2968 		goto errout_free;
2969 
2970 	skb_dst_set(skb, &rt->u.dst);
2971 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2972 		rt->rt_flags |= RTCF_NOTIFY;
2973 
2974 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2975 			   RTM_NEWROUTE, 0, 0);
2976 	if (err <= 0)
2977 		goto errout_free;
2978 
2979 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2980 errout:
2981 	return err;
2982 
2983 errout_free:
2984 	kfree_skb(skb);
2985 	goto errout;
2986 }
2987 
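/*
 * Dump the contents of the route cache over netlink, resuming from the
 * bucket/index stored in the callback args on each invocation.
 */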
2988 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2989 {
2990 	struct rtable *rt;
2991 	int h, s_h;
2992 	int idx, s_idx;
2993 	struct net *net;
2994 
2995 	net = sock_net(skb->sk);
2996 
2997 	s_h = cb->args[0];
2998 	if (s_h < 0)
2999 		s_h = 0;
3000 	s_idx = idx = cb->args[1];
3001 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3002 		if (!rt_hash_table[h].chain)
3003 			continue;
3004 		rcu_read_lock_bh();
3005 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
3006 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
3007 			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3008 				continue;
3009 			if (rt_is_expired(rt))
3010 				continue;
3011 			skb_dst_set(skb, dst_clone(&rt->u.dst));
3012 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3013 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3014 					 1, NLM_F_MULTI) <= 0) {
3015 				skb_dst_drop(skb);
3016 				rcu_read_unlock_bh();
3017 				goto done;
3018 			}
3019 			skb_dst_drop(skb);
3020 		}
3021 		rcu_read_unlock_bh();
3022 	}
3023 
3024 done:
3025 	cb->args[0] = h;
3026 	cb->args[1] = idx;
3027 	return skb->len;
3028 }
3029 
3030 void ip_rt_multicast_event(struct in_device *in_dev)
3031 {
3032 	rt_cache_flush(dev_net(in_dev->dev), 0);
3033 }
3034 
3035 #ifdef CONFIG_SYSCTL
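/*
 * "flush" sysctl handler: the integer written to the file is passed to
 * rt_cache_flush() for the owning namespace as the flush delay.
 * Reads are rejected with -EINVAL.
 */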
3036 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3037 					struct file *filp, void __user *buffer,
3038 					size_t *lenp, loff_t *ppos)
3039 {
3040 	if (write) {
3041 		int flush_delay;
3042 		ctl_table ctl;
3043 		struct net *net;
3044 
3045 		memcpy(&ctl, __ctl, sizeof(ctl));
3046 		ctl.data = &flush_delay;
3047 		proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
3048 
3049 		net = (struct net *)__ctl->extra1;
3050 		rt_cache_flush(net, flush_delay);
3051 		return 0;
3052 	}
3053 
3054 	return -EINVAL;
3055 }
3056 
3057 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
3058 						void __user *oldval,
3059 						size_t __user *oldlenp,
3060 						void __user *newval,
3061 						size_t newlen)
3062 {
3063 	int delay;
3064 	struct net *net;
3065 	if (newlen != sizeof(int))
3066 		return -EINVAL;
3067 	if (get_user(delay, (int __user *)newval))
3068 		return -EFAULT;
3069 	net = (struct net *)table->extra1;
3070 	rt_cache_flush(net, delay);
3071 	return 0;
3072 }
3073 
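/*
 * The hash secret interval was changed via sysctl; re-arm (or stop) the
 * per-namespace rt_secret_timer of every network namespace accordingly.
 */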
3074 static void rt_secret_reschedule(int old)
3075 {
3076 	struct net *net;
3077 	int new = ip_rt_secret_interval;
3078 	int diff = new - old;
3079 
3080 	if (!diff)
3081 		return;
3082 
3083 	rtnl_lock();
3084 	for_each_net(net) {
3085 		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3086 
3087 		if (!new)
3088 			continue;
3089 
3090 		if (deleted) {
3091 			long time = net->ipv4.rt_secret_timer.expires - jiffies;
3092 
3093 			if (time <= 0 || (time += diff) <= 0)
3094 				time = 0;
3095 
3096 			net->ipv4.rt_secret_timer.expires = time;
3097 		} else
3098 			net->ipv4.rt_secret_timer.expires = new;
3099 
3100 		net->ipv4.rt_secret_timer.expires += jiffies;
3101 		add_timer(&net->ipv4.rt_secret_timer);
3102 	}
3103 	rtnl_unlock();
3104 }
3105 
3106 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3107 					  struct file *filp,
3108 					  void __user *buffer, size_t *lenp,
3109 					  loff_t *ppos)
3110 {
3111 	int old = ip_rt_secret_interval;
3112 	int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
3113 
3114 	rt_secret_reschedule(old);
3115 
3116 	return ret;
3117 }
3118 
3119 static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
3120 						   void __user *oldval,
3121 						   size_t __user *oldlenp,
3122 						   void __user *newval,
3123 						   size_t newlen)
3124 {
3125 	int old = ip_rt_secret_interval;
3126 	int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
3127 
3128 	rt_secret_reschedule(old);
3129 
3130 	return ret;
3131 }
3132 
3133 static ctl_table ipv4_route_table[] = {
3134 	{
3135 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
3136 		.procname	= "gc_thresh",
3137 		.data		= &ipv4_dst_ops.gc_thresh,
3138 		.maxlen		= sizeof(int),
3139 		.mode		= 0644,
3140 		.proc_handler	= proc_dointvec,
3141 	},
3142 	{
3143 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
3144 		.procname	= "max_size",
3145 		.data		= &ip_rt_max_size,
3146 		.maxlen		= sizeof(int),
3147 		.mode		= 0644,
3148 		.proc_handler	= proc_dointvec,
3149 	},
3150 	{
3151 		/*  Deprecated. Use gc_min_interval_ms */
3152 
3153 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3154 		.procname	= "gc_min_interval",
3155 		.data		= &ip_rt_gc_min_interval,
3156 		.maxlen		= sizeof(int),
3157 		.mode		= 0644,
3158 		.proc_handler	= proc_dointvec_jiffies,
3159 		.strategy	= sysctl_jiffies,
3160 	},
3161 	{
3162 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3163 		.procname	= "gc_min_interval_ms",
3164 		.data		= &ip_rt_gc_min_interval,
3165 		.maxlen		= sizeof(int),
3166 		.mode		= 0644,
3167 		.proc_handler	= proc_dointvec_ms_jiffies,
3168 		.strategy	= sysctl_ms_jiffies,
3169 	},
3170 	{
3171 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
3172 		.procname	= "gc_timeout",
3173 		.data		= &ip_rt_gc_timeout,
3174 		.maxlen		= sizeof(int),
3175 		.mode		= 0644,
3176 		.proc_handler	= proc_dointvec_jiffies,
3177 		.strategy	= sysctl_jiffies,
3178 	},
3179 	{
3180 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
3181 		.procname	= "gc_interval",
3182 		.data		= &ip_rt_gc_interval,
3183 		.maxlen		= sizeof(int),
3184 		.mode		= 0644,
3185 		.proc_handler	= proc_dointvec_jiffies,
3186 		.strategy	= sysctl_jiffies,
3187 	},
3188 	{
3189 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
3190 		.procname	= "redirect_load",
3191 		.data		= &ip_rt_redirect_load,
3192 		.maxlen		= sizeof(int),
3193 		.mode		= 0644,
3194 		.proc_handler	= proc_dointvec,
3195 	},
3196 	{
3197 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
3198 		.procname	= "redirect_number",
3199 		.data		= &ip_rt_redirect_number,
3200 		.maxlen		= sizeof(int),
3201 		.mode		= 0644,
3202 		.proc_handler	= proc_dointvec,
3203 	},
3204 	{
3205 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
3206 		.procname	= "redirect_silence",
3207 		.data		= &ip_rt_redirect_silence,
3208 		.maxlen		= sizeof(int),
3209 		.mode		= 0644,
3210 		.proc_handler	= proc_dointvec,
3211 	},
3212 	{
3213 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
3214 		.procname	= "error_cost",
3215 		.data		= &ip_rt_error_cost,
3216 		.maxlen		= sizeof(int),
3217 		.mode		= 0644,
3218 		.proc_handler	= proc_dointvec,
3219 	},
3220 	{
3221 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
3222 		.procname	= "error_burst",
3223 		.data		= &ip_rt_error_burst,
3224 		.maxlen		= sizeof(int),
3225 		.mode		= 0644,
3226 		.proc_handler	= proc_dointvec,
3227 	},
3228 	{
3229 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
3230 		.procname	= "gc_elasticity",
3231 		.data		= &ip_rt_gc_elasticity,
3232 		.maxlen		= sizeof(int),
3233 		.mode		= 0644,
3234 		.proc_handler	= proc_dointvec,
3235 	},
3236 	{
3237 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
3238 		.procname	= "mtu_expires",
3239 		.data		= &ip_rt_mtu_expires,
3240 		.maxlen		= sizeof(int),
3241 		.mode		= 0644,
3242 		.proc_handler	= proc_dointvec_jiffies,
3243 		.strategy	= sysctl_jiffies,
3244 	},
3245 	{
3246 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
3247 		.procname	= "min_pmtu",
3248 		.data		= &ip_rt_min_pmtu,
3249 		.maxlen		= sizeof(int),
3250 		.mode		= 0644,
3251 		.proc_handler	= proc_dointvec,
3252 	},
3253 	{
3254 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
3255 		.procname	= "min_adv_mss",
3256 		.data		= &ip_rt_min_advmss,
3257 		.maxlen		= sizeof(int),
3258 		.mode		= 0644,
3259 		.proc_handler	= proc_dointvec,
3260 	},
3261 	{
3262 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3263 		.procname	= "secret_interval",
3264 		.data		= &ip_rt_secret_interval,
3265 		.maxlen		= sizeof(int),
3266 		.mode		= 0644,
3267 		.proc_handler	= ipv4_sysctl_rt_secret_interval,
3268 		.strategy	= ipv4_sysctl_rt_secret_interval_strategy,
3269 	},
3270 	{ .ctl_name = 0 }
3271 };
3272 
3273 static struct ctl_table empty[1];
3274 
3275 static struct ctl_table ipv4_skeleton[] =
3276 {
3277 	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE,
3278 	  .mode = 0555, .child = ipv4_route_table},
3279 	{ .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
3280 	  .mode = 0555, .child = empty},
3281 	{ }
3282 };
3283 
3284 static __net_initdata struct ctl_path ipv4_path[] = {
3285 	{ .procname = "net", .ctl_name = CTL_NET, },
3286 	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
3287 	{ },
3288 };
3289 
3290 static struct ctl_table ipv4_route_flush_table[] = {
3291 	{
3292 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
3293 		.procname	= "flush",
3294 		.maxlen		= sizeof(int),
3295 		.mode		= 0200,
3296 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3297 		.strategy	= ipv4_sysctl_rtcache_flush_strategy,
3298 	},
3299 	{ .ctl_name = 0 },
3300 };
3301 
3302 static __net_initdata struct ctl_path ipv4_route_path[] = {
3303 	{ .procname = "net", .ctl_name = CTL_NET, },
3304 	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
3305 	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3306 	{ },
3307 };
3308 
3309 static __net_init int sysctl_route_net_init(struct net *net)
3310 {
3311 	struct ctl_table *tbl;
3312 
3313 	tbl = ipv4_route_flush_table;
3314 	if (net != &init_net) {
3315 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3316 		if (tbl == NULL)
3317 			goto err_dup;
3318 	}
3319 	tbl[0].extra1 = net;
3320 
3321 	net->ipv4.route_hdr =
3322 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3323 	if (net->ipv4.route_hdr == NULL)
3324 		goto err_reg;
3325 	return 0;
3326 
3327 err_reg:
3328 	if (tbl != ipv4_route_flush_table)
3329 		kfree(tbl);
3330 err_dup:
3331 	return -ENOMEM;
3332 }
3333 
3334 static __net_exit void sysctl_route_net_exit(struct net *net)
3335 {
3336 	struct ctl_table *tbl;
3337 
3338 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3339 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3340 	BUG_ON(tbl == ipv4_route_flush_table);
3341 	kfree(tbl);
3342 }
3343 
3344 static __net_initdata struct pernet_operations sysctl_route_ops = {
3345 	.init = sysctl_route_net_init,
3346 	.exit = sysctl_route_net_exit,
3347 };
3348 #endif
3349 
3350 
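/*
 * Per-namespace setup: seed the route cache generation id and arm the
 * rt_secret_timer that periodically invalidates cached routes.
 */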
3351 static __net_init int rt_secret_timer_init(struct net *net)
3352 {
3353 	atomic_set(&net->ipv4.rt_genid,
3354 			(int) ((num_physpages ^ (num_physpages>>8)) ^
3355 			(jiffies ^ (jiffies >> 7))));
3356 
3357 	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3358 	net->ipv4.rt_secret_timer.data = (unsigned long)net;
3359 	init_timer_deferrable(&net->ipv4.rt_secret_timer);
3360 
3361 	if (ip_rt_secret_interval) {
3362 		net->ipv4.rt_secret_timer.expires =
3363 			jiffies + net_random() % ip_rt_secret_interval +
3364 			ip_rt_secret_interval;
3365 		add_timer(&net->ipv4.rt_secret_timer);
3366 	}
3367 	return 0;
3368 }
3369 
3370 static __net_exit void rt_secret_timer_exit(struct net *net)
3371 {
3372 	del_timer_sync(&net->ipv4.rt_secret_timer);
3373 }
3374 
3375 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3376 	.init = rt_secret_timer_init,
3377 	.exit = rt_secret_timer_exit,
3378 };
3379 
3380 
3381 #ifdef CONFIG_NET_CLS_ROUTE
3382 struct ip_rt_acct *ip_rt_acct __read_mostly;
3383 #endif /* CONFIG_NET_CLS_ROUTE */
3384 
3385 static __initdata unsigned long rhash_entries;
3386 static int __init set_rhash_entries(char *str)
3387 {
3388 	if (!str)
3389 		return 0;
3390 	rhash_entries = simple_strtoul(str, &str, 0);
3391 	return 1;
3392 }
3393 __setup("rhash_entries=", set_rhash_entries);
3394 
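/*
 * Boot-time initialisation of the IPv4 routing subsystem: allocate the
 * dst slab cache and the route hash table, initialise devinet and the
 * FIB, start the periodic cache maintenance work and the per-namespace
 * secret rehash timers, and register proc, netlink and sysctl hooks.
 */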
3395 int __init ip_rt_init(void)
3396 {
3397 	int rc = 0;
3398 
3399 #ifdef CONFIG_NET_CLS_ROUTE
3400 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3401 	if (!ip_rt_acct)
3402 		panic("IP: failed to allocate ip_rt_acct\n");
3403 #endif
3404 
3405 	ipv4_dst_ops.kmem_cachep =
3406 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3407 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3408 
3409 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3410 
3411 	rt_hash_table = (struct rt_hash_bucket *)
3412 		alloc_large_system_hash("IP route cache",
3413 					sizeof(struct rt_hash_bucket),
3414 					rhash_entries,
3415 					(num_physpages >= 128 * 1024) ?
3416 					15 : 17,
3417 					0,
3418 					&rt_hash_log,
3419 					&rt_hash_mask,
3420 					rhash_entries ? 0 : 512 * 1024);
3421 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3422 	rt_hash_lock_init();
3423 
3424 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3425 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3426 
3427 	devinet_init();
3428 	ip_fib_init();
3429 
3430 	/* All the timers started at system startup tend
3431 	   to synchronize. Perturb them a bit.
3432 	 */
3433 	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3434 	expires_ljiffies = jiffies;
3435 	schedule_delayed_work(&expires_work,
3436 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3437 
3438 	if (register_pernet_subsys(&rt_secret_timer_ops))
3439 		printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3440 
3441 	if (ip_rt_proc_init())
3442 		printk(KERN_ERR "Unable to create route proc files\n");
3443 #ifdef CONFIG_XFRM
3444 	xfrm_init();
3445 	xfrm4_init();
3446 #endif
3447 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3448 
3449 #ifdef CONFIG_SYSCTL
3450 	register_pernet_subsys(&sysctl_route_ops);
3451 #endif
3452 	return rc;
3453 }
3454 
3455 #ifdef CONFIG_SYSCTL
3456 /*
3457  * We really need to sanitize the damn ipv4 init order, then all
3458  * this nonsense will go away.
3459  */
3460 void __init ip_static_sysctl_init(void)
3461 {
3462 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3463 }
3464 #endif
3465 
3466 EXPORT_SYMBOL(__ip_select_ident);
3467 EXPORT_SYMBOL(ip_route_input);
3468 EXPORT_SYMBOL(ip_route_output_key);
3469