xref: /linux/net/ipv4/route.c (revision 800c5eb7b5eba6cb2a32738d763fd59f0fbcdde4)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <linux/prefetch.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/secure_seq.h>
113 
/*
 * RT_FL_TOS() masks a flow's TOS field down to the bits that matter for
 * routing (IPTOS_RT_MASK) plus the RTO_ONLINK flag.
 */
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Upper clamp applied to route MTU values. */
#define IP_MAX_MTU	0xFFF0

/* Default expiration timeout for unreferenced cache entries. */
#define RT_GC_TIMEOUT (300*HZ)

/* Route-cache tuning knobs; presumably exported as net.ipv4.route.*
 * sysctls — confirm against the sysctl table elsewhere in this file. */
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
/* Dynamic cap on hash-chain length; recomputed in rt_check_expire(). */
static int rt_chain_length_max __read_mostly	= 20;
/* Bumped on every cache invalidation; see rt_cache_invalidate(). */
static int redirect_genid;

/* Periodic expiry scan; armed by rt_worker_func(). */
static struct delayed_work expires_work;
/* jiffies timestamp of the previous rt_check_expire() pass. */
static unsigned long expires_ljiffies;
139 
140 /*
141  *	Interface to generic destination cache.
142  */
143 
144 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
145 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
146 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
147 static void		 ipv4_dst_destroy(struct dst_entry *dst);
148 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
149 static void		 ipv4_link_failure(struct sk_buff *skb);
150 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
151 static int rt_garbage_collect(struct dst_ops *ops);
152 
/* dst_ops->ifdown hook: intentionally empty — IPv4 has nothing
 * per-route to do here beyond the generic dst handling. */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
157 
/*
 * Copy-on-write metrics for an IPv4 route.
 *
 * Invoked when somebody needs to modify @dst's (shared, read-only)
 * metrics.  The writable array lives in the route's inet_peer: seed it
 * from the old array if it is brand new, then install it with cmpxchg()
 * so concurrent callers race safely.
 *
 * Returns a pointer to writable metrics, or NULL if no peer could be
 * bound or a read-only array won the install race.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	/* Lazily bind an inet_peer; it owns the writable metrics. */
	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		/* First user of this peer's metrics: copy current values. */
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			/* Lost the race: use whatever won, unless the
			 * winner is read-only — then report failure. */
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			/* We installed peer metrics; the fib_info that
			 * backed the old read-only array can be dropped. */
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}
192 
193 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
194 
/* Operations vector wiring the generic dst cache to IPv4 routing. */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
211 
/* Entries at odd indices (TOS low bit set) currently map to the same
 * band as their even neighbour; the macro keeps the table readable. */
#define ECN_OR_COST(class)	TC_PRIO_##class

/* Map the IPv4 TOS nibble to a traffic-control priority band
 * (TC_PRIO_*); indexed by the TOS bits — see callers for the exact
 * shift applied before lookup. */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
232 
233 
234 /*
235  * Route cache.
236  */
237 
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

/* One bucket of the central route-cache hash: an RCU-protected chain
 * of rtable entries linked through dst.rt_next. */
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};
251 
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
/* Map a bucket index to its (shared) spinlock. */
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

/* Allocate and initialize the bucket-lock table; boot-time only,
 * so allocation failure is fatal. */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* UP without lock debugging: bucket locks compile away entirely. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
297 
/* The central route-cache hash table, its mask (size - 1) and log2 of
 * its size. */
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

/* Per-cpu route-cache statistics (see /proc/net/stat/rt_cache). */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

/* Hash (daddr, saddr, ifindex) to a bucket index.  The generation id
 * is mixed into the hash, so bumping it on a flush redistributes
 * lookups away from stale entries. */
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
312 
/* Current route-cache generation for @net; entries carrying an older
 * rt_genid are treated as expired (see rt_is_expired()). */
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
317 
318 #ifdef CONFIG_PROC_FS
/* Iterator state for the /proc/net/rt_cache seq_file walk. */
struct rt_cache_iter_state {
	struct seq_net_private p;	/* netns of the opener */
	int bucket;			/* current hash bucket (walked downward) */
	int genid;			/* generation snapshot taken at ->start */
};
324 
/*
 * Find the first cache entry visible to this seq file: walk buckets
 * from the top, skipping entries from other netns or generations.
 *
 * NOTE: on success this returns with rcu_read_lock_bh() HELD; it is
 * released by __rt_cache_get_next() when a bucket is exhausted or by
 * rt_cache_seq_stop().
 */
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		/* Cheap unlocked peek before taking the RCU read lock. */
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}
345 
/*
 * Advance to the next entry, crossing bucket boundaries as needed.
 * Called with rcu_read_lock_bh() held (from rt_cache_get_first()); the
 * lock is dropped while scanning for the next non-empty bucket and
 * re-taken before dereferencing it.  No netns/genid filtering here —
 * that is the caller's job (rt_cache_get_next()).
 */
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}
363 
364 static struct rtable *rt_cache_get_next(struct seq_file *seq,
365 					struct rtable *r)
366 {
367 	struct rt_cache_iter_state *st = seq->private;
368 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
369 		if (dev_net(r->dst.dev) != seq_file_net(seq))
370 			continue;
371 		if (r->rt_genid == st->genid)
372 			break;
373 	}
374 	return r;
375 }
376 
377 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
378 {
379 	struct rtable *r = rt_cache_get_first(seq);
380 
381 	if (r)
382 		while (pos && (r = rt_cache_get_next(seq, r)))
383 			--pos;
384 	return pos ? NULL : r;
385 }
386 
/* seq_file ->start: position 0 yields the header token; otherwise seek
 * to entry pos-1.  The generation is snapshotted only at position 0,
 * i.e. at the beginning of a fresh walk. */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
395 
396 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
397 {
398 	struct rtable *r;
399 
400 	if (v == SEQ_START_TOKEN)
401 		r = rt_cache_get_first(seq);
402 	else
403 		r = rt_cache_get_next(seq, v);
404 	++*pos;
405 	return r;
406 }
407 
/* seq_file ->stop: drop the BH-RCU lock left held by the get-first/
 * get-next helpers — but only if we were actually inside a chain. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
413 
/*
 * seq_file ->show: emit the /proc/net/rt_cache header line or one
 * fixed-width (127-char) row describing a cache entry.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		/* Sample the neighbour's reachability under RCU. */
		rcu_read_lock();
		n = dst_get_neighbour_noref(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		/* %n records the row width so it can be padded to 127
		 * columns below, keeping the legacy fixed-width format. */
		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,		/* HHRef: no longer tracked */
			HHUptod,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
451 
/* seq_file plumbing for /proc/net/rt_cache. */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

/* open(): allocate per-reader iterator state (netns-aware). */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
472 
473 
/* seq_file ->start for per-cpu stats: position 0 is the header token;
 * position n maps to the n-th possible CPU's stats (note the *pos-1
 * offset to account for the header). */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
489 
490 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
491 {
492 	int cpu;
493 
494 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
495 		if (!cpu_possible(cpu))
496 			continue;
497 		*pos = cpu+1;
498 		return &per_cpu(rt_cache_stat, cpu);
499 	}
500 	return NULL;
501 
502 }
503 
/* Nothing to release: the per-cpu walk takes no locks. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
508 
/* Emit the stats header, or one row of one CPU's counters.  The first
 * column is the global entry count, repeated on every row. */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
542 
/* seq_file plumbing for /proc/net/stat/rt_cache. */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
563 
564 #ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * /proc/net/rt_acct: sum the per-cpu routing-class accounting counters
 * into a temporary 256-entry table and dump it as raw binary.
 * Returns 0 on success, -ENOMEM if the scratch table cannot be
 * allocated.
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	/* Accumulate every possible CPU's counters per class. */
	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
588 
/* single_open wrapper for the one-shot rt_acct dump above. */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
601 #endif
602 
/*
 * Per-netns proc registration: /proc/net/rt_cache (cache dump),
 * /proc/net/stat/rt_cache (per-cpu stats) and, with classid support,
 * /proc/net/rt_acct.  Unwinds already-registered entries on failure
 * and returns -ENOMEM.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	/* Same name, different directory: this one lives in
	 * /proc/net/stat/. */
	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
633 
/* Per-netns teardown: mirror of ip_rt_do_proc_init(). */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
642 
/* Register the proc files in every network namespace. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
652 
653 #else
/* procfs disabled: nothing to register. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
658 #endif /* CONFIG_PROC_FS */
659 
/* Schedule RCU-deferred freeing of an (already unlinked) cache entry;
 * readers in BH-RCU sections may still hold it until the grace period
 * ends. */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
664 
/* Drop the caller's reference, then free the entry via the same
 * RCU-deferred path as rt_free(). */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
670 
/* True if this entry is a cheap eviction candidate. */
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}
678 
/* True if this entry carries state worth preserving: a redirect/notify
 * flag, or a peer with pending PMTU-expiry state. */
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}
684 
685 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
686 {
687 	unsigned long age;
688 	int ret = 0;
689 
690 	if (atomic_read(&rth->dst.__refcnt))
691 		goto out;
692 
693 	age = jiffies - rth->dst.lastuse;
694 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
695 	    (age <= tmo2 && rt_valuable(rth)))
696 		goto out;
697 	ret = 1;
698 out:	return ret;
699 }
700 
701 /* Bits of score are:
702  * 31: very valuable
703  * 30: not quite useless
704  * 29..0: usage counter
705  */
706 static inline u32 rt_score(struct rtable *rt)
707 {
708 	u32 score = jiffies - rt->dst.lastuse;
709 
710 	score = ~score & ~(3<<30);
711 
712 	if (rt_valuable(rt))
713 		score |= (1<<31);
714 
715 	if (rt_is_output_route(rt) ||
716 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
717 		score |= (1<<30);
718 
719 	return score;
720 }
721 
/* True while caching is enabled for @net; caching is abandoned once
 * emergency rebuilds exceed the configured rebuild count. */
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
727 
/* Compare only the fields that feed rt_hash(): daddr, saddr, iif.
 * XOR/OR trick yields branch-free equality of all three at once. */
static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}
735 
/* Full lookup-key equality: hash inputs plus mark, TOS and oif. */
static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}
745 
/* True if both entries belong to the same network namespace. */
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}
750 
/* True if the entry predates the current cache generation of its
 * netns (i.e. a flush has happened since it was created). */
static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
755 
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 *
 * Entries matching @net (or all entries when @net is NULL) are
 * unlinked onto a private list under the bucket lock, then freed
 * after the lock is dropped.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		/* Unlocked peek: skip empty buckets cheaply. */
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				/* Unlink and push onto the private list. */
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		/* Free outside the lock; entries are ours exclusively now. */
		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
806 
807 /*
808  * While freeing expired entries, we compute average chain length
809  * and standard deviation, using fixed-point arithmetic.
810  * This is to get an estimation of rt_chain_length_max
811  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
812  * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
813  */
814 
/* Fixed-point format used by the chain-length statistics. */
#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		/* Walk with pprev-free dereference: caller guarantees
		 * the chain is stable (lock held or unlinked). */
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}
836 
/*
 * Incremental expiry scan, run periodically from rt_worker_func().
 * Scans a slice of the hash table proportional to the time elapsed
 * since the last pass (so the whole table is covered roughly once per
 * ip_rt_gc_timeout), frees aged-off entries, and recomputes
 * rt_chain_length_max from the measured chain-length average and
 * standard deviation (fixed-point, see FRACT_BITS).
 */
static void rt_check_expire(void)
{
	static unsigned int rover;	/* resume point between passes */
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	/* goal = buckets_total * delta / gc_timeout, capped at the
	 * whole table. */
	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				/* Stale generation: unlink and free. */
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					/* Keep it; deeper entries get a
					 * progressively shorter grace. */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		/* rt_chain_length_max = max(elasticity, avg + 4*sd) */
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
917 
/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table,
 * then re-arm ourselves to run again in ip_rt_gc_interval jiffies.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
927 
/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	/* Also invalidate cached redirect information globally. */
	redirect_genid++;
}
942 
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 *
 * The synchronous walk may sleep only in process context, hence the
 * !in_softirq() hint passed to rt_do_flush().
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}
953 
/* Flush previous cache invalidated entries from the cache
 * (no generation bump here — only the physical sweep). */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}
959 
/* Invoked when a hash chain exceeds rt_chain_length_max: log once
 * (ratelimited) and invalidate the whole cache. */
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
966 
/*
   Short description of GC goals.

   We want to build algorithm, which will keep routing cache
   at some equilibrium point, when number of aged off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expires is large enough to keep enough of warm entries,
   and when load increases it reduces to limit cache size.
 */

/*
 * dst_ops->gc hook.  Returns 0 when the cache is within bounds (or GC
 * was skipped), 1 on hard overflow (caller must fail the allocation).
 * All the static locals persist GC state across invocations.
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	/* Rate-limit GC unless the cache has hit its hard cap. */
	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		/* Sweep buckets round-robin starting after the last
		 * position, evicting until the goal is met. */
		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					/* Keep; halve the grace for the
					 * rest of this chain. */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halved.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	/* Goal met: relax the expiration strength again. */
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
1099 
1100 /*
1101  * Returns number of entries in a hash chain that have different hash_inputs
1102  */
1103 static int slow_chain_length(const struct rtable *head)
1104 {
1105 	int length = 0;
1106 	const struct rtable *rth = head;
1107 
1108 	while (rth) {
1109 		length += has_noalias(head, rth);
1110 		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1111 	}
1112 	return length >> FRACT_BITS;
1113 }
1114 
1115 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1116 {
1117 	static const __be32 inaddr_any = 0;
1118 	struct net_device *dev = dst->dev;
1119 	const __be32 *pkey = daddr;
1120 	const struct rtable *rt;
1121 	struct neighbour *n;
1122 
1123 	rt = (const struct rtable *) dst;
1124 
1125 	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1126 		pkey = &inaddr_any;
1127 	else if (rt->rt_gateway)
1128 		pkey = (const __be32 *) &rt->rt_gateway;
1129 
1130 	n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
1131 	if (n)
1132 		return n;
1133 	return neigh_create(&arp_tbl, pkey, dev);
1134 }
1135 
1136 static int rt_bind_neighbour(struct rtable *rt)
1137 {
1138 	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1139 	if (IS_ERR(n))
1140 		return PTR_ERR(n);
1141 	dst_set_neighbour(&rt->dst, n);
1142 
1143 	return 0;
1144 }
1145 
/* Insert @rt into hash bucket @hash, or return an existing equivalent
 * entry.  On success the returned route (which may be @rt itself or a
 * pre-existing match) is also attached to @skb when one is given.
 * Returns an ERR_PTR on neighbour-binding failure.  @ifindex is only
 * needed to re-hash after an emergency table rebuild.
 */
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32 		min_score;
	int		chain_length;
	/* Allow one GC retry on neighbour-table overflow, but only when we
	 * are in process context where the extra work is acceptable.
	 */
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller hold the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	/* Walk the bucket under its lock: reap expired entries, look for an
	 * exact key match, and remember the lowest-scoring unreferenced
	 * entry as an eviction candidate.
	 */
	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* Duplicate: drop the new entry, hand back the old. */
			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		/* Overlong chain with no evictable entry: suspect a hash
		 * attack and trigger an emergency rebuild with a new genid,
		 * then retry under the recomputed hash.
		 */
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				/* Force one maximally aggressive GC pass,
				 * then restore the tunables and retry.
				 */
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
1325 
/* Global generation counter for peer-derived route state (learned PMTU,
 * learned redirects).  Bumped whenever any peer's state changes; a route
 * whose rt_peer_genid lags behind must revalidate via ipv4_validate_peer().
 */
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

/* Snapshot the current peer generation counter. */
static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}
1332 
1333 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1334 {
1335 	struct inet_peer *peer;
1336 
1337 	peer = inet_getpeer_v4(daddr, create);
1338 
1339 	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1340 		inet_putpeer(peer);
1341 	else
1342 		rt->rt_peer_genid = rt_peer_genid();
1343 }
1344 
1345 /*
1346  * Peer allocation may fail only in serious out-of-memory conditions.  However
1347  * we still can generate some output.
1348  * Random ID selection looks a bit dangerous because we have no chances to
1349  * select ID being unique in a reasonable period of time.
1350  * But broken packet identifier may be better than no packet at all.
1351  */
1352 static void ip_select_fb_ident(struct iphdr *iph)
1353 {
1354 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1355 	static u32 ip_fallback_id;
1356 	u32 salt;
1357 
1358 	spin_lock_bh(&ip_fb_id_lock);
1359 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1360 	iph->id = htons(salt & 0xFFFF);
1361 	ip_fallback_id = salt;
1362 	spin_unlock_bh(&ip_fb_id_lock);
1363 }
1364 
/* Choose an IP identification value for @iph.  Prefers the per-peer ID
 * sequence (monotonic per destination) when a peer can be bound to the
 * route; otherwise falls back to the global secure generator.
 */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else if (!rt)
		/* Debug aid: callers should always supply a dst here. */
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
1387 
/* Unlink @rt from hash bucket @hash and drop the caller's reference.
 * While walking the chain under the bucket lock, also reaps any entries
 * that have expired (stale genid).
 */
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			/* Unlink and free via RCU-deferred rt_free(). */
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
1407 
/* Apply the gateway learned from an ICMP redirect (stored in @peer) to
 * the route, rebinding its neighbour entry.  If the new neighbour cannot
 * be created, the original gateway is restored and nothing changes.
 */
static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n)) {
		/* Could not resolve the new gateway: roll back. */
		rt->rt_gateway = orig_gw;
		return;
	}
	/* Atomically swap in the new neighbour; release the old one. */
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!(n->nud_state & NUD_VALID)) {
		/* Kick off address resolution for the new gateway. */
		neigh_event_send(n, NULL);
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
}
1433 
/* called in rcu_read_lock() section */
/* Process an incoming ICMP redirect: after sanity-checking the advised
 * gateway, record it in the inet_peer of every matching cached route
 * (output routes to @daddr via @old_gw on @dev) and switch them over.
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	/* Probe both the exact source/ifindex and the wildcard (0) keys. */
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	/* Reject obviously bogus advice: unchanged, disabled by sysctl,
	 * or a gateway that cannot be a unicast next hop.
	 */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				/* Skip anything that is not a live output
				 * route to daddr via old_gw on this device.
				 */
				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					/* Record the new gateway and bump the
					 * generation so other routes notice.
					 */
					if (peer->redirect_learned.a4 != new_gw ||
					    peer->redirect_genid != redirect_genid) {
						peer->redirect_learned.a4 = new_gw;
						peer->redirect_genid = redirect_genid;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
1516 
1517 static bool peer_pmtu_expired(struct inet_peer *peer)
1518 {
1519 	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1520 
1521 	return orig &&
1522 	       time_after_eq(jiffies, orig) &&
1523 	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1524 }
1525 
1526 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1527 {
1528 	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1529 
1530 	return orig &&
1531 	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1532 }
1533 
1534 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1535 {
1536 	struct rtable *rt = (struct rtable *)dst;
1537 	struct dst_entry *ret = dst;
1538 
1539 	if (rt) {
1540 		if (dst->obsolete > 0) {
1541 			ip_rt_put(rt);
1542 			ret = NULL;
1543 		} else if (rt->rt_flags & RTCF_REDIRECTED) {
1544 			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1545 						rt->rt_oif,
1546 						rt_genid(dev_net(dst->dev)));
1547 			rt_del(hash, rt);
1548 			ret = NULL;
1549 		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1550 			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1551 		}
1552 	}
1553 	return ret;
1554 }
1555 
1556 /*
1557  * Algorithm:
1558  *	1. The first ip_rt_redirect_number redirects are sent
1559  *	   with exponential backoff, then we stop sending them at all,
1560  *	   assuming that the host ignores our redirects.
1561  *	2. If we did not see packets requiring redirects
1562  *	   during ip_rt_redirect_silence, we assume that the host
1563  *	   forgot redirected route and start to send redirects again.
1564  *
1565  * This algorithm is much cheaper and more intelligent than dumb load limiting
1566  * in icmp.c.
1567  *
1568  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1569  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1570  */
1571 
/* Send an ICMP redirect for @skb, rate-limited per destination via the
 * inet_peer token scheme described in the algorithm comment above.
 * Without a peer we send unconditionally (no state to rate-limit on).
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	/* Capture the flag before dropping RCU; in_dev may go away. */
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
1630 
/* dst->input handler for routes carrying an error: translate the stored
 * errno into an ICMP destination-unreachable code and send it, rate
 * limited per peer by a simple token bucket.  Always consumes @skb.
 */
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		/* No ICMP equivalent: silently drop. */
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		/* Token bucket: tokens accrue with elapsed jiffies, capped
		 * at ip_rt_error_burst; each ICMP costs ip_rt_error_cost.
		 */
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
1678 
1679 /*
1680  *	The last two values are not from the RFC but
1681  *	are needed for AMPRnet AX.25 paths.
1682  */
1683 
static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/* Return the largest RFC 1191 plateau value strictly below @old_mtu,
 * or the IPv4 minimum of 68 when none fits.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	unsigned int i;
	const unsigned int n = sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

	for (i = 0; i < n; i++) {
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	}
	return 68;
}
1696 
/* Handle an ICMP "fragmentation needed" for @iph: compute a plausible
 * new path MTU (working around BSD 4.2-style broken senders), store it
 * in the destination's inet_peer with an expiry, and return the MTU we
 * settled on (or @new_mtu unchanged if nothing was learned).
 */
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			/* Fall back to the next plateau below old_mtu. */
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			/* 0 means "no expiry set", so avoid it. */
			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}
1739 
/* Sync @dst's MTU metric with the PMTU recorded in @peer: apply the
 * learned value while it is still fresh, or restore the original MTU
 * once the entry expires (the cmpxchg lets only one CPU do the restore).
 */
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			/* Remember the pre-clamp MTU so it can be restored. */
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}
1756 
/* dst_ops->update_pmtu: record a newly discovered path MTU in the
 * route's inet_peer (clamped to ip_rt_min_pmtu) and apply it to the
 * dst metrics via check_peer_pmtu().
 */
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			/* 0 means "no expiry", so never store it. */
			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			/* Invalidate cached peer state everywhere. */
			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}
1787 
1788 
/* Re-sync @rt with its inet_peer if the global peer generation moved
 * since this route last looked: refresh PMTU state and apply/clear any
 * learned redirect, then record the current generation.
 */
static void ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		/* Best-effort: do not create a peer just for validation. */
		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			/* Stale redirect generation: forget the gateway. */
			if (peer->redirect_genid != redirect_genid)
				peer->redirect_learned.a4 = 0;
			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway)
				check_peer_redir(&rt->dst, peer);
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
}
1811 
1812 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1813 {
1814 	struct rtable *rt = (struct rtable *) dst;
1815 
1816 	if (rt_is_expired(rt))
1817 		return NULL;
1818 	ipv4_validate_peer(rt);
1819 	return dst;
1820 }
1821 
1822 static void ipv4_dst_destroy(struct dst_entry *dst)
1823 {
1824 	struct rtable *rt = (struct rtable *) dst;
1825 	struct inet_peer *peer = rt->peer;
1826 
1827 	if (rt->fi) {
1828 		fib_info_put(rt->fi);
1829 		rt->fi = NULL;
1830 	}
1831 	if (peer) {
1832 		rt->peer = NULL;
1833 		inet_putpeer(peer);
1834 	}
1835 }
1836 
1837 
1838 static void ipv4_link_failure(struct sk_buff *skb)
1839 {
1840 	struct rtable *rt;
1841 
1842 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1843 
1844 	rt = skb_rtable(skb);
1845 	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1846 		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1847 }
1848 
1849 static int ip_rt_bug(struct sk_buff *skb)
1850 {
1851 	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1852 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1853 		skb->dev ? skb->dev->name : "?");
1854 	kfree_skb(skb);
1855 	WARN_ON(1);
1856 	return 0;
1857 }
1858 
1859 /*
1860    We do not cache source address of outgoing interface,
1861    because it is used only by IP RR, TS and SRR options,
   so that it is out of the fast path.
1863 
1864    BTW remember: "addr" is allowed to be not aligned
1865    in IP options!
1866  */
1867 
/* Copy the source address our interface would use for this flow into
 * @addr (unaligned-safe, hence the memcpy).  Output routes use the
 * packet's own saddr; input routes need a reverse FIB lookup, falling
 * back to a scope-based interface address.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		/* Reconstruct the flow key from the received packet. */
		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
1899 
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Fill in whichever 16-bit half of dst.tclassid is still unset from
 * @tag; halves that already carry a value are left untouched.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
}
#endif
1909 
1910 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1911 {
1912 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1913 
1914 	if (advmss == 0) {
1915 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1916 			       ip_rt_min_advmss);
1917 		if (advmss > 65535 - 40)
1918 			advmss = 65535 - 40;
1919 	}
1920 	return advmss;
1921 }
1922 
1923 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1924 {
1925 	const struct rtable *rt = (const struct rtable *) dst;
1926 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1927 
1928 	if (mtu && rt_is_output_route(rt))
1929 		return mtu;
1930 
1931 	mtu = dst->dev->mtu;
1932 
1933 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1934 
1935 		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1936 			mtu = 576;
1937 	}
1938 
1939 	if (mtu > IP_MAX_MTU)
1940 		mtu = IP_MAX_MTU;
1941 
1942 	return mtu;
1943 }
1944 
/* Wire up the route's metrics: prefer the per-destination inet_peer
 * copy (seeding it from the FIB on first use, and applying any learned
 * PMTU/redirect state), else reference the fib_info metrics directly.
 */
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		/* First user of this peer copies the FIB metrics in. */
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);
		if (peer->redirect_genid != redirect_genid)
			peer->redirect_learned.a4 = 0;
		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		/* Non-default metrics: pin the fib_info so they outlive
		 * the FIB entry while this route references them.
		 */
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}
1981 
/* Finish constructing @rt from a FIB result: set the gateway for
 * link-scope nexthops, hook up metrics, clamp MTU/ADVMSS to protocol
 * limits, and (when configured) apply routing-class tags.
 */
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	/* Sanity-clamp inherited metrics to IPv4 protocol limits. */
	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}
2010 
2011 static struct rtable *rt_dst_alloc(struct net_device *dev,
2012 				   bool nopolicy, bool noxfrm)
2013 {
2014 	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2015 			 DST_HOST |
2016 			 (nopolicy ? DST_NOPOLICY : 0) |
2017 			 (noxfrm ? DST_NOXFRM : 0));
2018 }
2019 
/* called in rcu_read_lock() section */
/* Build and cache an input route for a multicast packet received on
 * @dev.  @our indicates the local host is a group member, so the packet
 * gets local delivery in addition to any multicast forwarding.
 * Returns 0 on success or a negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	/* Multicast sources must be plain unicast, non-loopback IPv4. */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* 0.0.0.0 senders (e.g. DHCP) only for link-local groups. */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* Multicast routes must never be used for output. */
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
2099 
2100 
2101 static void ip_handle_martian_source(struct net_device *dev,
2102 				     struct in_device *in_dev,
2103 				     struct sk_buff *skb,
2104 				     __be32 daddr,
2105 				     __be32 saddr)
2106 {
2107 	RT_CACHE_STAT_INC(in_martian_src);
2108 #ifdef CONFIG_IP_ROUTE_VERBOSE
2109 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2110 		/*
2111 		 *	RFC1812 recommendation, if source is martian,
2112 		 *	the only hint is MAC header.
2113 		 */
2114 		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2115 			&daddr, &saddr, dev->name);
2116 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2117 			int i;
2118 			const unsigned char *p = skb_mac_header(skb);
2119 			printk(KERN_WARNING "ll header: ");
2120 			for (i = 0; i < dev->hard_header_len; i++, p++) {
2121 				printk("%02x", *p);
2122 				if (i < (dev->hard_header_len - 1))
2123 					printk(":");
2124 			}
2125 			printk("\n");
2126 		}
2127 	}
2128 #endif
2129 }
2130 
2131 /* called in rcu_read_lock() section */
/*
 * Build a routing cache entry for a packet being forwarded to the
 * nexthop chosen by the FIB lookup in @res.
 *
 * On success, *result points at the new (not yet hashed) rtable and 0 is
 * returned; on failure a negative errno is returned and *result is left
 * untouched.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	/* Reverse-path check; a failure means a martian source. */
	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* err > 0: source reachable, but through a different interface. */
	if (err)
		flags |= RTCF_DIRECTSRC;

	/* Packet would leave on the interface it arrived on: candidate
	 * for an ICMP redirect, but only on shared media or when the
	 * source is on-link with the selected gateway.
	 */
	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	/* Fill in both the cache lookup key (rt_key_*) and the result. */
	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif 	= in_dev->dev->ifindex;
	rth->rt_oif 	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	/* Forwarded path: input side forwards, output side transmits. */
	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
2223 
2224 static int ip_mkroute_input(struct sk_buff *skb,
2225 			    struct fib_result *res,
2226 			    const struct flowi4 *fl4,
2227 			    struct in_device *in_dev,
2228 			    __be32 daddr, __be32 saddr, u32 tos)
2229 {
2230 	struct rtable* rth = NULL;
2231 	int err;
2232 	unsigned hash;
2233 
2234 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2235 	if (res->fi && res->fi->fib_nhs > 1)
2236 		fib_select_multipath(res);
2237 #endif
2238 
2239 	/* create a routing cache entry */
2240 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2241 	if (err)
2242 		return err;
2243 
2244 	/* put it into the cache */
2245 	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2246 		       rt_genid(dev_net(rth->dst.dev)));
2247 	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2248 	if (IS_ERR(rth))
2249 		return PTR_ERR(rth);
2250 	return 0;
2251 }
2252 
2253 /*
2254  *	NOTE. We drop all the packets that has local source
2255  *	addresses, because every properly looped back packet
2256  *	must have correct destination already attached by output routine.
2257  *
2258  *	Such approach solves two big problems:
2259  *	1. Not simplex devices are handled properly.
2260  *	2. IP spoofing attempts are filtered with 100% of guarantee.
2261  *	called with rcu_read_lock()
2262  */
2263 
/*
 * Slow-path input route resolution: classify the packet (martian,
 * broadcast, local, forward), consult the FIB, and build/insert the
 * appropriate routing cache entry.  Returns 0 or a negative errno.
 * Called with rcu_read_lock() held (see comment above).
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net    * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		/* No route: host-unreachable for hosts, no_route for routers
		 * (routers still cache the unreachable result).
		 */
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		/* Locally destined: validate the source before delivery. */
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	/* Forwarding case: build and cache the entry. */
	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Local delivery (and RTN_UNREACHABLE from no_route) uses a
	 * loopback-backed cache entry.
	 */
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	/* NOTE(review): tclassid was already set just after rt_dst_alloc()
	 * above; this second assignment looks redundant — confirm before
	 * removing.
	 */
	rth->dst.tclassid = itag;
#endif
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		/* Cache the negative result; ip_error() will generate ICMP. */
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif
	/* fall through: martian destinations report -EHOSTUNREACH */

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
2444 
/*
 * Input route lookup entry point: try the route cache first, handle
 * multicast specially, and fall back to ip_route_input_slow().
 * @noref selects a refcount-less dst attach for the common fast path.
 * Returns 0 on success or a negative errno.
 */
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			   u8 tos, struct net_device *dev, bool noref)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	/* Walk the hash chain; the OR-of-XORs compares all key fields
	 * in one branch-free test.
	 */
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			/* Accept if we are a member of the group, or if
			 * multicast forwarding is enabled (non-local group).
			 */
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
2527 
2528 /* called with rcu_read_lock() */
/* called with rcu_read_lock()
 *
 * Build an output routing cache entry for the FIB result @res going out
 * @dev_out.  The orig_* arguments are the caller's pre-lookup key values
 * (stored in rt_key_* for later cache matching).  Returns the new rtable
 * or an ERR_PTR().
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, __u8 orig_rtos,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	/* A loopback source may only leave through a loopback device. */
	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	/* Destination address class overrides the FIB result type. */
	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* Not locally joined: no local delivery leg. */
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	/* Cache key is the caller's original (pre-lookup) tuple. */
	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= orig_rtos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark    = fl4->flowi4_mark;
	rth->rt_gateway = fl4->daddr;
	rth->rt_spec_dst= fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		/* Local + non-loopback: deliver locally AND transmit. */
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
2628 
2629 /*
2630  * Major route resolver routine.
2631  * called with rcu_read_lock();
2632  */
2633 
/*
 * Slow-path output route resolution: validate/complete the flow key
 * (source selection, oif resolution, special-case multicast/broadcast
 * and local destinations), consult the FIB, then build and cache the
 * entry via __mkroute_output().  Returns an rtable or ERR_PTR().
 * Called with rcu_read_lock() held by convention (takes it again here).
 */
static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	/* Remember the caller's key: it becomes the cache lookup key even
	 * though fl4 itself is rewritten below.
	 */
	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		/* Source must be a plausible unicast address. */
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	/* No destination: loop the packet back to ourselves. */
	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	/* Multiple default routes: pick one deterministically. */
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       tos, dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		/* Cache under the caller's original key. */
		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}
2826 
/*
 * Output route lookup entry point: probe the route cache, falling back
 * to ip_route_output_slow() on a miss (or when caching is disabled).
 * On a cache hit also back-fills any unspecified saddr/daddr in @flp4.
 * Returns an rtable or ERR_PTR().
 */
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		/* TOS match ignores bits outside IPTOS_RT_MASK|RTO_ONLINK. */
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
2867 
/* Blackhole dst ->check(): always report the dst as invalid. */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2872 
2873 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2874 {
2875 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2876 
2877 	return mtu ? : dst->dev->mtu;
2878 }
2879 
/* Blackhole dst ->update_pmtu(): intentionally a no-op. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2883 
/* Blackhole dst ->cow_metrics(): never copy-on-write the metrics. */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2889 
/*
 * dst_ops for blackhole routes built by ipv4_blackhole_route(): the
 * per-operation hooks above make the dst inert (invalid on check,
 * PMTU updates ignored, metrics never COWed).
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2901 
/*
 * Clone @dst_orig into an inert "blackhole" dst that discards all
 * traffic but preserves the original route's identity fields and
 * metrics.  Consumes the caller's reference on @dst_orig.
 * Returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* Both directions drop packets. */
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Copy the cache key and identity from the original. */
		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		/* Share the peer and fib_info, taking our own references. */
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2948 
2949 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2950 				    struct sock *sk)
2951 {
2952 	struct rtable *rt = __ip_route_output_key(net, flp4);
2953 
2954 	if (IS_ERR(rt))
2955 		return rt;
2956 
2957 	if (flp4->flowi4_proto)
2958 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2959 						   flowi4_to_flowi(flp4),
2960 						   sk, 0);
2961 
2962 	return rt;
2963 }
2964 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2965 
/*
 * Fill a netlink RTM_NEWROUTE message describing the route attached to
 * @skb (via skb_rtable()).  Returns the message length on success, 0 if
 * a multicast-route query was queued (!nowait path), or -EMSGSIZE.
 */
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	/* Preferred source: spec_dst on input routes, rt_src when it
	 * differs from the lookup key on output routes.
	 */
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		/* Peer carries IP-ID, TCP timestamp and PMTU expiry info. */
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		/* Forwarded multicast: delegate to the mroute code, which
		 * may need to queue a cache resolution request.
		 */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
3073 
/*
 * RTM_GETROUTE handler: resolve the route described by the netlink
 * request (input lookup when RTA_IIF is given, output lookup otherwise)
 * and unicast an RTM_NEWROUTE reply back to the requester.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		/* Simulate reception on @iif and run the input path. */
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		/* Negative cache entries report their error via dst.error. */
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		/* No iif: run a plain output route lookup. */
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
3168 
/*
 * Netlink dump callback: walk the route cache hash table and emit one
 * RTM_NEWROUTE message per (non-expired, same-netns) entry.  Dump
 * position is saved in cb->args[0] (bucket) and cb->args[1] (index)
 * so a partial dump can resume.
 */
int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			/* rt_fill_info() reads the route via skb_rtable(),
			 * so temporarily attach it without a refcount.
			 */
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
3210 
3211 void ip_rt_multicast_event(struct in_device *in_dev)
3212 {
3213 	rt_cache_flush(dev_net(in_dev->dev), 0);
3214 }
3215 
3216 #ifdef CONFIG_SYSCTL
3217 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3218 					void __user *buffer,
3219 					size_t *lenp, loff_t *ppos)
3220 {
3221 	if (write) {
3222 		int flush_delay;
3223 		ctl_table ctl;
3224 		struct net *net;
3225 
3226 		memcpy(&ctl, __ctl, sizeof(ctl));
3227 		ctl.data = &flush_delay;
3228 		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3229 
3230 		net = (struct net *)__ctl->extra1;
3231 		rt_cache_flush(net, flush_delay);
3232 		return 0;
3233 	}
3234 
3235 	return -EINVAL;
3236 }
3237 
/*
 * net.ipv4.route.* tunables.  All entries point at global (not
 * per-namespace) ip_rt_* knobs defined earlier in this file.  Entries
 * using the *_jiffies proc handlers store jiffies internally but are
 * read/written in seconds (milliseconds for the _ms variant).
 */
static ctl_table ipv4_route_table[] = {
	{
		/* Note: recomputed from the hash size in ip_rt_init(). */
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Note: recomputed in ip_rt_init() as 16x the hash size. */
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/*  Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same variable as gc_min_interval, millisecond units. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
3348 
/* Zero-filled terminator-only table used as the initial child of the
 * "neigh" directory below. */
static struct ctl_table empty[1];

/* Static skeleton creating the net.ipv4.route and net.ipv4.neigh
 * directories; registered from ip_static_sysctl_init(). */
static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};
3359 
/* sysctl mount point "net.ipv4" for the skeleton above. */
static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};
3365 
/*
 * Per-namespace write-only "flush" entry; .extra1 is filled with the
 * owning struct net in sysctl_route_net_init() so the handler knows
 * which namespace's cache to flush.
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
3375 
/* sysctl mount point "net.ipv4.route" for the per-netns flush entry. */
static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
3382 
3383 static __net_init int sysctl_route_net_init(struct net *net)
3384 {
3385 	struct ctl_table *tbl;
3386 
3387 	tbl = ipv4_route_flush_table;
3388 	if (!net_eq(net, &init_net)) {
3389 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3390 		if (tbl == NULL)
3391 			goto err_dup;
3392 	}
3393 	tbl[0].extra1 = net;
3394 
3395 	net->ipv4.route_hdr =
3396 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3397 	if (net->ipv4.route_hdr == NULL)
3398 		goto err_reg;
3399 	return 0;
3400 
3401 err_reg:
3402 	if (tbl != ipv4_route_flush_table)
3403 		kfree(tbl);
3404 err_dup:
3405 	return -ENOMEM;
3406 }
3407 
3408 static __net_exit void sysctl_route_net_exit(struct net *net)
3409 {
3410 	struct ctl_table *tbl;
3411 
3412 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3413 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3414 	BUG_ON(tbl == ipv4_route_flush_table);
3415 	kfree(tbl);
3416 }
3417 
/* Per-netns lifecycle hooks for the route.flush sysctl. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
3422 #endif
3423 
3424 static __net_init int rt_genid_init(struct net *net)
3425 {
3426 	get_random_bytes(&net->ipv4.rt_genid,
3427 			 sizeof(net->ipv4.rt_genid));
3428 	get_random_bytes(&net->ipv4.dev_addr_genid,
3429 			 sizeof(net->ipv4.dev_addr_genid));
3430 	return 0;
3431 }
3432 
/* Per-netns hook randomizing the generation counters; no exit needed. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3436 
3437 
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu route accounting area (256 slots per cpu); allocated once in
 * ip_rt_init(), which panics on failure. */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3441 
3442 static __initdata unsigned long rhash_entries;
3443 static int __init set_rhash_entries(char *str)
3444 {
3445 	if (!str)
3446 		return 0;
3447 	rhash_entries = simple_strtoul(str, &str, 0);
3448 	return 1;
3449 }
3450 __setup("rhash_entries=", set_rhash_entries);
3451 
/*
 * One-time boot initialization of the IPv4 routing subsystem: per-cpu
 * accounting, dst slab cache and entry counters, the route cache hash
 * table and its GC parameters, devinet/FIB, the periodic expiry
 * worker, proc files, xfrm hooks and the RTM_GETROUTE handler.
 * Unrecoverable allocation failures panic; always returns 0 otherwise.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts share the regular rtable slab. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Size the cache hash from available memory unless overridden by
	 * the rhash_entries= boot parameter. */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* Derive the GC threshold and hard cap from the actual hash size
	 * (these are also exposed via net.ipv4.route sysctls). */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* Kick off the periodic cache expiry worker; the first run is
	 * scheduled after a randomized delay. */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	/* proc failure is non-fatal: routing works without the files. */
	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
3512 
3513 #ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/* Register the static net.ipv4 sysctl skeleton (the "route" table and
 * the initially empty "neigh" directory) early in boot. */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
3522 #endif
3523