xref: /linux/net/ipv4/route.c (revision ec2212088c42ff7d1362629ec26dda4f3e8bdad3)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
 34  *		Alan Cox	:	Aligned routing errors more closely with BSD;
 35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <linux/slab.h>
96 #include <linux/prefetch.h>
97 #include <net/dst.h>
98 #include <net/net_namespace.h>
99 #include <net/protocol.h>
100 #include <net/ip.h>
101 #include <net/route.h>
102 #include <net/inetpeer.h>
103 #include <net/sock.h>
104 #include <net/ip_fib.h>
105 #include <net/arp.h>
106 #include <net/tcp.h>
107 #include <net/icmp.h>
108 #include <net/xfrm.h>
109 #include <net/netevent.h>
110 #include <net/rtnetlink.h>
111 #ifdef CONFIG_SYSCTL
112 #include <linux/sysctl.h>
113 #endif
114 #include <net/secure_seq.h>
115 
116 #define RT_FL_TOS(oldflp4) \
117 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
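/*
 * Editorial note (sketch): RT_FL_TOS() keeps only the TOS bits relevant to
 * routing (IPTOS_RT_MASK) plus the RTO_ONLINK flag, which asks for a
 * directly connected (link-scope) route.  The output-path lookups below use
 * it roughly as "tos = RT_FL_TOS(fl4)", so stray bits in the TOS byte never
 * reach the cache key.
 */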
118 
119 #define IP_MAX_MTU	0xFFF0
120 
121 #define RT_GC_TIMEOUT (300*HZ)
122 
123 static int ip_rt_max_size;
124 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
126 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
127 static int ip_rt_redirect_number __read_mostly	= 9;
128 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
129 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
130 static int ip_rt_error_cost __read_mostly	= HZ;
131 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
132 static int ip_rt_gc_elasticity __read_mostly	= 8;
133 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
134 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
135 static int ip_rt_min_advmss __read_mostly	= 256;
136 static int rt_chain_length_max __read_mostly	= 20;
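/*
 * Most of the knobs above are runtime tunables; they are wired up to sysctl
 * later in this file and are typically visible under
 * /proc/sys/net/ipv4/route/ (e.g. gc_timeout, min_pmtu, redirect_load).
 */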
137 
138 static struct delayed_work expires_work;
139 static unsigned long expires_ljiffies;
140 
141 /*
142  *	Interface to generic destination cache.
143  */
144 
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
147 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
148 static void		 ipv4_dst_destroy(struct dst_entry *dst);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void		 ipv4_link_failure(struct sk_buff *skb);
151 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153 
154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
155 			    int how)
156 {
157 }
158 
159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
160 {
161 	struct rtable *rt = (struct rtable *) dst;
162 	struct inet_peer *peer;
163 	u32 *p = NULL;
164 
165 	if (!rt->peer)
166 		rt_bind_peer(rt, rt->rt_dst, 1);
167 
168 	peer = rt->peer;
169 	if (peer) {
170 		u32 *old_p = __DST_METRICS_PTR(old);
171 		unsigned long prev, new;
172 
173 		p = peer->metrics;
174 		if (inet_metrics_new(peer))
175 			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
176 
177 		new = (unsigned long) p;
178 		prev = cmpxchg(&dst->_metrics, old, new);
179 
180 		if (prev != old) {
181 			p = __DST_METRICS_PTR(prev);
182 			if (prev & DST_METRICS_READ_ONLY)
183 				p = NULL;
184 		} else {
185 			if (rt->fi) {
186 				fib_info_put(rt->fi);
187 				rt->fi = NULL;
188 			}
189 		}
190 	}
191 	return p;
192 }
193 
194 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
195 
196 static struct dst_ops ipv4_dst_ops = {
197 	.family =		AF_INET,
198 	.protocol =		cpu_to_be16(ETH_P_IP),
199 	.gc =			rt_garbage_collect,
200 	.check =		ipv4_dst_check,
201 	.default_advmss =	ipv4_default_advmss,
202 	.mtu =			ipv4_mtu,
203 	.cow_metrics =		ipv4_cow_metrics,
204 	.destroy =		ipv4_dst_destroy,
205 	.ifdown =		ipv4_dst_ifdown,
206 	.negative_advice =	ipv4_negative_advice,
207 	.link_failure =		ipv4_link_failure,
208 	.update_pmtu =		ip_rt_update_pmtu,
209 	.local_out =		__ip_local_out,
210 	.neigh_lookup =		ipv4_neigh_lookup,
211 };
212 
213 #define ECN_OR_COST(class)	TC_PRIO_##class
214 
215 const __u8 ip_tos2prio[16] = {
216 	TC_PRIO_BESTEFFORT,
217 	ECN_OR_COST(BESTEFFORT),
218 	TC_PRIO_BESTEFFORT,
219 	ECN_OR_COST(BESTEFFORT),
220 	TC_PRIO_BULK,
221 	ECN_OR_COST(BULK),
222 	TC_PRIO_BULK,
223 	ECN_OR_COST(BULK),
224 	TC_PRIO_INTERACTIVE,
225 	ECN_OR_COST(INTERACTIVE),
226 	TC_PRIO_INTERACTIVE,
227 	ECN_OR_COST(INTERACTIVE),
228 	TC_PRIO_INTERACTIVE_BULK,
229 	ECN_OR_COST(INTERACTIVE_BULK),
230 	TC_PRIO_INTERACTIVE_BULK,
231 	ECN_OR_COST(INTERACTIVE_BULK)
232 };
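/*
 * ip_tos2prio[] is indexed by the four TOS type-of-service bits (roughly
 * IPTOS_TOS(tos) >> 1, see rt_tos2priority()), mapping each TOS value to a
 * default packet scheduler band: e.g. a "low delay" TOS lands in
 * TC_PRIO_INTERACTIVE while "high throughput" maps to TC_PRIO_BULK.
 */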
233 
234 
235 /*
236  * Route cache.
237  */
238 
239 /* The locking scheme is rather straightforward:
240  *
241  * 1) Read-Copy Update protects the buckets of the central route hash.
242  * 2) Only writers remove entries, and they hold the lock
243  *    as they look at rtable reference counts.
244  * 3) Only readers acquire references to rtable entries,
245  *    they do so with atomic increments and with the
246  *    lock held.
247  */
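/* In code the scheme looks roughly like this:
 *
 *	reader:
 *		rcu_read_lock_bh();
 *		r = rcu_dereference_bh(rt_hash_table[hash].chain);
 *		... walk r->dst.rt_next, take a reference (dst_use()) on a match ...
 *		rcu_read_unlock_bh();
 *
 *	writer:
 *		spin_lock_bh(rt_hash_lock_addr(hash));
 *		... unlink with rcu_assign_pointer(), then rt_free() ...
 *		spin_unlock_bh(rt_hash_lock_addr(hash));
 */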
248 
249 struct rt_hash_bucket {
250 	struct rtable __rcu	*chain;
251 };
252 
253 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
254 	defined(CONFIG_PROVE_LOCKING)
255 /*
 256  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 257  * The size of this table is a power of two and depends on the number of CPUs.
258  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
259  */
260 #ifdef CONFIG_LOCKDEP
261 # define RT_HASH_LOCK_SZ	256
262 #else
263 # if NR_CPUS >= 32
264 #  define RT_HASH_LOCK_SZ	4096
265 # elif NR_CPUS >= 16
266 #  define RT_HASH_LOCK_SZ	2048
267 # elif NR_CPUS >= 8
268 #  define RT_HASH_LOCK_SZ	1024
269 # elif NR_CPUS >= 4
270 #  define RT_HASH_LOCK_SZ	512
271 # else
272 #  define RT_HASH_LOCK_SZ	256
273 # endif
274 #endif
275 
276 static spinlock_t	*rt_hash_locks;
277 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
278 
279 static __init void rt_hash_lock_init(void)
280 {
281 	int i;
282 
283 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
284 			GFP_KERNEL);
285 	if (!rt_hash_locks)
286 		panic("IP: failed to allocate rt_hash_locks\n");
287 
288 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
289 		spin_lock_init(&rt_hash_locks[i]);
290 }
291 #else
292 # define rt_hash_lock_addr(slot) NULL
293 
294 static inline void rt_hash_lock_init(void)
295 {
296 }
297 #endif
298 
299 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
300 static unsigned			rt_hash_mask __read_mostly;
301 static unsigned int		rt_hash_log  __read_mostly;
302 
303 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
304 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
305 
306 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
307 				   int genid)
308 {
309 	return jhash_3words((__force u32)daddr, (__force u32)saddr,
310 			    idx, genid)
311 		& rt_hash_mask;
312 }
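/*
 * The generation id is folded into the hash, so bumping
 * net->ipv4.rt_genid (see rt_cache_invalidate()) both scatters new lookups
 * across different buckets and lets stale entries be detected via
 * rt_is_expired() and reaped lazily, instead of flushing the whole table
 * synchronously.
 */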
313 
314 static inline int rt_genid(struct net *net)
315 {
316 	return atomic_read(&net->ipv4.rt_genid);
317 }
318 
319 #ifdef CONFIG_PROC_FS
320 struct rt_cache_iter_state {
321 	struct seq_net_private p;
322 	int bucket;
323 	int genid;
324 };
325 
326 static struct rtable *rt_cache_get_first(struct seq_file *seq)
327 {
328 	struct rt_cache_iter_state *st = seq->private;
329 	struct rtable *r = NULL;
330 
331 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
332 		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
333 			continue;
334 		rcu_read_lock_bh();
335 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
336 		while (r) {
337 			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
338 			    r->rt_genid == st->genid)
339 				return r;
340 			r = rcu_dereference_bh(r->dst.rt_next);
341 		}
342 		rcu_read_unlock_bh();
343 	}
344 	return r;
345 }
346 
347 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
348 					  struct rtable *r)
349 {
350 	struct rt_cache_iter_state *st = seq->private;
351 
352 	r = rcu_dereference_bh(r->dst.rt_next);
353 	while (!r) {
354 		rcu_read_unlock_bh();
355 		do {
356 			if (--st->bucket < 0)
357 				return NULL;
358 		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
359 		rcu_read_lock_bh();
360 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
361 	}
362 	return r;
363 }
364 
365 static struct rtable *rt_cache_get_next(struct seq_file *seq,
366 					struct rtable *r)
367 {
368 	struct rt_cache_iter_state *st = seq->private;
369 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
370 		if (dev_net(r->dst.dev) != seq_file_net(seq))
371 			continue;
372 		if (r->rt_genid == st->genid)
373 			break;
374 	}
375 	return r;
376 }
377 
378 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
379 {
380 	struct rtable *r = rt_cache_get_first(seq);
381 
382 	if (r)
383 		while (pos && (r = rt_cache_get_next(seq, r)))
384 			--pos;
385 	return pos ? NULL : r;
386 }
387 
388 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
389 {
390 	struct rt_cache_iter_state *st = seq->private;
391 	if (*pos)
392 		return rt_cache_get_idx(seq, *pos - 1);
393 	st->genid = rt_genid(seq_file_net(seq));
394 	return SEQ_START_TOKEN;
395 }
396 
397 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
398 {
399 	struct rtable *r;
400 
401 	if (v == SEQ_START_TOKEN)
402 		r = rt_cache_get_first(seq);
403 	else
404 		r = rt_cache_get_next(seq, v);
405 	++*pos;
406 	return r;
407 }
408 
409 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
410 {
411 	if (v && v != SEQ_START_TOKEN)
412 		rcu_read_unlock_bh();
413 }
414 
415 static int rt_cache_seq_show(struct seq_file *seq, void *v)
416 {
417 	if (v == SEQ_START_TOKEN)
418 		seq_printf(seq, "%-127s\n",
419 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
420 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
421 			   "HHUptod\tSpecDst");
422 	else {
423 		struct rtable *r = v;
424 		struct neighbour *n;
425 		int len, HHUptod;
426 
427 		rcu_read_lock();
428 		n = dst_get_neighbour_noref(&r->dst);
429 		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
430 		rcu_read_unlock();
431 
432 		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
433 			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
434 			r->dst.dev ? r->dst.dev->name : "*",
435 			(__force u32)r->rt_dst,
436 			(__force u32)r->rt_gateway,
437 			r->rt_flags, atomic_read(&r->dst.__refcnt),
438 			r->dst.__use, 0, (__force u32)r->rt_src,
439 			dst_metric_advmss(&r->dst) + 40,
440 			dst_metric(&r->dst, RTAX_WINDOW),
441 			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
442 			      dst_metric(&r->dst, RTAX_RTTVAR)),
443 			r->rt_key_tos,
444 			-1,
445 			HHUptod,
446 			r->rt_spec_dst, &len);
447 
448 		seq_printf(seq, "%*s\n", 127 - len, "");
449 	}
450 	return 0;
451 }
452 
453 static const struct seq_operations rt_cache_seq_ops = {
454 	.start  = rt_cache_seq_start,
455 	.next   = rt_cache_seq_next,
456 	.stop   = rt_cache_seq_stop,
457 	.show   = rt_cache_seq_show,
458 };
459 
460 static int rt_cache_seq_open(struct inode *inode, struct file *file)
461 {
462 	return seq_open_net(inode, file, &rt_cache_seq_ops,
463 			sizeof(struct rt_cache_iter_state));
464 }
465 
466 static const struct file_operations rt_cache_seq_fops = {
467 	.owner	 = THIS_MODULE,
468 	.open	 = rt_cache_seq_open,
469 	.read	 = seq_read,
470 	.llseek	 = seq_lseek,
471 	.release = seq_release_net,
472 };
473 
474 
475 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
476 {
477 	int cpu;
478 
479 	if (*pos == 0)
480 		return SEQ_START_TOKEN;
481 
482 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
483 		if (!cpu_possible(cpu))
484 			continue;
485 		*pos = cpu+1;
486 		return &per_cpu(rt_cache_stat, cpu);
487 	}
488 	return NULL;
489 }
490 
491 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
492 {
493 	int cpu;
494 
495 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
496 		if (!cpu_possible(cpu))
497 			continue;
498 		*pos = cpu+1;
499 		return &per_cpu(rt_cache_stat, cpu);
500 	}
501 	return NULL;
502 
503 }
504 
505 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
506 {
507 
508 }
509 
510 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
511 {
512 	struct rt_cache_stat *st = v;
513 
514 	if (v == SEQ_START_TOKEN) {
515 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
516 		return 0;
517 	}
518 
519 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
520 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
521 		   dst_entries_get_slow(&ipv4_dst_ops),
522 		   st->in_hit,
523 		   st->in_slow_tot,
524 		   st->in_slow_mc,
525 		   st->in_no_route,
526 		   st->in_brd,
527 		   st->in_martian_dst,
528 		   st->in_martian_src,
529 
530 		   st->out_hit,
531 		   st->out_slow_tot,
532 		   st->out_slow_mc,
533 
534 		   st->gc_total,
535 		   st->gc_ignored,
536 		   st->gc_goal_miss,
537 		   st->gc_dst_overflow,
538 		   st->in_hlist_search,
539 		   st->out_hlist_search
540 		);
541 	return 0;
542 }
543 
544 static const struct seq_operations rt_cpu_seq_ops = {
545 	.start  = rt_cpu_seq_start,
546 	.next   = rt_cpu_seq_next,
547 	.stop   = rt_cpu_seq_stop,
548 	.show   = rt_cpu_seq_show,
549 };
550 
551 
552 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
553 {
554 	return seq_open(file, &rt_cpu_seq_ops);
555 }
556 
557 static const struct file_operations rt_cpu_seq_fops = {
558 	.owner	 = THIS_MODULE,
559 	.open	 = rt_cpu_seq_open,
560 	.read	 = seq_read,
561 	.llseek	 = seq_lseek,
562 	.release = seq_release,
563 };
564 
565 #ifdef CONFIG_IP_ROUTE_CLASSID
566 static int rt_acct_proc_show(struct seq_file *m, void *v)
567 {
568 	struct ip_rt_acct *dst, *src;
569 	unsigned int i, j;
570 
571 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
572 	if (!dst)
573 		return -ENOMEM;
574 
575 	for_each_possible_cpu(i) {
576 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
577 		for (j = 0; j < 256; j++) {
578 			dst[j].o_bytes   += src[j].o_bytes;
579 			dst[j].o_packets += src[j].o_packets;
580 			dst[j].i_bytes   += src[j].i_bytes;
581 			dst[j].i_packets += src[j].i_packets;
582 		}
583 	}
584 
585 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
586 	kfree(dst);
587 	return 0;
588 }
589 
590 static int rt_acct_proc_open(struct inode *inode, struct file *file)
591 {
592 	return single_open(file, rt_acct_proc_show, NULL);
593 }
594 
595 static const struct file_operations rt_acct_proc_fops = {
596 	.owner		= THIS_MODULE,
597 	.open		= rt_acct_proc_open,
598 	.read		= seq_read,
599 	.llseek		= seq_lseek,
600 	.release	= single_release,
601 };
602 #endif
603 
604 static int __net_init ip_rt_do_proc_init(struct net *net)
605 {
606 	struct proc_dir_entry *pde;
607 
608 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
609 			&rt_cache_seq_fops);
610 	if (!pde)
611 		goto err1;
612 
613 	pde = proc_create("rt_cache", S_IRUGO,
614 			  net->proc_net_stat, &rt_cpu_seq_fops);
615 	if (!pde)
616 		goto err2;
617 
618 #ifdef CONFIG_IP_ROUTE_CLASSID
619 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
620 	if (!pde)
621 		goto err3;
622 #endif
623 	return 0;
624 
625 #ifdef CONFIG_IP_ROUTE_CLASSID
626 err3:
627 	remove_proc_entry("rt_cache", net->proc_net_stat);
628 #endif
629 err2:
630 	remove_proc_entry("rt_cache", net->proc_net);
631 err1:
632 	return -ENOMEM;
633 }
634 
635 static void __net_exit ip_rt_do_proc_exit(struct net *net)
636 {
637 	remove_proc_entry("rt_cache", net->proc_net_stat);
638 	remove_proc_entry("rt_cache", net->proc_net);
639 #ifdef CONFIG_IP_ROUTE_CLASSID
640 	remove_proc_entry("rt_acct", net->proc_net);
641 #endif
642 }
643 
644 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
645 	.init = ip_rt_do_proc_init,
646 	.exit = ip_rt_do_proc_exit,
647 };
648 
649 static int __init ip_rt_proc_init(void)
650 {
651 	return register_pernet_subsys(&ip_rt_proc_ops);
652 }
653 
654 #else
655 static inline int ip_rt_proc_init(void)
656 {
657 	return 0;
658 }
659 #endif /* CONFIG_PROC_FS */
660 
661 static inline void rt_free(struct rtable *rt)
662 {
663 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
664 }
665 
666 static inline void rt_drop(struct rtable *rt)
667 {
668 	ip_rt_put(rt);
669 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
670 }
671 
672 static inline int rt_fast_clean(struct rtable *rth)
673 {
 674 	/* Kill broadcast/multicast entries very aggressively, if they
 675 	   collide in the hash table with more useful entries */
676 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
677 		rt_is_input_route(rth) && rth->dst.rt_next;
678 }
679 
680 static inline int rt_valuable(struct rtable *rth)
681 {
682 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
683 		(rth->peer && rth->peer->pmtu_expires);
684 }
685 
686 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
687 {
688 	unsigned long age;
689 	int ret = 0;
690 
691 	if (atomic_read(&rth->dst.__refcnt))
692 		goto out;
693 
694 	age = jiffies - rth->dst.lastuse;
695 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
696 	    (age <= tmo2 && rt_valuable(rth)))
697 		goto out;
698 	ret = 1;
699 out:	return ret;
700 }
701 
702 /* Bits of score are:
703  * 31: very valuable
704  * 30: not quite useless
705  * 29..0: usage counter
706  */
707 static inline u32 rt_score(struct rtable *rt)
708 {
709 	u32 score = jiffies - rt->dst.lastuse;
710 
711 	score = ~score & ~(3<<30);
712 
713 	if (rt_valuable(rt))
714 		score |= (1<<31);
715 
716 	if (rt_is_output_route(rt) ||
717 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
718 		score |= (1<<30);
719 
720 	return score;
721 }
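/*
 * Example: since the low bits hold ~(jiffies - lastuse), a recently used
 * entry scores higher than a stale one.  rt_intern_hash() tracks the entry
 * with the *lowest* score in a chain and evicts it when the chain grows past
 * ip_rt_gc_elasticity, so stale input-path broadcast/multicast/local entries
 * without the "valuable" bit go first.
 */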
722 
723 static inline bool rt_caching(const struct net *net)
724 {
725 	return net->ipv4.current_rt_cache_rebuild_count <=
726 		net->ipv4.sysctl_rt_cache_rebuild_count;
727 }
728 
729 static inline bool compare_hash_inputs(const struct rtable *rt1,
730 				       const struct rtable *rt2)
731 {
732 	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
733 		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
734 		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
735 }
736 
737 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
738 {
739 	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
740 		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
741 		(rt1->rt_mark ^ rt2->rt_mark) |
742 		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
743 		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
744 		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
745 }
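/*
 * Both comparison helpers above use the XOR-and-OR idiom: each pair of
 * fields XORs to zero only when equal, so OR-ing the results is zero iff
 * every field matches.  This avoids a chain of conditional branches in the
 * hot lookup path.
 */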
746 
747 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
748 {
749 	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
750 }
751 
752 static inline int rt_is_expired(struct rtable *rth)
753 {
754 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
755 }
756 
757 /*
 758  * Perform a full scan of the hash table and free all entries.
 759  * Can be called by a softirq or a process.
 760  * In the latter case, we want to be rescheduled if necessary.
761  */
762 static void rt_do_flush(struct net *net, int process_context)
763 {
764 	unsigned int i;
765 	struct rtable *rth, *next;
766 
767 	for (i = 0; i <= rt_hash_mask; i++) {
768 		struct rtable __rcu **pprev;
769 		struct rtable *list;
770 
771 		if (process_context && need_resched())
772 			cond_resched();
773 		rth = rcu_access_pointer(rt_hash_table[i].chain);
774 		if (!rth)
775 			continue;
776 
777 		spin_lock_bh(rt_hash_lock_addr(i));
778 
779 		list = NULL;
780 		pprev = &rt_hash_table[i].chain;
781 		rth = rcu_dereference_protected(*pprev,
782 			lockdep_is_held(rt_hash_lock_addr(i)));
783 
784 		while (rth) {
785 			next = rcu_dereference_protected(rth->dst.rt_next,
786 				lockdep_is_held(rt_hash_lock_addr(i)));
787 
788 			if (!net ||
789 			    net_eq(dev_net(rth->dst.dev), net)) {
790 				rcu_assign_pointer(*pprev, next);
791 				rcu_assign_pointer(rth->dst.rt_next, list);
792 				list = rth;
793 			} else {
794 				pprev = &rth->dst.rt_next;
795 			}
796 			rth = next;
797 		}
798 
799 		spin_unlock_bh(rt_hash_lock_addr(i));
800 
801 		for (; list; list = next) {
802 			next = rcu_dereference_protected(list->dst.rt_next, 1);
803 			rt_free(list);
804 		}
805 	}
806 }
807 
808 /*
809  * While freeing expired entries, we compute average chain length
810  * and standard deviation, using fixed-point arithmetic.
 811  * This is done to obtain an estimate of rt_chain_length_max:
 812  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 813  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
814  */
815 
816 #define FRACT_BITS 3
817 #define ONE (1UL << FRACT_BITS)
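/*
 * Worked example: with FRACT_BITS = 3, has_noalias() below contributes
 * ONE (= 8) per entry with distinct hash inputs, so a chain of 5 such
 * entries gives length = 40.  rt_check_expire() averages these fixed-point
 * lengths and finally shifts (avg + 4*sd) right by FRACT_BITS to get
 * rt_chain_length_max back in whole entries.
 */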
818 
819 /*
820  * Given a hash chain and an item in this hash chain,
 821  * find whether a previous entry has the same hash_inputs
 822  * (but differs in tos, mark or oif).
823  * Returns 0 if an alias is found.
824  * Returns ONE if rth has no alias before itself.
825  */
826 static int has_noalias(const struct rtable *head, const struct rtable *rth)
827 {
828 	const struct rtable *aux = head;
829 
830 	while (aux != rth) {
831 		if (compare_hash_inputs(aux, rth))
832 			return 0;
833 		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
834 	}
835 	return ONE;
836 }
837 
838 static void rt_check_expire(void)
839 {
840 	static unsigned int rover;
841 	unsigned int i = rover, goal;
842 	struct rtable *rth;
843 	struct rtable __rcu **rthp;
844 	unsigned long samples = 0;
845 	unsigned long sum = 0, sum2 = 0;
846 	unsigned long delta;
847 	u64 mult;
848 
849 	delta = jiffies - expires_ljiffies;
850 	expires_ljiffies = jiffies;
851 	mult = ((u64)delta) << rt_hash_log;
852 	if (ip_rt_gc_timeout > 1)
853 		do_div(mult, ip_rt_gc_timeout);
854 	goal = (unsigned int)mult;
855 	if (goal > rt_hash_mask)
856 		goal = rt_hash_mask + 1;
857 	for (; goal > 0; goal--) {
858 		unsigned long tmo = ip_rt_gc_timeout;
859 		unsigned long length;
860 
861 		i = (i + 1) & rt_hash_mask;
862 		rthp = &rt_hash_table[i].chain;
863 
864 		if (need_resched())
865 			cond_resched();
866 
867 		samples++;
868 
869 		if (rcu_dereference_raw(*rthp) == NULL)
870 			continue;
871 		length = 0;
872 		spin_lock_bh(rt_hash_lock_addr(i));
873 		while ((rth = rcu_dereference_protected(*rthp,
874 					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
875 			prefetch(rth->dst.rt_next);
876 			if (rt_is_expired(rth)) {
877 				*rthp = rth->dst.rt_next;
878 				rt_free(rth);
879 				continue;
880 			}
881 			if (rth->dst.expires) {
882 				/* Entry is expired even if it is in use */
883 				if (time_before_eq(jiffies, rth->dst.expires)) {
884 nofree:
885 					tmo >>= 1;
886 					rthp = &rth->dst.rt_next;
887 					/*
888 					 * We only count entries on
889 					 * a chain with equal hash inputs once
 890 					 * so that entries differing only in QOS
 891 					 * level or other non-hash-input
 892 					 * attributes don't unfairly skew
 893 					 * the length computation.
894 					 */
895 					length += has_noalias(rt_hash_table[i].chain, rth);
896 					continue;
897 				}
898 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
899 				goto nofree;
900 
901 			/* Cleanup aged off entries. */
902 			*rthp = rth->dst.rt_next;
903 			rt_free(rth);
904 		}
905 		spin_unlock_bh(rt_hash_lock_addr(i));
906 		sum += length;
907 		sum2 += length*length;
908 	}
909 	if (samples) {
910 		unsigned long avg = sum / samples;
911 		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
912 		rt_chain_length_max = max_t(unsigned long,
913 					ip_rt_gc_elasticity,
914 					(avg + 4*sd) >> FRACT_BITS);
915 	}
916 	rover = i;
917 }
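/*
 * Note on the scan size above: goal = (delta << rt_hash_log) /
 * ip_rt_gc_timeout, i.e. the number of buckets scanned per run is
 * proportional to the time since the last run.  With the default 60*HZ work
 * interval and a 300*HZ gc timeout, each run covers roughly one fifth of the
 * table, so the whole hash is walked about once per gc timeout period.
 */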
918 
919 /*
920  * rt_worker_func() is run in process context.
 921  * We call rt_check_expire() to scan part of the hash table.
922  */
923 static void rt_worker_func(struct work_struct *work)
924 {
925 	rt_check_expire();
926 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
927 }
928 
929 /*
 930  * Perturbation of rt_genid by a small quantity [1..256].
 931  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 932  * many times (2^24) without reusing a recent rt_genid.
 933  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
934  */
935 static void rt_cache_invalidate(struct net *net)
936 {
937 	unsigned char shuffle;
938 
939 	get_random_bytes(&shuffle, sizeof(shuffle));
940 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
941 	inetpeer_invalidate_tree(AF_INET);
942 }
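/*
 * Example of the lazy flush: after the bump above, rt_is_expired() becomes
 * true for every entry created under the old genid; such entries are skipped
 * by lookups and reaped the next time a writer or the expire scan walks
 * their chain.
 */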
943 
944 /*
945  * delay < 0  : invalidate cache (fast : entries will be deleted later)
946  * delay >= 0 : invalidate & flush cache (can be long)
947  */
948 void rt_cache_flush(struct net *net, int delay)
949 {
950 	rt_cache_invalidate(net);
951 	if (delay >= 0)
952 		rt_do_flush(net, !in_softirq());
953 }
954 
955 /* Flush previous cache invalidated entries from the cache */
956 void rt_cache_flush_batch(struct net *net)
957 {
958 	rt_do_flush(net, !in_softirq());
959 }
960 
961 static void rt_emergency_hash_rebuild(struct net *net)
962 {
963 	if (net_ratelimit())
964 		pr_warn("Route hash chain too long!\n");
965 	rt_cache_invalidate(net);
966 }
967 
968 /*
969    Short description of GC goals.
970 
 971    We want to build an algorithm which keeps the routing cache
 972    at some equilibrium point, where the number of aged-off entries
 973    is kept approximately equal to the number of newly generated ones.
974 
 975    The current expiration strength is the variable "expire".
 976    We try to adjust it dynamically, so that when the network
 977    is idle "expire" is large enough to keep enough warm entries,
 978    and when load increases it decreases to limit the cache size.
979  */
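/*
 * Concretely, rt_garbage_collect() below starts from
 *	goal = entries - (ip_rt_gc_elasticity << rt_hash_log)
 * i.e. it only has real work to do once the cache holds more than about
 * ip_rt_gc_elasticity (default 8) entries per hash bucket.
 */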
980 
981 static int rt_garbage_collect(struct dst_ops *ops)
982 {
983 	static unsigned long expire = RT_GC_TIMEOUT;
984 	static unsigned long last_gc;
985 	static int rover;
986 	static int equilibrium;
987 	struct rtable *rth;
988 	struct rtable __rcu **rthp;
989 	unsigned long now = jiffies;
990 	int goal;
991 	int entries = dst_entries_get_fast(&ipv4_dst_ops);
992 
993 	/*
994 	 * Garbage collection is pretty expensive,
995 	 * do not make it too frequently.
996 	 */
997 
998 	RT_CACHE_STAT_INC(gc_total);
999 
1000 	if (now - last_gc < ip_rt_gc_min_interval &&
1001 	    entries < ip_rt_max_size) {
1002 		RT_CACHE_STAT_INC(gc_ignored);
1003 		goto out;
1004 	}
1005 
1006 	entries = dst_entries_get_slow(&ipv4_dst_ops);
1007 	/* Calculate number of entries, which we want to expire now. */
1008 	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1009 	if (goal <= 0) {
1010 		if (equilibrium < ipv4_dst_ops.gc_thresh)
1011 			equilibrium = ipv4_dst_ops.gc_thresh;
1012 		goal = entries - equilibrium;
1013 		if (goal > 0) {
1014 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1015 			goal = entries - equilibrium;
1016 		}
1017 	} else {
1018 		/* We are in a dangerous area. Try to reduce the cache really
1019 		 * aggressively.
1020 		 */
1021 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1022 		equilibrium = entries - goal;
1023 	}
1024 
1025 	if (now - last_gc >= ip_rt_gc_min_interval)
1026 		last_gc = now;
1027 
1028 	if (goal <= 0) {
1029 		equilibrium += goal;
1030 		goto work_done;
1031 	}
1032 
1033 	do {
1034 		int i, k;
1035 
1036 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1037 			unsigned long tmo = expire;
1038 
1039 			k = (k + 1) & rt_hash_mask;
1040 			rthp = &rt_hash_table[k].chain;
1041 			spin_lock_bh(rt_hash_lock_addr(k));
1042 			while ((rth = rcu_dereference_protected(*rthp,
1043 					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1044 				if (!rt_is_expired(rth) &&
1045 					!rt_may_expire(rth, tmo, expire)) {
1046 					tmo >>= 1;
1047 					rthp = &rth->dst.rt_next;
1048 					continue;
1049 				}
1050 				*rthp = rth->dst.rt_next;
1051 				rt_free(rth);
1052 				goal--;
1053 			}
1054 			spin_unlock_bh(rt_hash_lock_addr(k));
1055 			if (goal <= 0)
1056 				break;
1057 		}
1058 		rover = k;
1059 
1060 		if (goal <= 0)
1061 			goto work_done;
1062 
1063 		/* Goal is not achieved. We stop the process if:
1064 
1065 		   - expire is reduced to zero; otherwise, expire is halved.
1066 		   - the table is not full.
1067 		   - we are called from interrupt context.
1068 		   - the jiffies check is just a fallback/debug loop breaker;
1069 		     we will not spin here for a long time in any case.
1070 		 */
1071 
1072 		RT_CACHE_STAT_INC(gc_goal_miss);
1073 
1074 		if (expire == 0)
1075 			break;
1076 
1077 		expire >>= 1;
1078 
1079 		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1080 			goto out;
1081 	} while (!in_softirq() && time_before_eq(jiffies, now));
1082 
1083 	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1084 		goto out;
1085 	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1086 		goto out;
1087 	if (net_ratelimit())
1088 		pr_warn("dst cache overflow\n");
1089 	RT_CACHE_STAT_INC(gc_dst_overflow);
1090 	return 1;
1091 
1092 work_done:
1093 	expire += ip_rt_gc_min_interval;
1094 	if (expire > ip_rt_gc_timeout ||
1095 	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1096 	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1097 		expire = ip_rt_gc_timeout;
1098 out:	return 0;
1099 }
1100 
1101 /*
1102  * Returns the number of entries in a hash chain that have different hash_inputs.
1103  */
1104 static int slow_chain_length(const struct rtable *head)
1105 {
1106 	int length = 0;
1107 	const struct rtable *rth = head;
1108 
1109 	while (rth) {
1110 		length += has_noalias(head, rth);
1111 		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1112 	}
1113 	return length >> FRACT_BITS;
1114 }
1115 
1116 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1117 {
1118 	static const __be32 inaddr_any = 0;
1119 	struct net_device *dev = dst->dev;
1120 	const __be32 *pkey = daddr;
1121 	const struct rtable *rt;
1122 	struct neighbour *n;
1123 
1124 	rt = (const struct rtable *) dst;
1125 
1126 	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1127 		pkey = &inaddr_any;
1128 	else if (rt->rt_gateway)
1129 		pkey = (const __be32 *) &rt->rt_gateway;
1130 
1131 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1132 	if (n)
1133 		return n;
1134 	return neigh_create(&arp_tbl, pkey, dev);
1135 }
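/*
 * Note: the neighbour key chosen above is the next hop, i.e. the gateway
 * when the route has one, otherwise the destination address itself;
 * loopback and point-to-point devices use INADDR_ANY since the peer needs
 * no address resolution.
 */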
1136 
1137 static int rt_bind_neighbour(struct rtable *rt)
1138 {
1139 	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1140 	if (IS_ERR(n))
1141 		return PTR_ERR(n);
1142 	dst_set_neighbour(&rt->dst, n);
1143 
1144 	return 0;
1145 }
1146 
1147 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1148 				     struct sk_buff *skb, int ifindex)
1149 {
1150 	struct rtable	*rth, *cand;
1151 	struct rtable __rcu **rthp, **candp;
1152 	unsigned long	now;
1153 	u32 		min_score;
1154 	int		chain_length;
1155 	int attempts = !in_softirq();
1156 
1157 restart:
1158 	chain_length = 0;
1159 	min_score = ~(u32)0;
1160 	cand = NULL;
1161 	candp = NULL;
1162 	now = jiffies;
1163 
1164 	if (!rt_caching(dev_net(rt->dst.dev))) {
1165 		/*
1166 		 * If we're not caching, just tell the caller we
1167 		 * were successful and don't touch the route.  The
1168 		 * caller holds the sole reference to the cache entry, and
1169 		 * it will be released when the caller is done with it.
1170 		 * If we drop it here, the callers have no way to resolve routes
1171 		 * when we're not caching.  Instead, just point *rp at rt, so
1172 		 * the caller gets a single use out of the route.
1173 		 * Note that we do rt_free on this new route entry, so that
1174 		 * once its refcount hits zero, we are still able to reap it
1175 		 * (Thanks Alexey)
1176 		 * Note: To avoid expensive rcu stuff for this uncached dst,
1177 		 * we set DST_NOCACHE so that dst_release() can free dst without
1178 		 * waiting for a grace period.
1179 		 */
1180 
1181 		rt->dst.flags |= DST_NOCACHE;
1182 		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1183 			int err = rt_bind_neighbour(rt);
1184 			if (err) {
1185 				if (net_ratelimit())
1186 					pr_warn("Neighbour table failure & not caching routes\n");
1187 				ip_rt_put(rt);
1188 				return ERR_PTR(err);
1189 			}
1190 		}
1191 
1192 		goto skip_hashing;
1193 	}
1194 
1195 	rthp = &rt_hash_table[hash].chain;
1196 
1197 	spin_lock_bh(rt_hash_lock_addr(hash));
1198 	while ((rth = rcu_dereference_protected(*rthp,
1199 			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1200 		if (rt_is_expired(rth)) {
1201 			*rthp = rth->dst.rt_next;
1202 			rt_free(rth);
1203 			continue;
1204 		}
1205 		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1206 			/* Put it first */
1207 			*rthp = rth->dst.rt_next;
1208 			/*
1209 			 * Since lookup is lockfree, the deletion
1210 			 * must be visible to another weakly ordered CPU before
1211 			 * the insertion at the start of the hash chain.
1212 			 */
1213 			rcu_assign_pointer(rth->dst.rt_next,
1214 					   rt_hash_table[hash].chain);
1215 			/*
1216 			 * Since lookup is lockfree, the update writes
1217 			 * must be ordered for consistency on SMP.
1218 			 */
1219 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1220 
1221 			dst_use(&rth->dst, now);
1222 			spin_unlock_bh(rt_hash_lock_addr(hash));
1223 
1224 			rt_drop(rt);
1225 			if (skb)
1226 				skb_dst_set(skb, &rth->dst);
1227 			return rth;
1228 		}
1229 
1230 		if (!atomic_read(&rth->dst.__refcnt)) {
1231 			u32 score = rt_score(rth);
1232 
1233 			if (score <= min_score) {
1234 				cand = rth;
1235 				candp = rthp;
1236 				min_score = score;
1237 			}
1238 		}
1239 
1240 		chain_length++;
1241 
1242 		rthp = &rth->dst.rt_next;
1243 	}
1244 
1245 	if (cand) {
1246 		/* ip_rt_gc_elasticity used to be the average chain
1247 		 * length; when exceeded, gc becomes really aggressive.
1248 		 *
1249 		 * The second limit is less certain. At the moment it allows
1250 		 * only 2 entries per bucket. We will see.
1251 		 */
1252 		if (chain_length > ip_rt_gc_elasticity) {
1253 			*candp = cand->dst.rt_next;
1254 			rt_free(cand);
1255 		}
1256 	} else {
1257 		if (chain_length > rt_chain_length_max &&
1258 		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1259 			struct net *net = dev_net(rt->dst.dev);
1260 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1261 			if (!rt_caching(net)) {
1262 				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1263 					rt->dst.dev->name, num);
1264 			}
1265 			rt_emergency_hash_rebuild(net);
1266 			spin_unlock_bh(rt_hash_lock_addr(hash));
1267 
1268 			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1269 					ifindex, rt_genid(net));
1270 			goto restart;
1271 		}
1272 	}
1273 
1274 	/* Try to bind the route to arp only if it is an output
1275 	   route or a unicast forwarding path.
1276 	 */
1277 	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1278 		int err = rt_bind_neighbour(rt);
1279 		if (err) {
1280 			spin_unlock_bh(rt_hash_lock_addr(hash));
1281 
1282 			if (err != -ENOBUFS) {
1283 				rt_drop(rt);
1284 				return ERR_PTR(err);
1285 			}
1286 
1287 			/* Neighbour tables are full and nothing
1288 			   can be released. Try to shrink the route cache;
1289 			   most likely it holds some neighbour records.
1290 			 */
1291 			if (attempts-- > 0) {
1292 				int saved_elasticity = ip_rt_gc_elasticity;
1293 				int saved_int = ip_rt_gc_min_interval;
1294 				ip_rt_gc_elasticity	= 1;
1295 				ip_rt_gc_min_interval	= 0;
1296 				rt_garbage_collect(&ipv4_dst_ops);
1297 				ip_rt_gc_min_interval	= saved_int;
1298 				ip_rt_gc_elasticity	= saved_elasticity;
1299 				goto restart;
1300 			}
1301 
1302 			if (net_ratelimit())
1303 				pr_warn("Neighbour table overflow\n");
1304 			rt_drop(rt);
1305 			return ERR_PTR(-ENOBUFS);
1306 		}
1307 	}
1308 
1309 	rt->dst.rt_next = rt_hash_table[hash].chain;
1310 
1311 	/*
1312 	 * Since lookup is lockfree, we must make sure
1313 	 * previous writes to rt are committed to memory
1314 	 * before making rt visible to other CPUs.
1315 	 */
1316 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1317 
1318 	spin_unlock_bh(rt_hash_lock_addr(hash));
1319 
1320 skip_hashing:
1321 	if (skb)
1322 		skb_dst_set(skb, &rt->dst);
1323 	return rt;
1324 }
1325 
1326 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1327 
1328 static u32 rt_peer_genid(void)
1329 {
1330 	return atomic_read(&__rt_peer_genid);
1331 }
1332 
1333 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1334 {
1335 	struct inet_peer *peer;
1336 
1337 	peer = inet_getpeer_v4(daddr, create);
1338 
1339 	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1340 		inet_putpeer(peer);
1341 	else
1342 		rt->rt_peer_genid = rt_peer_genid();
1343 }
1344 
1345 /*
1346  * Peer allocation may fail only in serious out-of-memory conditions.  However
1347  * we can still generate some output.
1348  * Random ID selection looks a bit dangerous because we have no chance to
1349  * select an ID that is unique within a reasonable period of time.
1350  * But a broken packet identifier may be better than no packet at all.
1351  */
1352 static void ip_select_fb_ident(struct iphdr *iph)
1353 {
1354 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1355 	static u32 ip_fallback_id;
1356 	u32 salt;
1357 
1358 	spin_lock_bh(&ip_fb_id_lock);
1359 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1360 	iph->id = htons(salt & 0xFFFF);
1361 	ip_fallback_id = salt;
1362 	spin_unlock_bh(&ip_fb_id_lock);
1363 }
1364 
1365 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1366 {
1367 	struct rtable *rt = (struct rtable *) dst;
1368 
1369 	if (rt && !(rt->dst.flags & DST_NOPEER)) {
1370 		if (rt->peer == NULL)
1371 			rt_bind_peer(rt, rt->rt_dst, 1);
1372 
1373 		/* If peer is attached to destination, it is never detached,
1374 		   so we need not grab a lock to dereference it.
1375 		 */
1376 		if (rt->peer) {
1377 			iph->id = htons(inet_getid(rt->peer, more));
1378 			return;
1379 		}
1380 	} else if (!rt)
1381 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1382 		       __builtin_return_address(0));
1383 
1384 	ip_select_fb_ident(iph);
1385 }
1386 EXPORT_SYMBOL(__ip_select_ident);
1387 
1388 static void rt_del(unsigned hash, struct rtable *rt)
1389 {
1390 	struct rtable __rcu **rthp;
1391 	struct rtable *aux;
1392 
1393 	rthp = &rt_hash_table[hash].chain;
1394 	spin_lock_bh(rt_hash_lock_addr(hash));
1395 	ip_rt_put(rt);
1396 	while ((aux = rcu_dereference_protected(*rthp,
1397 			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1398 		if (aux == rt || rt_is_expired(aux)) {
1399 			*rthp = aux->dst.rt_next;
1400 			rt_free(aux);
1401 			continue;
1402 		}
1403 		rthp = &aux->dst.rt_next;
1404 	}
1405 	spin_unlock_bh(rt_hash_lock_addr(hash));
1406 }
1407 
1408 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1409 {
1410 	struct rtable *rt = (struct rtable *) dst;
1411 	__be32 orig_gw = rt->rt_gateway;
1412 	struct neighbour *n, *old_n;
1413 
1414 	dst_confirm(&rt->dst);
1415 
1416 	rt->rt_gateway = peer->redirect_learned.a4;
1417 
1418 	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1419 	if (IS_ERR(n)) {
1420 		rt->rt_gateway = orig_gw;
1421 		return;
1422 	}
1423 	old_n = xchg(&rt->dst._neighbour, n);
1424 	if (old_n)
1425 		neigh_release(old_n);
1426 	if (!(n->nud_state & NUD_VALID)) {
1427 		neigh_event_send(n, NULL);
1428 	} else {
1429 		rt->rt_flags |= RTCF_REDIRECTED;
1430 		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1431 	}
1432 }
1433 
1434 /* called in rcu_read_lock() section */
1435 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1436 		    __be32 saddr, struct net_device *dev)
1437 {
1438 	int s, i;
1439 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1440 	__be32 skeys[2] = { saddr, 0 };
1441 	int    ikeys[2] = { dev->ifindex, 0 };
1442 	struct inet_peer *peer;
1443 	struct net *net;
1444 
1445 	if (!in_dev)
1446 		return;
1447 
1448 	net = dev_net(dev);
1449 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1450 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1451 	    ipv4_is_zeronet(new_gw))
1452 		goto reject_redirect;
1453 
1454 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1455 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1456 			goto reject_redirect;
1457 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1458 			goto reject_redirect;
1459 	} else {
1460 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1461 			goto reject_redirect;
1462 	}
1463 
1464 	for (s = 0; s < 2; s++) {
1465 		for (i = 0; i < 2; i++) {
1466 			unsigned int hash;
1467 			struct rtable __rcu **rthp;
1468 			struct rtable *rt;
1469 
1470 			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1471 
1472 			rthp = &rt_hash_table[hash].chain;
1473 
1474 			while ((rt = rcu_dereference(*rthp)) != NULL) {
1475 				rthp = &rt->dst.rt_next;
1476 
1477 				if (rt->rt_key_dst != daddr ||
1478 				    rt->rt_key_src != skeys[s] ||
1479 				    rt->rt_oif != ikeys[i] ||
1480 				    rt_is_input_route(rt) ||
1481 				    rt_is_expired(rt) ||
1482 				    !net_eq(dev_net(rt->dst.dev), net) ||
1483 				    rt->dst.error ||
1484 				    rt->dst.dev != dev ||
1485 				    rt->rt_gateway != old_gw)
1486 					continue;
1487 
1488 				if (!rt->peer)
1489 					rt_bind_peer(rt, rt->rt_dst, 1);
1490 
1491 				peer = rt->peer;
1492 				if (peer) {
1493 					if (peer->redirect_learned.a4 != new_gw) {
1494 						peer->redirect_learned.a4 = new_gw;
1495 						atomic_inc(&__rt_peer_genid);
1496 					}
1497 					check_peer_redir(&rt->dst, peer);
1498 				}
1499 			}
1500 		}
1501 	}
1502 	return;
1503 
1504 reject_redirect:
1505 #ifdef CONFIG_IP_ROUTE_VERBOSE
1506 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1507 		pr_info("Redirect from %pI4 on %s about %pI4 ignored\n"
1508 			"  Advised path = %pI4 -> %pI4\n",
1509 			&old_gw, dev->name, &new_gw,
1510 			&saddr, &daddr);
1511 #endif
1512 	;
1513 }
1514 
1515 static bool peer_pmtu_expired(struct inet_peer *peer)
1516 {
1517 	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1518 
1519 	return orig &&
1520 	       time_after_eq(jiffies, orig) &&
1521 	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1522 }
1523 
1524 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1525 {
1526 	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1527 
1528 	return orig &&
1529 	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1530 }
1531 
1532 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1533 {
1534 	struct rtable *rt = (struct rtable *)dst;
1535 	struct dst_entry *ret = dst;
1536 
1537 	if (rt) {
1538 		if (dst->obsolete > 0) {
1539 			ip_rt_put(rt);
1540 			ret = NULL;
1541 		} else if (rt->rt_flags & RTCF_REDIRECTED) {
1542 			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1543 						rt->rt_oif,
1544 						rt_genid(dev_net(dst->dev)));
1545 			rt_del(hash, rt);
1546 			ret = NULL;
1547 		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1548 			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1549 		}
1550 	}
1551 	return ret;
1552 }
1553 
1554 /*
1555  * Algorithm:
1556  *	1. The first ip_rt_redirect_number redirects are sent
1557  *	   with exponential backoff, then we stop sending them at all,
1558  *	   assuming that the host ignores our redirects.
1559  *	2. If we did not see packets requiring redirects
1560  *	   during ip_rt_redirect_silence, we assume that the host
1561  *	   forgot the redirected route and start sending redirects again.
1562  *
1563  * This algorithm is much cheaper and more intelligent than dumb load limiting
1564  * in icmp.c.
1565  *
1566  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1567  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1568  */
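/*
 * Rough numbers with the defaults above: after the k-th redirect the next
 * one is sent only once ip_rt_redirect_load << k (~20ms << k) jiffies have
 * passed, so the gaps grow 40ms, 80ms, 160ms, ...  After
 * ip_rt_redirect_number (9) redirects we stop entirely, and the counter is
 * only reset once ip_rt_redirect_silence ((HZ/50) << 10, about 20s) passes
 * without redirect-worthy traffic.
 */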
1569 
1570 void ip_rt_send_redirect(struct sk_buff *skb)
1571 {
1572 	struct rtable *rt = skb_rtable(skb);
1573 	struct in_device *in_dev;
1574 	struct inet_peer *peer;
1575 	int log_martians;
1576 
1577 	rcu_read_lock();
1578 	in_dev = __in_dev_get_rcu(rt->dst.dev);
1579 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1580 		rcu_read_unlock();
1581 		return;
1582 	}
1583 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1584 	rcu_read_unlock();
1585 
1586 	if (!rt->peer)
1587 		rt_bind_peer(rt, rt->rt_dst, 1);
1588 	peer = rt->peer;
1589 	if (!peer) {
1590 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1591 		return;
1592 	}
1593 
1594 	/* No redirected packets during ip_rt_redirect_silence;
1595 	 * reset the algorithm.
1596 	 */
1597 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1598 		peer->rate_tokens = 0;
1599 
1600 	/* Too many ignored redirects; do not send anything.
1601 	 * Set dst.rate_last to the last seen redirected packet.
1602 	 */
1603 	if (peer->rate_tokens >= ip_rt_redirect_number) {
1604 		peer->rate_last = jiffies;
1605 		return;
1606 	}
1607 
1608 	/* Check for load limit; set rate_last to the latest sent
1609 	 * redirect.
1610 	 */
1611 	if (peer->rate_tokens == 0 ||
1612 	    time_after(jiffies,
1613 		       (peer->rate_last +
1614 			(ip_rt_redirect_load << peer->rate_tokens)))) {
1615 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1616 		peer->rate_last = jiffies;
1617 		++peer->rate_tokens;
1618 #ifdef CONFIG_IP_ROUTE_VERBOSE
1619 		if (log_martians &&
1620 		    peer->rate_tokens == ip_rt_redirect_number &&
1621 		    net_ratelimit())
1622 			pr_warn("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1623 				&ip_hdr(skb)->saddr, rt->rt_iif,
1624 				&rt->rt_dst, &rt->rt_gateway);
1625 #endif
1626 	}
1627 }
1628 
1629 static int ip_error(struct sk_buff *skb)
1630 {
1631 	struct rtable *rt = skb_rtable(skb);
1632 	struct inet_peer *peer;
1633 	unsigned long now;
1634 	bool send;
1635 	int code;
1636 
1637 	switch (rt->dst.error) {
1638 	case EINVAL:
1639 	default:
1640 		goto out;
1641 	case EHOSTUNREACH:
1642 		code = ICMP_HOST_UNREACH;
1643 		break;
1644 	case ENETUNREACH:
1645 		code = ICMP_NET_UNREACH;
1646 		IP_INC_STATS_BH(dev_net(rt->dst.dev),
1647 				IPSTATS_MIB_INNOROUTES);
1648 		break;
1649 	case EACCES:
1650 		code = ICMP_PKT_FILTERED;
1651 		break;
1652 	}
1653 
1654 	if (!rt->peer)
1655 		rt_bind_peer(rt, rt->rt_dst, 1);
1656 	peer = rt->peer;
1657 
1658 	send = true;
1659 	if (peer) {
1660 		now = jiffies;
1661 		peer->rate_tokens += now - peer->rate_last;
1662 		if (peer->rate_tokens > ip_rt_error_burst)
1663 			peer->rate_tokens = ip_rt_error_burst;
1664 		peer->rate_last = now;
1665 		if (peer->rate_tokens >= ip_rt_error_cost)
1666 			peer->rate_tokens -= ip_rt_error_cost;
1667 		else
1668 			send = false;
1669 	}
1670 	if (send)
1671 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1672 
1673 out:	kfree_skb(skb);
1674 	return 0;
1675 }
1676 
1677 /*
1678  *	The last two values are not from the RFC but
1679  *	are needed for AMPRnet AX.25 paths.
1680  */
1681 
1682 static const unsigned short mtu_plateau[] =
1683 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1684 
1685 static inline unsigned short guess_mtu(unsigned short old_mtu)
1686 {
1687 	int i;
1688 
1689 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1690 		if (old_mtu > mtu_plateau[i])
1691 			return mtu_plateau[i];
1692 	return 68;
1693 }
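/*
 * The table above implements the classic RFC 1191 style plateau search
 * (plus the AX.25 values noted below it).  Example: a bogus "frag needed"
 * reporting MTU 0 for a 1500 byte packet makes guess_mtu() pick the next
 * plateau below 1500, i.e. 1492, falling back to the 68 byte minimum if
 * nothing fits.
 */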
1694 
1695 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1696 				 unsigned short new_mtu,
1697 				 struct net_device *dev)
1698 {
1699 	unsigned short old_mtu = ntohs(iph->tot_len);
1700 	unsigned short est_mtu = 0;
1701 	struct inet_peer *peer;
1702 
1703 	peer = inet_getpeer_v4(iph->daddr, 1);
1704 	if (peer) {
1705 		unsigned short mtu = new_mtu;
1706 
1707 		if (new_mtu < 68 || new_mtu >= old_mtu) {
1708 			/* BSD 4.2 derived systems incorrectly adjust
1709 			 * tot_len by the IP header length, and report
1710 			 * a zero MTU in the ICMP message.
1711 			 */
1712 			if (mtu == 0 &&
1713 			    old_mtu >= 68 + (iph->ihl << 2))
1714 				old_mtu -= iph->ihl << 2;
1715 			mtu = guess_mtu(old_mtu);
1716 		}
1717 
1718 		if (mtu < ip_rt_min_pmtu)
1719 			mtu = ip_rt_min_pmtu;
1720 		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1721 			unsigned long pmtu_expires;
1722 
1723 			pmtu_expires = jiffies + ip_rt_mtu_expires;
1724 			if (!pmtu_expires)
1725 				pmtu_expires = 1UL;
1726 
1727 			est_mtu = mtu;
1728 			peer->pmtu_learned = mtu;
1729 			peer->pmtu_expires = pmtu_expires;
1730 			atomic_inc(&__rt_peer_genid);
1731 		}
1732 
1733 		inet_putpeer(peer);
1734 	}
1735 	return est_mtu ? : new_mtu;
1736 }
1737 
1738 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1739 {
1740 	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1741 
1742 	if (!expires)
1743 		return;
1744 	if (time_before(jiffies, expires)) {
1745 		u32 orig_dst_mtu = dst_mtu(dst);
1746 		if (peer->pmtu_learned < orig_dst_mtu) {
1747 			if (!peer->pmtu_orig)
1748 				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1749 			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1750 		}
1751 	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1752 		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1753 }
1754 
1755 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1756 {
1757 	struct rtable *rt = (struct rtable *) dst;
1758 	struct inet_peer *peer;
1759 
1760 	dst_confirm(dst);
1761 
1762 	if (!rt->peer)
1763 		rt_bind_peer(rt, rt->rt_dst, 1);
1764 	peer = rt->peer;
1765 	if (peer) {
1766 		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1767 
1768 		if (mtu < ip_rt_min_pmtu)
1769 			mtu = ip_rt_min_pmtu;
1770 		if (!pmtu_expires || mtu < peer->pmtu_learned) {
1771 
1772 			pmtu_expires = jiffies + ip_rt_mtu_expires;
1773 			if (!pmtu_expires)
1774 				pmtu_expires = 1UL;
1775 
1776 			peer->pmtu_learned = mtu;
1777 			peer->pmtu_expires = pmtu_expires;
1778 
1779 			atomic_inc(&__rt_peer_genid);
1780 			rt->rt_peer_genid = rt_peer_genid();
1781 		}
1782 		check_peer_pmtu(dst, peer);
1783 	}
1784 }
1785 
1786 
1787 static void ipv4_validate_peer(struct rtable *rt)
1788 {
1789 	if (rt->rt_peer_genid != rt_peer_genid()) {
1790 		struct inet_peer *peer;
1791 
1792 		if (!rt->peer)
1793 			rt_bind_peer(rt, rt->rt_dst, 0);
1794 
1795 		peer = rt->peer;
1796 		if (peer) {
1797 			check_peer_pmtu(&rt->dst, peer);
1798 
1799 			if (peer->redirect_learned.a4 &&
1800 			    peer->redirect_learned.a4 != rt->rt_gateway)
1801 				check_peer_redir(&rt->dst, peer);
1802 		}
1803 
1804 		rt->rt_peer_genid = rt_peer_genid();
1805 	}
1806 }
1807 
1808 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1809 {
1810 	struct rtable *rt = (struct rtable *) dst;
1811 
1812 	if (rt_is_expired(rt))
1813 		return NULL;
1814 	ipv4_validate_peer(rt);
1815 	return dst;
1816 }
1817 
1818 static void ipv4_dst_destroy(struct dst_entry *dst)
1819 {
1820 	struct rtable *rt = (struct rtable *) dst;
1821 	struct inet_peer *peer = rt->peer;
1822 
1823 	if (rt->fi) {
1824 		fib_info_put(rt->fi);
1825 		rt->fi = NULL;
1826 	}
1827 	if (peer) {
1828 		rt->peer = NULL;
1829 		inet_putpeer(peer);
1830 	}
1831 }
1832 
1833 
1834 static void ipv4_link_failure(struct sk_buff *skb)
1835 {
1836 	struct rtable *rt;
1837 
1838 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1839 
1840 	rt = skb_rtable(skb);
1841 	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1842 		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1843 }
1844 
1845 static int ip_rt_bug(struct sk_buff *skb)
1846 {
1847 	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1848 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1849 		skb->dev ? skb->dev->name : "?");
1850 	kfree_skb(skb);
1851 	WARN_ON(1);
1852 	return 0;
1853 }
1854 
1855 /*
1856    We do not cache the source address of the outgoing interface,
1857    because it is used only by IP RR, TS and SRR options,
1858    so it is out of the fast path.
1859 
1860    BTW remember: "addr" is allowed to be unaligned
1861    in IP options!
1862  */
1863 
1864 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1865 {
1866 	__be32 src;
1867 
1868 	if (rt_is_output_route(rt))
1869 		src = ip_hdr(skb)->saddr;
1870 	else {
1871 		struct fib_result res;
1872 		struct flowi4 fl4;
1873 		struct iphdr *iph;
1874 
1875 		iph = ip_hdr(skb);
1876 
1877 		memset(&fl4, 0, sizeof(fl4));
1878 		fl4.daddr = iph->daddr;
1879 		fl4.saddr = iph->saddr;
1880 		fl4.flowi4_tos = RT_TOS(iph->tos);
1881 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1882 		fl4.flowi4_iif = skb->dev->ifindex;
1883 		fl4.flowi4_mark = skb->mark;
1884 
1885 		rcu_read_lock();
1886 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1887 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1888 		else
1889 			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1890 					RT_SCOPE_UNIVERSE);
1891 		rcu_read_unlock();
1892 	}
1893 	memcpy(addr, &src, 4);
1894 }
1895 
1896 #ifdef CONFIG_IP_ROUTE_CLASSID
1897 static void set_class_tag(struct rtable *rt, u32 tag)
1898 {
1899 	if (!(rt->dst.tclassid & 0xFFFF))
1900 		rt->dst.tclassid |= tag & 0xFFFF;
1901 	if (!(rt->dst.tclassid & 0xFFFF0000))
1902 		rt->dst.tclassid |= tag & 0xFFFF0000;
1903 }
1904 #endif
1905 
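/* Default advertised MSS: when no RTAX_ADVMSS metric is set, derive
 * it from the device MTU minus 40 bytes of IP + TCP headers, bounded
 * below by ip_rt_min_advmss and above by 65535 - 40.
 */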
1906 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1907 {
1908 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1909 
1910 	if (advmss == 0) {
1911 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1912 			       ip_rt_min_advmss);
1913 		if (advmss > 65535 - 40)
1914 			advmss = 65535 - 40;
1915 	}
1916 	return advmss;
1917 }
1918 
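/* dst_ops->mtu: prefer an explicit RTAX_MTU metric on output routes,
 * otherwise fall back to the device MTU; locked-MTU routes via a
 * gateway are clamped to 576 and everything is capped at IP_MAX_MTU.
 */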
1919 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1920 {
1921 	const struct rtable *rt = (const struct rtable *) dst;
1922 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1923 
1924 	if (mtu && rt_is_output_route(rt))
1925 		return mtu;
1926 
1927 	mtu = dst->dev->mtu;
1928 
1929 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1930 
1931 		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1932 			mtu = 576;
1933 	}
1934 
1935 	if (mtu > IP_MAX_MTU)
1936 		mtu = IP_MAX_MTU;
1937 
1938 	return mtu;
1939 }
1940 
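/* Attach metrics to a freshly built route.  When an inet_peer exists
 * (or must be created because the flow wants pre-COWed metrics), use
 * the peer's metric array and pick up any PMTU/redirect state it has
 * learned; otherwise fall back to the fib_info metrics, holding a
 * reference when they are not the shared read-only defaults.
 */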
1941 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1942 			    struct fib_info *fi)
1943 {
1944 	struct inet_peer *peer;
1945 	int create = 0;
1946 
1947 	/* If a peer entry exists for this destination, we must hook
1948 	 * it up in order to get at cached metrics.
1949 	 */
1950 	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1951 		create = 1;
1952 
1953 	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1954 	if (peer) {
1955 		rt->rt_peer_genid = rt_peer_genid();
1956 		if (inet_metrics_new(peer))
1957 			memcpy(peer->metrics, fi->fib_metrics,
1958 			       sizeof(u32) * RTAX_MAX);
1959 		dst_init_metrics(&rt->dst, peer->metrics, false);
1960 
1961 		check_peer_pmtu(&rt->dst, peer);
1962 
1963 		if (peer->redirect_learned.a4 &&
1964 		    peer->redirect_learned.a4 != rt->rt_gateway) {
1965 			rt->rt_gateway = peer->redirect_learned.a4;
1966 			rt->rt_flags |= RTCF_REDIRECTED;
1967 		}
1968 	} else {
1969 		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1970 			rt->fi = fi;
1971 			atomic_inc(&fi->fib_clntref);
1972 		}
1973 		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1974 	}
1975 }
1976 
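/* Fill in nexthop-derived state on a new route: the gateway from the
 * FIB result (for an on-link nexthop), the metrics, the classid tag,
 * and a final clamp of the MTU and advertised-MSS metrics.
 */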
1977 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1978 			   const struct fib_result *res,
1979 			   struct fib_info *fi, u16 type, u32 itag)
1980 {
1981 	struct dst_entry *dst = &rt->dst;
1982 
1983 	if (fi) {
1984 		if (FIB_RES_GW(*res) &&
1985 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1986 			rt->rt_gateway = FIB_RES_GW(*res);
1987 		rt_init_metrics(rt, fl4, fi);
1988 #ifdef CONFIG_IP_ROUTE_CLASSID
1989 		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1990 #endif
1991 	}
1992 
1993 	if (dst_mtu(dst) > IP_MAX_MTU)
1994 		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1995 	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1996 		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1997 
1998 #ifdef CONFIG_IP_ROUTE_CLASSID
1999 #ifdef CONFIG_IP_MULTIPLE_TABLES
2000 	set_class_tag(rt, fib_rules_tclass(res));
2001 #endif
2002 	set_class_tag(rt, itag);
2003 #endif
2004 }
2005 
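/* Allocate a route cache entry with the common flags (DST_HOST plus
 * optional DST_NOPOLICY/DST_NOXFRM) already set.
 */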
2006 static struct rtable *rt_dst_alloc(struct net_device *dev,
2007 				   bool nopolicy, bool noxfrm)
2008 {
2009 	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2010 			 DST_HOST |
2011 			 (nopolicy ? DST_NOPOLICY : 0) |
2012 			 (noxfrm ? DST_NOXFRM : 0));
2013 }
2014 
2015 /* called in rcu_read_lock() section */
2016 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2017 				u8 tos, struct net_device *dev, int our)
2018 {
2019 	unsigned int hash;
2020 	struct rtable *rth;
2021 	__be32 spec_dst;
2022 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2023 	u32 itag = 0;
2024 	int err;
2025 
2026 	/* Primary sanity checks. */
2027 
2028 	if (in_dev == NULL)
2029 		return -EINVAL;
2030 
2031 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2032 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2033 		goto e_inval;
2034 
2035 	if (ipv4_is_zeronet(saddr)) {
2036 		if (!ipv4_is_local_multicast(daddr))
2037 			goto e_inval;
2038 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2039 	} else {
2040 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2041 					  &itag);
2042 		if (err < 0)
2043 			goto e_err;
2044 	}
2045 	rth = rt_dst_alloc(init_net.loopback_dev,
2046 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2047 	if (!rth)
2048 		goto e_nobufs;
2049 
2050 #ifdef CONFIG_IP_ROUTE_CLASSID
2051 	rth->dst.tclassid = itag;
2052 #endif
2053 	rth->dst.output = ip_rt_bug;
2054 
2055 	rth->rt_key_dst	= daddr;
2056 	rth->rt_key_src	= saddr;
2057 	rth->rt_genid	= rt_genid(dev_net(dev));
2058 	rth->rt_flags	= RTCF_MULTICAST;
2059 	rth->rt_type	= RTN_MULTICAST;
2060 	rth->rt_key_tos	= tos;
2061 	rth->rt_dst	= daddr;
2062 	rth->rt_src	= saddr;
2063 	rth->rt_route_iif = dev->ifindex;
2064 	rth->rt_iif	= dev->ifindex;
2065 	rth->rt_oif	= 0;
2066 	rth->rt_mark    = skb->mark;
2067 	rth->rt_gateway	= daddr;
2068 	rth->rt_spec_dst = spec_dst;
2069 	rth->rt_peer_genid = 0;
2070 	rth->peer = NULL;
2071 	rth->fi = NULL;
2072 	if (our) {
2073 		rth->dst.input = ip_local_deliver;
2074 		rth->rt_flags |= RTCF_LOCAL;
2075 	}
2076 
2077 #ifdef CONFIG_IP_MROUTE
2078 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2079 		rth->dst.input = ip_mr_input;
2080 #endif
2081 	RT_CACHE_STAT_INC(in_slow_mc);
2082 
2083 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2084 	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2085 	return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2086 
2087 e_nobufs:
2088 	return -ENOBUFS;
2089 e_inval:
2090 	return -EINVAL;
2091 e_err:
2092 	return err;
2093 }
2094 
2095 
2096 static void ip_handle_martian_source(struct net_device *dev,
2097 				     struct in_device *in_dev,
2098 				     struct sk_buff *skb,
2099 				     __be32 daddr,
2100 				     __be32 saddr)
2101 {
2102 	RT_CACHE_STAT_INC(in_martian_src);
2103 #ifdef CONFIG_IP_ROUTE_VERBOSE
2104 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2105 		/*
2106 		 *	RFC1812 recommendation: if the source is martian,
2107 		 *	the only hint is the MAC header.
2108 		 */
2109 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2110 			&daddr, &saddr, dev->name);
2111 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2112 			print_hex_dump(KERN_WARNING, "ll header: ",
2113 				       DUMP_PREFIX_OFFSET, 16, 1,
2114 				       skb_mac_header(skb),
2115 				       dev->hard_header_len, true);
2116 		}
2117 	}
2118 #endif
2119 }
2120 
2121 /* called in rcu_read_lock() section */
2122 static int __mkroute_input(struct sk_buff *skb,
2123 			   const struct fib_result *res,
2124 			   struct in_device *in_dev,
2125 			   __be32 daddr, __be32 saddr, u32 tos,
2126 			   struct rtable **result)
2127 {
2128 	struct rtable *rth;
2129 	int err;
2130 	struct in_device *out_dev;
2131 	unsigned int flags = 0;
2132 	__be32 spec_dst;
2133 	u32 itag;
2134 
2135 	/* get a working reference to the output device */
2136 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2137 	if (out_dev == NULL) {
2138 		if (net_ratelimit())
2139 			pr_crit("Bug in ip_route_input_slow(). Please report.\n");
2140 		return -EINVAL;
2141 	}
2142 
2143 
2144 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2145 				  in_dev->dev, &spec_dst, &itag);
2146 	if (err < 0) {
2147 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2148 					 saddr);
2149 
2150 		goto cleanup;
2151 	}
2152 
2153 	if (err)
2154 		flags |= RTCF_DIRECTSRC;
2155 
2156 	if (out_dev == in_dev && err &&
2157 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
2158 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2159 		flags |= RTCF_DOREDIRECT;
2160 
2161 	if (skb->protocol != htons(ETH_P_IP)) {
2162 		/* Not IP (i.e. ARP). Do not create a route if it would be
2163 		 * invalid for proxy ARP. DNAT routes are always valid.
2164 		 *
2165 		 * The proxy ARP feature has been extended to allow ARP
2166 		 * replies back out the same interface, to support
2167 		 * private VLAN switch technologies. See arp.c.
2168 		 */
2169 		if (out_dev == in_dev &&
2170 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2171 			err = -EINVAL;
2172 			goto cleanup;
2173 		}
2174 	}
2175 
2176 	rth = rt_dst_alloc(out_dev->dev,
2177 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2178 			   IN_DEV_CONF_GET(out_dev, NOXFRM));
2179 	if (!rth) {
2180 		err = -ENOBUFS;
2181 		goto cleanup;
2182 	}
2183 
2184 	rth->rt_key_dst	= daddr;
2185 	rth->rt_key_src	= saddr;
2186 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2187 	rth->rt_flags = flags;
2188 	rth->rt_type = res->type;
2189 	rth->rt_key_tos	= tos;
2190 	rth->rt_dst	= daddr;
2191 	rth->rt_src	= saddr;
2192 	rth->rt_route_iif = in_dev->dev->ifindex;
2193 	rth->rt_iif 	= in_dev->dev->ifindex;
2194 	rth->rt_oif 	= 0;
2195 	rth->rt_mark    = skb->mark;
2196 	rth->rt_gateway	= daddr;
2197 	rth->rt_spec_dst = spec_dst;
2198 	rth->rt_peer_genid = 0;
2199 	rth->peer = NULL;
2200 	rth->fi = NULL;
2201 
2202 	rth->dst.input = ip_forward;
2203 	rth->dst.output = ip_output;
2204 
2205 	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2206 
2207 	*result = rth;
2208 	err = 0;
2209  cleanup:
2210 	return err;
2211 }
2212 
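/* Build a forwarding cache entry for an input route (choosing a
 * multipath nexthop first when configured) and insert it into the
 * route hash table.
 */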
2213 static int ip_mkroute_input(struct sk_buff *skb,
2214 			    struct fib_result *res,
2215 			    const struct flowi4 *fl4,
2216 			    struct in_device *in_dev,
2217 			    __be32 daddr, __be32 saddr, u32 tos)
2218 {
2219 	struct rtable *rth = NULL;
2220 	int err;
2221 	unsigned int hash;
2222 
2223 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2224 	if (res->fi && res->fi->fib_nhs > 1)
2225 		fib_select_multipath(res);
2226 #endif
2227 
2228 	/* create a routing cache entry */
2229 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2230 	if (err)
2231 		return err;
2232 
2233 	/* put it into the cache */
2234 	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2235 		       rt_genid(dev_net(rth->dst.dev)));
2236 	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2237 	if (IS_ERR(rth))
2238 		return PTR_ERR(rth);
2239 	return 0;
2240 }
2241 
2242 /*
2243  *	NOTE. We drop all packets that have a local source
2244  *	address, because every properly looped-back packet
2245  *	must already have the correct destination attached by the output routine.
2246  *
2247  *	This approach solves two big problems:
2248  *	1. Non-simplex devices are handled properly.
2249  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2250  *	Called with rcu_read_lock().
2251  */
2252 
2253 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2254 			       u8 tos, struct net_device *dev)
2255 {
2256 	struct fib_result res;
2257 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2258 	struct flowi4	fl4;
2259 	unsigned int	flags = 0;
2260 	u32		itag = 0;
2261 	struct rtable	*rth;
2262 	unsigned int	hash;
2263 	__be32		spec_dst;
2264 	int		err = -EINVAL;
2265 	struct net	*net = dev_net(dev);
2266 
2267 	/* IP on this device is disabled. */
2268 
2269 	if (!in_dev)
2270 		goto out;
2271 
2272 	/* Check for the most weird martians, which cannot be detected
2273 	   by fib_lookup.
2274 	 */
2275 
2276 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2277 	    ipv4_is_loopback(saddr))
2278 		goto martian_source;
2279 
2280 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2281 		goto brd_input;
2282 
2283 	/* Accept zero addresses only for limited broadcast;
2284 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2285 	 */
2286 	if (ipv4_is_zeronet(saddr))
2287 		goto martian_source;
2288 
2289 	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2290 		goto martian_destination;
2291 
2292 	/*
2293 	 *	Now we are ready to route packet.
2294 	 */
2295 	fl4.flowi4_oif = 0;
2296 	fl4.flowi4_iif = dev->ifindex;
2297 	fl4.flowi4_mark = skb->mark;
2298 	fl4.flowi4_tos = tos;
2299 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2300 	fl4.daddr = daddr;
2301 	fl4.saddr = saddr;
2302 	err = fib_lookup(net, &fl4, &res);
2303 	if (err != 0) {
2304 		if (!IN_DEV_FORWARD(in_dev))
2305 			goto e_hostunreach;
2306 		goto no_route;
2307 	}
2308 
2309 	RT_CACHE_STAT_INC(in_slow_tot);
2310 
2311 	if (res.type == RTN_BROADCAST)
2312 		goto brd_input;
2313 
2314 	if (res.type == RTN_LOCAL) {
2315 		err = fib_validate_source(skb, saddr, daddr, tos,
2316 					  net->loopback_dev->ifindex,
2317 					  dev, &spec_dst, &itag);
2318 		if (err < 0)
2319 			goto martian_source_keep_err;
2320 		if (err)
2321 			flags |= RTCF_DIRECTSRC;
2322 		spec_dst = daddr;
2323 		goto local_input;
2324 	}
2325 
2326 	if (!IN_DEV_FORWARD(in_dev))
2327 		goto e_hostunreach;
2328 	if (res.type != RTN_UNICAST)
2329 		goto martian_destination;
2330 
2331 	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2332 out:	return err;
2333 
2334 brd_input:
2335 	if (skb->protocol != htons(ETH_P_IP))
2336 		goto e_inval;
2337 
2338 	if (ipv4_is_zeronet(saddr))
2339 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2340 	else {
2341 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2342 					  &itag);
2343 		if (err < 0)
2344 			goto martian_source_keep_err;
2345 		if (err)
2346 			flags |= RTCF_DIRECTSRC;
2347 	}
2348 	flags |= RTCF_BROADCAST;
2349 	res.type = RTN_BROADCAST;
2350 	RT_CACHE_STAT_INC(in_brd);
2351 
2352 local_input:
2353 	rth = rt_dst_alloc(net->loopback_dev,
2354 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2355 	if (!rth)
2356 		goto e_nobufs;
2357 
2358 	rth->dst.input = ip_local_deliver;
2359 	rth->dst.output = ip_rt_bug;
2360 #ifdef CONFIG_IP_ROUTE_CLASSID
2361 	rth->dst.tclassid = itag;
2362 #endif
2363 
2364 	rth->rt_key_dst	= daddr;
2365 	rth->rt_key_src	= saddr;
2366 	rth->rt_genid = rt_genid(net);
2367 	rth->rt_flags 	= flags|RTCF_LOCAL;
2368 	rth->rt_type	= res.type;
2369 	rth->rt_key_tos	= tos;
2370 	rth->rt_dst	= daddr;
2371 	rth->rt_src	= saddr;
2375 	rth->rt_route_iif = dev->ifindex;
2376 	rth->rt_iif	= dev->ifindex;
2377 	rth->rt_oif	= 0;
2378 	rth->rt_mark    = skb->mark;
2379 	rth->rt_gateway	= daddr;
2380 	rth->rt_spec_dst = spec_dst;
2381 	rth->rt_peer_genid = 0;
2382 	rth->peer = NULL;
2383 	rth->fi = NULL;
2384 	if (res.type == RTN_UNREACHABLE) {
2385 		rth->dst.input = ip_error;
2386 		rth->dst.error = -err;
2387 		rth->rt_flags 	&= ~RTCF_LOCAL;
2388 	}
2389 	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2390 	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2391 	err = 0;
2392 	if (IS_ERR(rth))
2393 		err = PTR_ERR(rth);
2394 	goto out;
2395 
2396 no_route:
2397 	RT_CACHE_STAT_INC(in_no_route);
2398 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2399 	res.type = RTN_UNREACHABLE;
2400 	if (err == -ESRCH)
2401 		err = -ENETUNREACH;
2402 	goto local_input;
2403 
2404 	/*
2405 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2406 	 */
2407 martian_destination:
2408 	RT_CACHE_STAT_INC(in_martian_dst);
2409 #ifdef CONFIG_IP_ROUTE_VERBOSE
2410 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2411 		pr_warn("martian destination %pI4 from %pI4, dev %s\n",
2412 			&daddr, &saddr, dev->name);
2413 #endif
2414 
2415 e_hostunreach:
2416 	err = -EHOSTUNREACH;
2417 	goto out;
2418 
2419 e_inval:
2420 	err = -EINVAL;
2421 	goto out;
2422 
2423 e_nobufs:
2424 	err = -ENOBUFS;
2425 	goto out;
2426 
2427 martian_source:
2428 	err = -EINVAL;
2429 martian_source_keep_err:
2430 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2431 	goto out;
2432 }
2433 
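/* Input route lookup: try the route cache first (keyed on daddr,
 * saddr, iif, tos and mark), handle multicast separately because of
 * broken hardware filters (see below), and fall back to the slow path
 * for everything else.  The rcu read lock is taken here and held for
 * the duration of the lookup.
 */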
2434 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2435 			   u8 tos, struct net_device *dev, bool noref)
2436 {
2437 	struct rtable	*rth;
2438 	unsigned int	hash;
2439 	int iif = dev->ifindex;
2440 	struct net *net;
2441 	int res;
2442 
2443 	net = dev_net(dev);
2444 
2445 	rcu_read_lock();
2446 
2447 	if (!rt_caching(net))
2448 		goto skip_cache;
2449 
2450 	tos &= IPTOS_RT_MASK;
2451 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2452 
2453 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2454 	     rth = rcu_dereference(rth->dst.rt_next)) {
2455 		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2456 		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2457 		     (rth->rt_route_iif ^ iif) |
2458 		     (rth->rt_key_tos ^ tos)) == 0 &&
2459 		    rth->rt_mark == skb->mark &&
2460 		    net_eq(dev_net(rth->dst.dev), net) &&
2461 		    !rt_is_expired(rth)) {
2462 			ipv4_validate_peer(rth);
2463 			if (noref) {
2464 				dst_use_noref(&rth->dst, jiffies);
2465 				skb_dst_set_noref(skb, &rth->dst);
2466 			} else {
2467 				dst_use(&rth->dst, jiffies);
2468 				skb_dst_set(skb, &rth->dst);
2469 			}
2470 			RT_CACHE_STAT_INC(in_hit);
2471 			rcu_read_unlock();
2472 			return 0;
2473 		}
2474 		RT_CACHE_STAT_INC(in_hlist_search);
2475 	}
2476 
2477 skip_cache:
2478 	/* Multicast recognition logic was moved from the route cache to here.
2479 	   The problem was that too many Ethernet cards have broken/missing
2480 	   hardware multicast filters :-( As a result, a host on a multicast
2481 	   network acquires a lot of useless route cache entries, e.g. from
2482 	   SDR messages from all over the world. Now we try to get rid of them.
2483 	   Really, provided the software IP multicast filter is organized
2484 	   reasonably (at least, hashed), this does not cause a slowdown
2485 	   compared with route cache reject entries.
2486 	   Note that multicast routers are not affected, because a
2487 	   route cache entry is created for them eventually.
2488 	 */
2489 	if (ipv4_is_multicast(daddr)) {
2490 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2491 
2492 		if (in_dev) {
2493 			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2494 						  ip_hdr(skb)->protocol);
2495 			if (our
2496 #ifdef CONFIG_IP_MROUTE
2497 				||
2498 			    (!ipv4_is_local_multicast(daddr) &&
2499 			     IN_DEV_MFORWARD(in_dev))
2500 #endif
2501 			   ) {
2502 				int res = ip_route_input_mc(skb, daddr, saddr,
2503 							    tos, dev, our);
2504 				rcu_read_unlock();
2505 				return res;
2506 			}
2507 		}
2508 		rcu_read_unlock();
2509 		return -EINVAL;
2510 	}
2511 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2512 	rcu_read_unlock();
2513 	return res;
2514 }
2515 EXPORT_SYMBOL(ip_route_input_common);
2516 
2517 /* called with rcu_read_lock() */
2518 static struct rtable *__mkroute_output(const struct fib_result *res,
2519 				       const struct flowi4 *fl4,
2520 				       __be32 orig_daddr, __be32 orig_saddr,
2521 				       int orig_oif, __u8 orig_rtos,
2522 				       struct net_device *dev_out,
2523 				       unsigned int flags)
2524 {
2525 	struct fib_info *fi = res->fi;
2526 	struct in_device *in_dev;
2527 	u16 type = res->type;
2528 	struct rtable *rth;
2529 
2530 	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2531 		return ERR_PTR(-EINVAL);
2532 
2533 	if (ipv4_is_lbcast(fl4->daddr))
2534 		type = RTN_BROADCAST;
2535 	else if (ipv4_is_multicast(fl4->daddr))
2536 		type = RTN_MULTICAST;
2537 	else if (ipv4_is_zeronet(fl4->daddr))
2538 		return ERR_PTR(-EINVAL);
2539 
2540 	if (dev_out->flags & IFF_LOOPBACK)
2541 		flags |= RTCF_LOCAL;
2542 
2543 	in_dev = __in_dev_get_rcu(dev_out);
2544 	if (!in_dev)
2545 		return ERR_PTR(-EINVAL);
2546 
2547 	if (type == RTN_BROADCAST) {
2548 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2549 		fi = NULL;
2550 	} else if (type == RTN_MULTICAST) {
2551 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2552 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2553 				     fl4->flowi4_proto))
2554 			flags &= ~RTCF_LOCAL;
2555 		/* If a multicast route does not exist, use the
2556 		 * default one, but do not gateway in this case.
2557 		 * Yes, it is a hack.
2558 		 */
2559 		if (fi && res->prefixlen < 4)
2560 			fi = NULL;
2561 	}
2562 
2563 	rth = rt_dst_alloc(dev_out,
2564 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2565 			   IN_DEV_CONF_GET(in_dev, NOXFRM));
2566 	if (!rth)
2567 		return ERR_PTR(-ENOBUFS);
2568 
2569 	rth->dst.output = ip_output;
2570 
2571 	rth->rt_key_dst	= orig_daddr;
2572 	rth->rt_key_src	= orig_saddr;
2573 	rth->rt_genid = rt_genid(dev_net(dev_out));
2574 	rth->rt_flags	= flags;
2575 	rth->rt_type	= type;
2576 	rth->rt_key_tos	= orig_rtos;
2577 	rth->rt_dst	= fl4->daddr;
2578 	rth->rt_src	= fl4->saddr;
2579 	rth->rt_route_iif = 0;
2580 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
2581 	rth->rt_oif	= orig_oif;
2582 	rth->rt_mark    = fl4->flowi4_mark;
2583 	rth->rt_gateway = fl4->daddr;
2584 	rth->rt_spec_dst = fl4->saddr;
2585 	rth->rt_peer_genid = 0;
2586 	rth->peer = NULL;
2587 	rth->fi = NULL;
2588 
2589 	RT_CACHE_STAT_INC(out_slow_tot);
2590 
2591 	if (flags & RTCF_LOCAL) {
2592 		rth->dst.input = ip_local_deliver;
2593 		rth->rt_spec_dst = fl4->daddr;
2594 	}
2595 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2596 		rth->rt_spec_dst = fl4->saddr;
2597 		if (flags & RTCF_LOCAL &&
2598 		    !(dev_out->flags & IFF_LOOPBACK)) {
2599 			rth->dst.output = ip_mc_output;
2600 			RT_CACHE_STAT_INC(out_slow_mc);
2601 		}
2602 #ifdef CONFIG_IP_MROUTE
2603 		if (type == RTN_MULTICAST) {
2604 			if (IN_DEV_MFORWARD(in_dev) &&
2605 			    !ipv4_is_local_multicast(fl4->daddr)) {
2606 				rth->dst.input = ip_mr_input;
2607 				rth->dst.output = ip_mc_output;
2608 			}
2609 		}
2610 #endif
2611 	}
2612 
2613 	rt_set_nexthop(rth, fl4, res, fi, type, 0);
2614 
2615 	return rth;
2616 }
2617 
2618 /*
2619  * Major route resolver routine.
2620  * called with rcu_read_lock();
2621  */
2622 
2623 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2624 {
2625 	struct net_device *dev_out = NULL;
2626 	__u8 tos = RT_FL_TOS(fl4);
2627 	unsigned int flags = 0;
2628 	struct fib_result res;
2629 	struct rtable *rth;
2630 	__be32 orig_daddr;
2631 	__be32 orig_saddr;
2632 	int orig_oif;
2633 
2634 	res.fi		= NULL;
2635 #ifdef CONFIG_IP_MULTIPLE_TABLES
2636 	res.r		= NULL;
2637 #endif
2638 
2639 	orig_daddr = fl4->daddr;
2640 	orig_saddr = fl4->saddr;
2641 	orig_oif = fl4->flowi4_oif;
2642 
2643 	fl4->flowi4_iif = net->loopback_dev->ifindex;
2644 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2645 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2646 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2647 
2648 	rcu_read_lock();
2649 	if (fl4->saddr) {
2650 		rth = ERR_PTR(-EINVAL);
2651 		if (ipv4_is_multicast(fl4->saddr) ||
2652 		    ipv4_is_lbcast(fl4->saddr) ||
2653 		    ipv4_is_zeronet(fl4->saddr))
2654 			goto out;
2655 
2656 		/* I removed the check for oif == dev_out->oif here.
2657 		   It was wrong for two reasons:
2658 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2659 		      is assigned to multiple interfaces.
2660 		   2. Moreover, we are allowed to send packets with the saddr
2661 		      of another iface. --ANK
2662 		 */
2663 
2664 		if (fl4->flowi4_oif == 0 &&
2665 		    (ipv4_is_multicast(fl4->daddr) ||
2666 		     ipv4_is_lbcast(fl4->daddr))) {
2667 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2668 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2669 			if (dev_out == NULL)
2670 				goto out;
2671 
2672 			/* Special hack: the user can direct multicasts
2673 			   and limited broadcast via the necessary interface
2674 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2675 			   This hack is not just for fun, it allows
2676 			   vic, vat and friends to work.
2677 			   They bind a socket to loopback, set the TTL to zero
2678 			   and expect that it will work.
2679 			   From the viewpoint of the routing cache they are broken,
2680 			   because we are not allowed to build a multicast path
2681 			   with a loopback source address (the routing cache
2682 			   cannot know that the TTL is zero, so the packet
2683 			   will not leave this host and the route is valid).
2684 			   Luckily, this hack is a good workaround.
2685 			 */
2686 
2687 			fl4->flowi4_oif = dev_out->ifindex;
2688 			goto make_route;
2689 		}
2690 
2691 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2692 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2693 			if (!__ip_dev_find(net, fl4->saddr, false))
2694 				goto out;
2695 		}
2696 	}
2697 
2698 
2699 	if (fl4->flowi4_oif) {
2700 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2701 		rth = ERR_PTR(-ENODEV);
2702 		if (dev_out == NULL)
2703 			goto out;
2704 
2705 		/* RACE: Check return value of inet_select_addr instead. */
2706 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2707 			rth = ERR_PTR(-ENETUNREACH);
2708 			goto out;
2709 		}
2710 		if (ipv4_is_local_multicast(fl4->daddr) ||
2711 		    ipv4_is_lbcast(fl4->daddr)) {
2712 			if (!fl4->saddr)
2713 				fl4->saddr = inet_select_addr(dev_out, 0,
2714 							      RT_SCOPE_LINK);
2715 			goto make_route;
2716 		}
2717 		if (fl4->saddr) {
2718 			if (ipv4_is_multicast(fl4->daddr))
2719 				fl4->saddr = inet_select_addr(dev_out, 0,
2720 							      fl4->flowi4_scope);
2721 			else if (!fl4->daddr)
2722 				fl4->saddr = inet_select_addr(dev_out, 0,
2723 							      RT_SCOPE_HOST);
2724 		}
2725 	}
2726 
2727 	if (!fl4->daddr) {
2728 		fl4->daddr = fl4->saddr;
2729 		if (!fl4->daddr)
2730 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2731 		dev_out = net->loopback_dev;
2732 		fl4->flowi4_oif = net->loopback_dev->ifindex;
2733 		res.type = RTN_LOCAL;
2734 		flags |= RTCF_LOCAL;
2735 		goto make_route;
2736 	}
2737 
2738 	if (fib_lookup(net, fl4, &res)) {
2739 		res.fi = NULL;
2740 		if (fl4->flowi4_oif) {
2741 			/* Apparently, the routing tables are wrong. Assume
2742 			   that the destination is on-link.
2743 
2744 			   WHY? DW.
2745 			   Because we are allowed to send to an iface
2746 			   even if it has NO routes and NO assigned
2747 			   addresses. When oif is specified, the routing
2748 			   tables are looked up with only one purpose:
2749 			   to check whether the destination is gatewayed,
2750 			   rather than direct. Moreover, if MSG_DONTROUTE is set,
2751 			   we send the packet, ignoring both the routing tables
2752 			   and the ifaddr state. --ANK
2753 
2754 
2755 			   We could do this even when oif is unknown,
2756 			   as IPv6 likely does, but we do not.
2757 			 */
2758 
2759 			if (fl4->saddr == 0)
2760 				fl4->saddr = inet_select_addr(dev_out, 0,
2761 							      RT_SCOPE_LINK);
2762 			res.type = RTN_UNICAST;
2763 			goto make_route;
2764 		}
2765 		rth = ERR_PTR(-ENETUNREACH);
2766 		goto out;
2767 	}
2768 
2769 	if (res.type == RTN_LOCAL) {
2770 		if (!fl4->saddr) {
2771 			if (res.fi->fib_prefsrc)
2772 				fl4->saddr = res.fi->fib_prefsrc;
2773 			else
2774 				fl4->saddr = fl4->daddr;
2775 		}
2776 		dev_out = net->loopback_dev;
2777 		fl4->flowi4_oif = dev_out->ifindex;
2778 		res.fi = NULL;
2779 		flags |= RTCF_LOCAL;
2780 		goto make_route;
2781 	}
2782 
2783 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2784 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2785 		fib_select_multipath(&res);
2786 	else
2787 #endif
2788 	if (!res.prefixlen &&
2789 	    res.table->tb_num_default > 1 &&
2790 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2791 		fib_select_default(&res);
2792 
2793 	if (!fl4->saddr)
2794 		fl4->saddr = FIB_RES_PREFSRC(net, res);
2795 
2796 	dev_out = FIB_RES_DEV(res);
2797 	fl4->flowi4_oif = dev_out->ifindex;
2798 
2799 
2800 make_route:
2801 	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2802 			       tos, dev_out, flags);
2803 	if (!IS_ERR(rth)) {
2804 		unsigned int hash;
2805 
2806 		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2807 			       rt_genid(dev_net(dev_out)));
2808 		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2809 	}
2810 
2811 out:
2812 	rcu_read_unlock();
2813 	return rth;
2814 }
2815 
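/* Output route lookup: scan the cached output routes for a match on
 * the flow key and fall back to ip_route_output_slow() on a miss or
 * when caching is disabled.
 */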
2816 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2817 {
2818 	struct rtable *rth;
2819 	unsigned int hash;
2820 
2821 	if (!rt_caching(net))
2822 		goto slow_output;
2823 
2824 	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2825 
2826 	rcu_read_lock_bh();
2827 	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2828 		rth = rcu_dereference_bh(rth->dst.rt_next)) {
2829 		if (rth->rt_key_dst == flp4->daddr &&
2830 		    rth->rt_key_src == flp4->saddr &&
2831 		    rt_is_output_route(rth) &&
2832 		    rth->rt_oif == flp4->flowi4_oif &&
2833 		    rth->rt_mark == flp4->flowi4_mark &&
2834 		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2835 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2836 		    net_eq(dev_net(rth->dst.dev), net) &&
2837 		    !rt_is_expired(rth)) {
2838 			ipv4_validate_peer(rth);
2839 			dst_use(&rth->dst, jiffies);
2840 			RT_CACHE_STAT_INC(out_hit);
2841 			rcu_read_unlock_bh();
2842 			if (!flp4->saddr)
2843 				flp4->saddr = rth->rt_src;
2844 			if (!flp4->daddr)
2845 				flp4->daddr = rth->rt_dst;
2846 			return rth;
2847 		}
2848 		RT_CACHE_STAT_INC(out_hlist_search);
2849 	}
2850 	rcu_read_unlock_bh();
2851 
2852 slow_output:
2853 	return ip_route_output_slow(net, flp4);
2854 }
2855 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2856 
2857 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2858 {
2859 	return NULL;
2860 }
2861 
2862 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2863 {
2864 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2865 
2866 	return mtu ? : dst->dev->mtu;
2867 }
2868 
2869 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2870 {
2871 }
2872 
2873 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2874 					  unsigned long old)
2875 {
2876 	return NULL;
2877 }
2878 
2879 static struct dst_ops ipv4_dst_blackhole_ops = {
2880 	.family			=	AF_INET,
2881 	.protocol		=	cpu_to_be16(ETH_P_IP),
2882 	.destroy		=	ipv4_dst_destroy,
2883 	.check			=	ipv4_blackhole_dst_check,
2884 	.mtu			=	ipv4_blackhole_mtu,
2885 	.default_advmss		=	ipv4_default_advmss,
2886 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2887 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2888 	.neigh_lookup		=	ipv4_neigh_lookup,
2889 };
2890 
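/* Clone an existing route into a blackhole dst whose input and output
 * handlers simply discard packets, while copying the original keys,
 * metrics and peer/fib_info references.
 */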
2891 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2892 {
2893 	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2894 	struct rtable *ort = (struct rtable *) dst_orig;
2895 
2896 	if (rt) {
2897 		struct dst_entry *new = &rt->dst;
2898 
2899 		new->__use = 1;
2900 		new->input = dst_discard;
2901 		new->output = dst_discard;
2902 		dst_copy_metrics(new, &ort->dst);
2903 
2904 		new->dev = ort->dst.dev;
2905 		if (new->dev)
2906 			dev_hold(new->dev);
2907 
2908 		rt->rt_key_dst = ort->rt_key_dst;
2909 		rt->rt_key_src = ort->rt_key_src;
2910 		rt->rt_key_tos = ort->rt_key_tos;
2911 		rt->rt_route_iif = ort->rt_route_iif;
2912 		rt->rt_iif = ort->rt_iif;
2913 		rt->rt_oif = ort->rt_oif;
2914 		rt->rt_mark = ort->rt_mark;
2915 
2916 		rt->rt_genid = rt_genid(net);
2917 		rt->rt_flags = ort->rt_flags;
2918 		rt->rt_type = ort->rt_type;
2919 		rt->rt_dst = ort->rt_dst;
2920 		rt->rt_src = ort->rt_src;
2921 		rt->rt_gateway = ort->rt_gateway;
2922 		rt->rt_spec_dst = ort->rt_spec_dst;
2923 		rt->peer = ort->peer;
2924 		if (rt->peer)
2925 			atomic_inc(&rt->peer->refcnt);
2926 		rt->fi = ort->fi;
2927 		if (rt->fi)
2928 			atomic_inc(&rt->fi->fib_clntref);
2929 
2930 		dst_free(new);
2931 	}
2932 
2933 	dst_release(dst_orig);
2934 
2935 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2936 }
2937 
2938 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2939 				    struct sock *sk)
2940 {
2941 	struct rtable *rt = __ip_route_output_key(net, flp4);
2942 
2943 	if (IS_ERR(rt))
2944 		return rt;
2945 
2946 	if (flp4->flowi4_proto)
2947 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2948 						   flowi4_to_flowi(flp4),
2949 						   sk, 0);
2950 
2951 	return rt;
2952 }
2953 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2954 
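/* Translate a route cache entry into an RTM_NEWROUTE netlink message:
 * the rtmsg header, address/oif/mark attributes, metrics, and cache
 * info (IP id, TCP timestamps, PMTU expiry) taken from the bound
 * inet_peer when there is one.
 */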
2955 static int rt_fill_info(struct net *net,
2956 			struct sk_buff *skb, u32 pid, u32 seq, int event,
2957 			int nowait, unsigned int flags)
2958 {
2959 	struct rtable *rt = skb_rtable(skb);
2960 	struct rtmsg *r;
2961 	struct nlmsghdr *nlh;
2962 	unsigned long expires = 0;
2963 	const struct inet_peer *peer = rt->peer;
2964 	u32 id = 0, ts = 0, tsage = 0, error;
2965 
2966 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2967 	if (nlh == NULL)
2968 		return -EMSGSIZE;
2969 
2970 	r = nlmsg_data(nlh);
2971 	r->rtm_family	 = AF_INET;
2972 	r->rtm_dst_len	= 32;
2973 	r->rtm_src_len	= 0;
2974 	r->rtm_tos	= rt->rt_key_tos;
2975 	r->rtm_table	= RT_TABLE_MAIN;
2976 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2977 	r->rtm_type	= rt->rt_type;
2978 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2979 	r->rtm_protocol = RTPROT_UNSPEC;
2980 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2981 	if (rt->rt_flags & RTCF_NOTIFY)
2982 		r->rtm_flags |= RTM_F_NOTIFY;
2983 
2984 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2985 
2986 	if (rt->rt_key_src) {
2987 		r->rtm_src_len = 32;
2988 		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2989 	}
2990 	if (rt->dst.dev)
2991 		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2992 #ifdef CONFIG_IP_ROUTE_CLASSID
2993 	if (rt->dst.tclassid)
2994 		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2995 #endif
2996 	if (rt_is_input_route(rt))
2997 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2998 	else if (rt->rt_src != rt->rt_key_src)
2999 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3000 
3001 	if (rt->rt_dst != rt->rt_gateway)
3002 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3003 
3004 	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3005 		goto nla_put_failure;
3006 
3007 	if (rt->rt_mark)
3008 		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3009 
3010 	error = rt->dst.error;
3011 	if (peer) {
3012 		inet_peer_refcheck(rt->peer);
3013 		id = atomic_read(&peer->ip_id_count) & 0xffff;
3014 		if (peer->tcp_ts_stamp) {
3015 			ts = peer->tcp_ts;
3016 			tsage = get_seconds() - peer->tcp_ts_stamp;
3017 		}
3018 		expires = ACCESS_ONCE(peer->pmtu_expires);
3019 		if (expires) {
3020 			if (time_before(jiffies, expires))
3021 				expires -= jiffies;
3022 			else
3023 				expires = 0;
3024 		}
3025 	}
3026 
3027 	if (rt_is_input_route(rt)) {
3028 #ifdef CONFIG_IP_MROUTE
3029 		__be32 dst = rt->rt_dst;
3030 
3031 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3032 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3033 			int err = ipmr_get_route(net, skb,
3034 						 rt->rt_src, rt->rt_dst,
3035 						 r, nowait);
3036 			if (err <= 0) {
3037 				if (!nowait) {
3038 					if (err == 0)
3039 						return 0;
3040 					goto nla_put_failure;
3041 				} else {
3042 					if (err == -EMSGSIZE)
3043 						goto nla_put_failure;
3044 					error = err;
3045 				}
3046 			}
3047 		} else
3048 #endif
3049 			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3050 	}
3051 
3052 	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3053 			       expires, error) < 0)
3054 		goto nla_put_failure;
3055 
3056 	return nlmsg_end(skb, nlh);
3057 
3058 nla_put_failure:
3059 	nlmsg_cancel(skb, nlh);
3060 	return -EMSGSIZE;
3061 }
3062 
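/* RTM_GETROUTE handler: build a dummy skb, resolve the requested flow
 * either via ip_route_input() (when an input interface is given) or
 * ip_route_output_key(), and report the result with rt_fill_info().
 */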
3063 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
3064 {
3065 	struct net *net = sock_net(in_skb->sk);
3066 	struct rtmsg *rtm;
3067 	struct nlattr *tb[RTA_MAX+1];
3068 	struct rtable *rt = NULL;
3069 	__be32 dst = 0;
3070 	__be32 src = 0;
3071 	u32 iif;
3072 	int err;
3073 	int mark;
3074 	struct sk_buff *skb;
3075 
3076 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3077 	if (err < 0)
3078 		goto errout;
3079 
3080 	rtm = nlmsg_data(nlh);
3081 
3082 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3083 	if (skb == NULL) {
3084 		err = -ENOBUFS;
3085 		goto errout;
3086 	}
3087 
3088 	/* Reserve room for dummy headers; this skb can pass
3089 	   through a good chunk of the routing engine.
3090 	 */
3091 	skb_reset_mac_header(skb);
3092 	skb_reset_network_header(skb);
3093 
3094 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3095 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
3096 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3097 
3098 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3099 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3100 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3101 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3102 
3103 	if (iif) {
3104 		struct net_device *dev;
3105 
3106 		dev = __dev_get_by_index(net, iif);
3107 		if (dev == NULL) {
3108 			err = -ENODEV;
3109 			goto errout_free;
3110 		}
3111 
3112 		skb->protocol	= htons(ETH_P_IP);
3113 		skb->dev	= dev;
3114 		skb->mark	= mark;
3115 		local_bh_disable();
3116 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3117 		local_bh_enable();
3118 
3119 		rt = skb_rtable(skb);
3120 		if (err == 0 && rt->dst.error)
3121 			err = -rt->dst.error;
3122 	} else {
3123 		struct flowi4 fl4 = {
3124 			.daddr = dst,
3125 			.saddr = src,
3126 			.flowi4_tos = rtm->rtm_tos,
3127 			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3128 			.flowi4_mark = mark,
3129 		};
3130 		rt = ip_route_output_key(net, &fl4);
3131 
3132 		err = 0;
3133 		if (IS_ERR(rt))
3134 			err = PTR_ERR(rt);
3135 	}
3136 
3137 	if (err)
3138 		goto errout_free;
3139 
3140 	skb_dst_set(skb, &rt->dst);
3141 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3142 		rt->rt_flags |= RTCF_NOTIFY;
3143 
3144 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3145 			   RTM_NEWROUTE, 0, 0);
3146 	if (err <= 0)
3147 		goto errout_free;
3148 
3149 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3150 errout:
3151 	return err;
3152 
3153 errout_free:
3154 	kfree_skb(skb);
3155 	goto errout;
3156 }
3157 
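/* Dump the whole route cache to netlink, walking each hash chain under
 * rcu_read_lock_bh() and resuming from cb->args[] on subsequent calls.
 */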
3158 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3159 {
3160 	struct rtable *rt;
3161 	int h, s_h;
3162 	int idx, s_idx;
3163 	struct net *net;
3164 
3165 	net = sock_net(skb->sk);
3166 
3167 	s_h = cb->args[0];
3168 	if (s_h < 0)
3169 		s_h = 0;
3170 	s_idx = idx = cb->args[1];
3171 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3172 		if (!rt_hash_table[h].chain)
3173 			continue;
3174 		rcu_read_lock_bh();
3175 		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3176 		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3177 			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3178 				continue;
3179 			if (rt_is_expired(rt))
3180 				continue;
3181 			skb_dst_set_noref(skb, &rt->dst);
3182 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3183 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3184 					 1, NLM_F_MULTI) <= 0) {
3185 				skb_dst_drop(skb);
3186 				rcu_read_unlock_bh();
3187 				goto done;
3188 			}
3189 			skb_dst_drop(skb);
3190 		}
3191 		rcu_read_unlock_bh();
3192 	}
3193 
3194 done:
3195 	cb->args[0] = h;
3196 	cb->args[1] = idx;
3197 	return skb->len;
3198 }
3199 
3200 void ip_rt_multicast_event(struct in_device *in_dev)
3201 {
3202 	rt_cache_flush(dev_net(in_dev->dev), 0);
3203 }
3204 
3205 #ifdef CONFIG_SYSCTL
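/* sysctl handler for net.ipv4.route.flush: writing a delay value
 * flushes the owning namespace's route cache; reads are rejected.
 */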
3206 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3207 					void __user *buffer,
3208 					size_t *lenp, loff_t *ppos)
3209 {
3210 	if (write) {
3211 		int flush_delay;
3212 		ctl_table ctl;
3213 		struct net *net;
3214 
3215 		memcpy(&ctl, __ctl, sizeof(ctl));
3216 		ctl.data = &flush_delay;
3217 		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3218 
3219 		net = (struct net *)__ctl->extra1;
3220 		rt_cache_flush(net, flush_delay);
3221 		return 0;
3222 	}
3223 
3224 	return -EINVAL;
3225 }
3226 
3227 static ctl_table ipv4_route_table[] = {
3228 	{
3229 		.procname	= "gc_thresh",
3230 		.data		= &ipv4_dst_ops.gc_thresh,
3231 		.maxlen		= sizeof(int),
3232 		.mode		= 0644,
3233 		.proc_handler	= proc_dointvec,
3234 	},
3235 	{
3236 		.procname	= "max_size",
3237 		.data		= &ip_rt_max_size,
3238 		.maxlen		= sizeof(int),
3239 		.mode		= 0644,
3240 		.proc_handler	= proc_dointvec,
3241 	},
3242 	{
3243 		/*  Deprecated. Use gc_min_interval_ms */
3244 
3245 		.procname	= "gc_min_interval",
3246 		.data		= &ip_rt_gc_min_interval,
3247 		.maxlen		= sizeof(int),
3248 		.mode		= 0644,
3249 		.proc_handler	= proc_dointvec_jiffies,
3250 	},
3251 	{
3252 		.procname	= "gc_min_interval_ms",
3253 		.data		= &ip_rt_gc_min_interval,
3254 		.maxlen		= sizeof(int),
3255 		.mode		= 0644,
3256 		.proc_handler	= proc_dointvec_ms_jiffies,
3257 	},
3258 	{
3259 		.procname	= "gc_timeout",
3260 		.data		= &ip_rt_gc_timeout,
3261 		.maxlen		= sizeof(int),
3262 		.mode		= 0644,
3263 		.proc_handler	= proc_dointvec_jiffies,
3264 	},
3265 	{
3266 		.procname	= "gc_interval",
3267 		.data		= &ip_rt_gc_interval,
3268 		.maxlen		= sizeof(int),
3269 		.mode		= 0644,
3270 		.proc_handler	= proc_dointvec_jiffies,
3271 	},
3272 	{
3273 		.procname	= "redirect_load",
3274 		.data		= &ip_rt_redirect_load,
3275 		.maxlen		= sizeof(int),
3276 		.mode		= 0644,
3277 		.proc_handler	= proc_dointvec,
3278 	},
3279 	{
3280 		.procname	= "redirect_number",
3281 		.data		= &ip_rt_redirect_number,
3282 		.maxlen		= sizeof(int),
3283 		.mode		= 0644,
3284 		.proc_handler	= proc_dointvec,
3285 	},
3286 	{
3287 		.procname	= "redirect_silence",
3288 		.data		= &ip_rt_redirect_silence,
3289 		.maxlen		= sizeof(int),
3290 		.mode		= 0644,
3291 		.proc_handler	= proc_dointvec,
3292 	},
3293 	{
3294 		.procname	= "error_cost",
3295 		.data		= &ip_rt_error_cost,
3296 		.maxlen		= sizeof(int),
3297 		.mode		= 0644,
3298 		.proc_handler	= proc_dointvec,
3299 	},
3300 	{
3301 		.procname	= "error_burst",
3302 		.data		= &ip_rt_error_burst,
3303 		.maxlen		= sizeof(int),
3304 		.mode		= 0644,
3305 		.proc_handler	= proc_dointvec,
3306 	},
3307 	{
3308 		.procname	= "gc_elasticity",
3309 		.data		= &ip_rt_gc_elasticity,
3310 		.maxlen		= sizeof(int),
3311 		.mode		= 0644,
3312 		.proc_handler	= proc_dointvec,
3313 	},
3314 	{
3315 		.procname	= "mtu_expires",
3316 		.data		= &ip_rt_mtu_expires,
3317 		.maxlen		= sizeof(int),
3318 		.mode		= 0644,
3319 		.proc_handler	= proc_dointvec_jiffies,
3320 	},
3321 	{
3322 		.procname	= "min_pmtu",
3323 		.data		= &ip_rt_min_pmtu,
3324 		.maxlen		= sizeof(int),
3325 		.mode		= 0644,
3326 		.proc_handler	= proc_dointvec,
3327 	},
3328 	{
3329 		.procname	= "min_adv_mss",
3330 		.data		= &ip_rt_min_advmss,
3331 		.maxlen		= sizeof(int),
3332 		.mode		= 0644,
3333 		.proc_handler	= proc_dointvec,
3334 	},
3335 	{ }
3336 };
3337 
3338 static struct ctl_table empty[1];
3339 
3340 static struct ctl_table ipv4_skeleton[] =
3341 {
3342 	{ .procname = "route",
3343 	  .mode = 0555, .child = ipv4_route_table},
3344 	{ .procname = "neigh",
3345 	  .mode = 0555, .child = empty},
3346 	{ }
3347 };
3348 
3349 static __net_initdata struct ctl_path ipv4_path[] = {
3350 	{ .procname = "net", },
3351 	{ .procname = "ipv4", },
3352 	{ },
3353 };
3354 
3355 static struct ctl_table ipv4_route_flush_table[] = {
3356 	{
3357 		.procname	= "flush",
3358 		.maxlen		= sizeof(int),
3359 		.mode		= 0200,
3360 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3361 	},
3362 	{ },
3363 };
3364 
3365 static __net_initdata struct ctl_path ipv4_route_path[] = {
3366 	{ .procname = "net", },
3367 	{ .procname = "ipv4", },
3368 	{ .procname = "route", },
3369 	{ },
3370 };
3371 
3372 static __net_init int sysctl_route_net_init(struct net *net)
3373 {
3374 	struct ctl_table *tbl;
3375 
3376 	tbl = ipv4_route_flush_table;
3377 	if (!net_eq(net, &init_net)) {
3378 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3379 		if (tbl == NULL)
3380 			goto err_dup;
3381 	}
3382 	tbl[0].extra1 = net;
3383 
3384 	net->ipv4.route_hdr =
3385 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3386 	if (net->ipv4.route_hdr == NULL)
3387 		goto err_reg;
3388 	return 0;
3389 
3390 err_reg:
3391 	if (tbl != ipv4_route_flush_table)
3392 		kfree(tbl);
3393 err_dup:
3394 	return -ENOMEM;
3395 }
3396 
3397 static __net_exit void sysctl_route_net_exit(struct net *net)
3398 {
3399 	struct ctl_table *tbl;
3400 
3401 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3402 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3403 	BUG_ON(tbl == ipv4_route_flush_table);
3404 	kfree(tbl);
3405 }
3406 
3407 static __net_initdata struct pernet_operations sysctl_route_ops = {
3408 	.init = sysctl_route_net_init,
3409 	.exit = sysctl_route_net_exit,
3410 };
3411 #endif
3412 
3413 static __net_init int rt_genid_init(struct net *net)
3414 {
3415 	get_random_bytes(&net->ipv4.rt_genid,
3416 			 sizeof(net->ipv4.rt_genid));
3417 	get_random_bytes(&net->ipv4.dev_addr_genid,
3418 			 sizeof(net->ipv4.dev_addr_genid));
3419 	return 0;
3420 }
3421 
3422 static __net_initdata struct pernet_operations rt_genid_ops = {
3423 	.init = rt_genid_init,
3424 };
3425 
3426 
3427 #ifdef CONFIG_IP_ROUTE_CLASSID
3428 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3429 #endif /* CONFIG_IP_ROUTE_CLASSID */
3430 
3431 static __initdata unsigned long rhash_entries;
3432 static int __init set_rhash_entries(char *str)
3433 {
3434 	if (!str)
3435 		return 0;
3436 	rhash_entries = simple_strtoul(str, &str, 0);
3437 	return 1;
3438 }
3439 __setup("rhash_entries=", set_rhash_entries);
3440 
3441 int __init ip_rt_init(void)
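/* Boot-time initialisation of the IPv4 routing layer: the per-cpu
 * classid accounting area, the dst slab caches, the route hash table
 * (sized from rhash_entries= or available memory), the periodic cache
 * expiry work, /proc files, and the netlink and sysctl hooks.
 */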
3442 {
3443 	int rc = 0;
3444 
3445 #ifdef CONFIG_IP_ROUTE_CLASSID
3446 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3447 	if (!ip_rt_acct)
3448 		panic("IP: failed to allocate ip_rt_acct\n");
3449 #endif
3450 
3451 	ipv4_dst_ops.kmem_cachep =
3452 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3453 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3454 
3455 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3456 
3457 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3458 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3459 
3460 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3461 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3462 
3463 	rt_hash_table = (struct rt_hash_bucket *)
3464 		alloc_large_system_hash("IP route cache",
3465 					sizeof(struct rt_hash_bucket),
3466 					rhash_entries,
3467 					(totalram_pages >= 128 * 1024) ?
3468 					15 : 17,
3469 					0,
3470 					&rt_hash_log,
3471 					&rt_hash_mask,
3472 					rhash_entries ? 0 : 512 * 1024);
3473 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3474 	rt_hash_lock_init();
3475 
3476 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3477 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3478 
3479 	devinet_init();
3480 	ip_fib_init();
3481 
3482 	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3483 	expires_ljiffies = jiffies;
3484 	schedule_delayed_work(&expires_work,
3485 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3486 
3487 	if (ip_rt_proc_init())
3488 		pr_err("Unable to create route proc files\n");
3489 #ifdef CONFIG_XFRM
3490 	xfrm_init();
3491 	xfrm4_init(ip_rt_max_size);
3492 #endif
3493 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3494 
3495 #ifdef CONFIG_SYSCTL
3496 	register_pernet_subsys(&sysctl_route_ops);
3497 #endif
3498 	register_pernet_subsys(&rt_genid_ops);
3499 	return rc;
3500 }
3501 
3502 #ifdef CONFIG_SYSCTL
3503 /*
3504  * We really need to sanitize the damn ipv4 init order, then all
3505  * this nonsense will go away.
3506  */
3507 void __init ip_static_sysctl_init(void)
3508 {
3509 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3510 }
3511 #endif
3512