/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	: 	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 * 		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

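/*
 * Tunables for the routing cache.  The values below are the defaults;
 * most of them are exported via /proc/sys/net/ipv4/route.
 */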
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

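/*
 * Copy-on-write handling for dst metrics: the first time a writable
 * metric is needed, copy the current (possibly shared, read-only)
 * metrics into storage owned by the inet_peer of this destination and
 * atomically repoint dst->_metrics at the private copy.  Losing the
 * cmpxchg race means another CPU already installed its own copy.
 */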
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries;
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (With lockdep a spinlock_t is quite big, so keep the table small there.)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned int		rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

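/*
 * Hash of the route-cache lookup key (daddr, saddr, ifindex), mixed
 * with the per-namespace generation id and masked to the table size.
 */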
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		rcu_read_lock();
		n = dst_get_neighbour_noref(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,
			HHUptod,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in the hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

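/*
 * compare_hash_inputs() checks only the fields that feed rt_hash(),
 * while compare_keys() below checks the full lookup key (additionally
 * mark, TOS and oif).  Entries equal under the former but not the
 * latter are aliases sharing a single hash chain.
 */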
static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called from softirq or process context.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute the average chain length
 * and its standard deviation, using fixed-point arithmetic.
 * This gives an estimation for rt_chain_length_max:
 *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
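/* For example, with FRACT_BITS == 3 an average chain length of 2.5 is
 * stored as 20 (2.5 * ONE); rt_chain_length_max is recovered from the
 * fixed-point value by shifting right by FRACT_BITS.
 */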

/*
 * Given a hash chain and an item in this hash chain,
 * find whether a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif).
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

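/*
 * Scan a slice of the hash table proportional to the time elapsed
 * since the previous scan, freeing expired entries and gathering the
 * chain-length statistics used to refresh rt_chain_length_max.
 */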
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturb rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without reusing a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	inetpeer_invalidate_tree(AF_INET);
}

/*
 * delay < 0  : invalidate cache (fast: entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush entries invalidated by a previous rt_cache_invalidate() call */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	net_warn_ratelimited("Route hash chain too long!\n");
	rt_cache_invalidate(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm that keeps the routing cache
   at an equilibrium point, where the number of aged-off entries
   stays approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   "expire" is large enough to keep enough warm entries around, and
   when load increases it shrinks to limit the cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * so do not run it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate the number of entries we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache
		 * really aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* The goal was not achieved. We stop the process if:

		   - expire was reduced to zero; otherwise expire is halved.
		   - the table is not full.
		   - we are called from interrupt context.
		   - the jiffies check is just a fallback/debug loop breaker;
		     we will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	net_warn_ratelimited("dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}

/*
 * Returns the number of entries in a hash chain that have distinct hash inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}

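/*
 * Resolve the neighbour for a route: ARP on the gateway when one is
 * set, on the destination itself otherwise, and on INADDR_ANY for
 * loopback and point-to-point devices where the address is irrelevant.
 */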
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;

	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;
	else if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}

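/*
 * Insert rt into the hash chain selected by @hash, unless an
 * equivalent entry already exists, in which case that entry is moved
 * to the front of the chain and returned instead.  Overly long chains
 * are trimmed by evicting the lowest-scored unreferenced entry, and
 * pathologically long ones trigger an emergency cache invalidation.
 */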
static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just hand rt back to the
		 * caller, so the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting for a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				net_warn_ratelimited("Neighbour table failure & not caching routes\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when it is exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind the route to an ARP neighbour only if it is an
	   output route or on the unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* The neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   it most likely holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			net_warn_ratelimited("Neighbour table overflow\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}

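/*
 * Bumped whenever state cached in an inet_peer (a learned PMTU or
 * redirect) changes; routes compare their rt_peer_genid against it in
 * ipv4_validate_peer() to revalidate lazily.
 */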
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* Once a peer is attached to a destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else if (!rt)
		pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void rt_del(unsigned int hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

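/*
 * Switch a cached route over to the gateway learned from an ICMP
 * redirect (peer->redirect_learned.a4) and rebind its neighbour entry,
 * restoring the old gateway if neighbour creation fails.
 */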
static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n)) {
		rt->rt_gateway = orig_gw;
		return;
	}
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!(n->nud_state & NUD_VALID)) {
		neigh_event_send(n, NULL);
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
}

/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw) {
						peer->redirect_learned.a4 = new_gw;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
#endif
	;
}

static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set peer->rate_last to the time of the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, rt->rt_iif,
					     &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}

unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}

static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}

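/*
 * Lazily re-sync a cached route with its inet_peer: if the peer
 * generation has moved on, re-apply any learned PMTU and redirect
 * information before the route is used again.
 */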
static void ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway)
				check_peer_redir(&rt->dst, peer);
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	ipv4_validate_peer(rt);
	return dst;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

1900 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1901 {
1902 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1903 
1904 	if (advmss == 0) {
1905 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1906 			       ip_rt_min_advmss);
1907 		if (advmss > 65535 - 40)
1908 			advmss = 65535 - 40;
1909 	}
1910 	return advmss;
1911 }
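/* Worked example: on a 1500-byte-MTU Ethernet device with no explicit
 * RTAX_ADVMSS metric, the advertised MSS defaults to 1500 - 40 = 1460,
 * i.e. the device MTU minus 40 bytes for the minimal IPv4 + TCP headers,
 * clamped below by the ip_rt_min_advmss sysctl and above by 65535 - 40.
 */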
1912 
1913 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1914 {
1915 	const struct rtable *rt = (const struct rtable *) dst;
1916 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1917 
1918 	if (mtu && rt_is_output_route(rt))
1919 		return mtu;
1920 
1921 	mtu = dst->dev->mtu;
1922 
1923 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1924 
1925 		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1926 			mtu = 576;
1927 	}
1928 
1929 	if (mtu > IP_MAX_MTU)
1930 		mtu = IP_MAX_MTU;
1931 
1932 	return mtu;
1933 }
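/* Worked example: when the RTAX_MTU metric is locked and the route goes
 * via a gateway (rt_gateway != rt_dst), the device MTU is clamped to
 * 576, the classic minimum datagram size every IPv4 host must accept,
 * and the result is always capped at IP_MAX_MTU.
 */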
1934 
1935 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1936 			    struct fib_info *fi)
1937 {
1938 	struct inet_peer *peer;
1939 	int create = 0;
1940 
1941 	/* If a peer entry exists for this destination, we must hook
1942 	 * it up in order to get at cached metrics.
1943 	 */
1944 	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1945 		create = 1;
1946 
1947 	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1948 	if (peer) {
1949 		rt->rt_peer_genid = rt_peer_genid();
1950 		if (inet_metrics_new(peer))
1951 			memcpy(peer->metrics, fi->fib_metrics,
1952 			       sizeof(u32) * RTAX_MAX);
1953 		dst_init_metrics(&rt->dst, peer->metrics, false);
1954 
1955 		check_peer_pmtu(&rt->dst, peer);
1956 
1957 		if (peer->redirect_learned.a4 &&
1958 		    peer->redirect_learned.a4 != rt->rt_gateway) {
1959 			rt->rt_gateway = peer->redirect_learned.a4;
1960 			rt->rt_flags |= RTCF_REDIRECTED;
1961 		}
1962 	} else {
1963 		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1964 			rt->fi = fi;
1965 			atomic_inc(&fi->fib_clntref);
1966 		}
1967 		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1968 	}
1969 }
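/* Two ownership models for metrics follow from the code above: with a
 * peer, metrics live in the shared inet_peer entry and stay writable
 * (read-only flag false) so learned PMTU/redirect data can be stored;
 * with no peer, the dst points at the fib_info's template array marked
 * read-only, and any write goes through the dst_ops cow_metrics hook to
 * obtain a private copy first.
 */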
1970 
1971 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1972 			   const struct fib_result *res,
1973 			   struct fib_info *fi, u16 type, u32 itag)
1974 {
1975 	struct dst_entry *dst = &rt->dst;
1976 
1977 	if (fi) {
1978 		if (FIB_RES_GW(*res) &&
1979 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1980 			rt->rt_gateway = FIB_RES_GW(*res);
1981 		rt_init_metrics(rt, fl4, fi);
1982 #ifdef CONFIG_IP_ROUTE_CLASSID
1983 		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1984 #endif
1985 	}
1986 
1987 	if (dst_mtu(dst) > IP_MAX_MTU)
1988 		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1989 	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1990 		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1991 
1992 #ifdef CONFIG_IP_ROUTE_CLASSID
1993 #ifdef CONFIG_IP_MULTIPLE_TABLES
1994 	set_class_tag(rt, fib_rules_tclass(res));
1995 #endif
1996 	set_class_tag(rt, itag);
1997 #endif
1998 }
1999 
2000 static struct rtable *rt_dst_alloc(struct net_device *dev,
2001 				   bool nopolicy, bool noxfrm)
2002 {
2003 	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2004 			 DST_HOST |
2005 			 (nopolicy ? DST_NOPOLICY : 0) |
2006 			 (noxfrm ? DST_NOXFRM : 0));
2007 }
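/* Thin wrapper around dst_alloc(): every cached entry is a host route
 * from the cache's point of view (DST_HOST), with the per-device
 * "skip policy routing" and "skip IPsec transforms" flags folded in
 * from the caller's in_device configuration.
 */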
2008 
2009 /* called in rcu_read_lock() section */
2010 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2011 				u8 tos, struct net_device *dev, int our)
2012 {
2013 	unsigned int hash;
2014 	struct rtable *rth;
2015 	__be32 spec_dst;
2016 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2017 	u32 itag = 0;
2018 	int err;
2019 
2020 	/* Primary sanity checks. */
2021 
2022 	if (in_dev == NULL)
2023 		return -EINVAL;
2024 
2025 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2026 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2027 		goto e_inval;
2028 
2029 	if (ipv4_is_zeronet(saddr)) {
2030 		if (!ipv4_is_local_multicast(daddr))
2031 			goto e_inval;
2032 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2033 	} else {
2034 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2035 					  &itag);
2036 		if (err < 0)
2037 			goto e_err;
2038 	}
2039 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
2040 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2041 	if (!rth)
2042 		goto e_nobufs;
2043 
2044 #ifdef CONFIG_IP_ROUTE_CLASSID
2045 	rth->dst.tclassid = itag;
2046 #endif
2047 	rth->dst.output = ip_rt_bug;
2048 
2049 	rth->rt_key_dst	= daddr;
2050 	rth->rt_key_src	= saddr;
2051 	rth->rt_genid	= rt_genid(dev_net(dev));
2052 	rth->rt_flags	= RTCF_MULTICAST;
2053 	rth->rt_type	= RTN_MULTICAST;
2054 	rth->rt_key_tos	= tos;
2055 	rth->rt_dst	= daddr;
2056 	rth->rt_src	= saddr;
2057 	rth->rt_route_iif = dev->ifindex;
2058 	rth->rt_iif	= dev->ifindex;
2059 	rth->rt_oif	= 0;
2060 	rth->rt_mark    = skb->mark;
2061 	rth->rt_gateway	= daddr;
2062 	rth->rt_spec_dst= spec_dst;
2063 	rth->rt_peer_genid = 0;
2064 	rth->peer = NULL;
2065 	rth->fi = NULL;
2066 	if (our) {
2067 		rth->dst.input= ip_local_deliver;
2068 		rth->rt_flags |= RTCF_LOCAL;
2069 	}
2070 
2071 #ifdef CONFIG_IP_MROUTE
2072 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2073 		rth->dst.input = ip_mr_input;
2074 #endif
2075 	RT_CACHE_STAT_INC(in_slow_mc);
2076 
2077 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2078 	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2079 	return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2080 
2081 e_nobufs:
2082 	return -ENOBUFS;
2083 e_inval:
2084 	return -EINVAL;
2085 e_err:
2086 	return err;
2087 }
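/* Note that multicast input routes get ip_rt_bug as their output
 * handler on purpose: a route created for arriving multicast traffic
 * must never be used for transmission, so any such misuse drops the
 * packet and reports a backtrace.
 */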
2088 
2089 
2090 static void ip_handle_martian_source(struct net_device *dev,
2091 				     struct in_device *in_dev,
2092 				     struct sk_buff *skb,
2093 				     __be32 daddr,
2094 				     __be32 saddr)
2095 {
2096 	RT_CACHE_STAT_INC(in_martian_src);
2097 #ifdef CONFIG_IP_ROUTE_VERBOSE
2098 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2099 		/*
2100 		 *	RFC1812 recommendation: if the source is martian,
2101 		 *	the only hint we can log is the MAC header.
2102 		 */
2103 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2104 			&daddr, &saddr, dev->name);
2105 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2106 			print_hex_dump(KERN_WARNING, "ll header: ",
2107 				       DUMP_PREFIX_OFFSET, 16, 1,
2108 				       skb_mac_header(skb),
2109 				       dev->hard_header_len, true);
2110 		}
2111 	}
2112 #endif
2113 }
2114 
2115 /* called in rcu_read_lock() section */
2116 static int __mkroute_input(struct sk_buff *skb,
2117 			   const struct fib_result *res,
2118 			   struct in_device *in_dev,
2119 			   __be32 daddr, __be32 saddr, u32 tos,
2120 			   struct rtable **result)
2121 {
2122 	struct rtable *rth;
2123 	int err;
2124 	struct in_device *out_dev;
2125 	unsigned int flags = 0;
2126 	__be32 spec_dst;
2127 	u32 itag;
2128 
2129 	/* get a working reference to the output device */
2130 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2131 	if (out_dev == NULL) {
2132 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2133 		return -EINVAL;
2134 	}
2135 
2136 
2137 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2138 				  in_dev->dev, &spec_dst, &itag);
2139 	if (err < 0) {
2140 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2141 					 saddr);
2142 
2143 		goto cleanup;
2144 	}
2145 
2146 	if (err)
2147 		flags |= RTCF_DIRECTSRC;
2148 
2149 	if (out_dev == in_dev && err &&
2150 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
2151 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2152 		flags |= RTCF_DOREDIRECT;
2153 
2154 	if (skb->protocol != htons(ETH_P_IP)) {
2155 		/* Not IP (i.e. ARP). Do not create a route if it is
2156 		 * invalid for proxy ARP. DNAT routes are always valid.
2157 		 *
2158 		 * The proxy ARP feature has been extended to allow ARP
2159 		 * replies back on the same interface, to support
2160 		 * Private VLAN switch technologies. See arp.c.
2161 		 */
2162 		if (out_dev == in_dev &&
2163 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2164 			err = -EINVAL;
2165 			goto cleanup;
2166 		}
2167 	}
2168 
2169 	rth = rt_dst_alloc(out_dev->dev,
2170 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2171 			   IN_DEV_CONF_GET(out_dev, NOXFRM));
2172 	if (!rth) {
2173 		err = -ENOBUFS;
2174 		goto cleanup;
2175 	}
2176 
2177 	rth->rt_key_dst	= daddr;
2178 	rth->rt_key_src	= saddr;
2179 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2180 	rth->rt_flags = flags;
2181 	rth->rt_type = res->type;
2182 	rth->rt_key_tos	= tos;
2183 	rth->rt_dst	= daddr;
2184 	rth->rt_src	= saddr;
2185 	rth->rt_route_iif = in_dev->dev->ifindex;
2186 	rth->rt_iif 	= in_dev->dev->ifindex;
2187 	rth->rt_oif 	= 0;
2188 	rth->rt_mark    = skb->mark;
2189 	rth->rt_gateway	= daddr;
2190 	rth->rt_spec_dst= spec_dst;
2191 	rth->rt_peer_genid = 0;
2192 	rth->peer = NULL;
2193 	rth->fi = NULL;
2194 
2195 	rth->dst.input = ip_forward;
2196 	rth->dst.output = ip_output;
2197 
2198 	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2199 
2200 	*result = rth;
2201 	err = 0;
2202  cleanup:
2203 	return err;
2204 }
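/* The RTCF_DOREDIRECT flag set above marks hairpin forwarding: the
 * packet is about to leave through the interface it arrived on, toward
 * a next hop on the same link as the sender, which is precisely the
 * case where the output path should offer the sender an ICMP redirect.
 */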
2205 
2206 static int ip_mkroute_input(struct sk_buff *skb,
2207 			    struct fib_result *res,
2208 			    const struct flowi4 *fl4,
2209 			    struct in_device *in_dev,
2210 			    __be32 daddr, __be32 saddr, u32 tos)
2211 {
2212 	struct rtable *rth = NULL;
2213 	int err;
2214 	unsigned int hash;
2215 
2216 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2217 	if (res->fi && res->fi->fib_nhs > 1)
2218 		fib_select_multipath(res);
2219 #endif
2220 
2221 	/* create a routing cache entry */
2222 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2223 	if (err)
2224 		return err;
2225 
2226 	/* put it into the cache */
2227 	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2228 		       rt_genid(dev_net(rth->dst.dev)));
2229 	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2230 	if (IS_ERR(rth))
2231 		return PTR_ERR(rth);
2232 	return 0;
2233 }
2234 
2235 /*
2236  *	NOTE. We drop all packets that have local source
2237  *	addresses, because every properly looped-back packet
2238  *	must already have the correct destination attached by the output routine.
2239  *
2240  *	This approach solves two big problems:
2241  *	1. Non-simplex devices are handled properly.
2242  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2243  *	Called with rcu_read_lock().
2244  */
2245 
2246 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2247 			       u8 tos, struct net_device *dev)
2248 {
2249 	struct fib_result res;
2250 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2251 	struct flowi4	fl4;
2252 	unsigned int	flags = 0;
2253 	u32		itag = 0;
2254 	struct rtable	*rth;
2255 	unsigned int	hash;
2256 	__be32		spec_dst;
2257 	int		err = -EINVAL;
2258 	struct net    *net = dev_net(dev);
2259 
2260 	/* IP on this device is disabled. */
2261 
2262 	if (!in_dev)
2263 		goto out;
2264 
2265 	/* Check for the weirdest martians, which cannot be detected
2266 	   by fib_lookup.
2267 	 */
2268 
2269 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2270 	    ipv4_is_loopback(saddr))
2271 		goto martian_source;
2272 
2273 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2274 		goto brd_input;
2275 
2276 	/* Accept zero addresses only for limited broadcast;
2277 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2278 	 */
2279 	if (ipv4_is_zeronet(saddr))
2280 		goto martian_source;
2281 
2282 	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2283 		goto martian_destination;
2284 
2285 	/*
2286 	 *	Now we are ready to route the packet.
2287 	 */
2288 	fl4.flowi4_oif = 0;
2289 	fl4.flowi4_iif = dev->ifindex;
2290 	fl4.flowi4_mark = skb->mark;
2291 	fl4.flowi4_tos = tos;
2292 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2293 	fl4.daddr = daddr;
2294 	fl4.saddr = saddr;
2295 	err = fib_lookup(net, &fl4, &res);
2296 	if (err != 0) {
2297 		if (!IN_DEV_FORWARD(in_dev))
2298 			goto e_hostunreach;
2299 		goto no_route;
2300 	}
2301 
2302 	RT_CACHE_STAT_INC(in_slow_tot);
2303 
2304 	if (res.type == RTN_BROADCAST)
2305 		goto brd_input;
2306 
2307 	if (res.type == RTN_LOCAL) {
2308 		err = fib_validate_source(skb, saddr, daddr, tos,
2309 					  net->loopback_dev->ifindex,
2310 					  dev, &spec_dst, &itag);
2311 		if (err < 0)
2312 			goto martian_source_keep_err;
2313 		if (err)
2314 			flags |= RTCF_DIRECTSRC;
2315 		spec_dst = daddr;
2316 		goto local_input;
2317 	}
2318 
2319 	if (!IN_DEV_FORWARD(in_dev))
2320 		goto e_hostunreach;
2321 	if (res.type != RTN_UNICAST)
2322 		goto martian_destination;
2323 
2324 	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2325 out:	return err;
2326 
2327 brd_input:
2328 	if (skb->protocol != htons(ETH_P_IP))
2329 		goto e_inval;
2330 
2331 	if (ipv4_is_zeronet(saddr))
2332 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2333 	else {
2334 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2335 					  &itag);
2336 		if (err < 0)
2337 			goto martian_source_keep_err;
2338 		if (err)
2339 			flags |= RTCF_DIRECTSRC;
2340 	}
2341 	flags |= RTCF_BROADCAST;
2342 	res.type = RTN_BROADCAST;
2343 	RT_CACHE_STAT_INC(in_brd);
2344 
2345 local_input:
2346 	rth = rt_dst_alloc(net->loopback_dev,
2347 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2348 	if (!rth)
2349 		goto e_nobufs;
2350 
2351 	rth->dst.input= ip_local_deliver;
2352 	rth->dst.output= ip_rt_bug;
2353 #ifdef CONFIG_IP_ROUTE_CLASSID
2354 	rth->dst.tclassid = itag;
2355 #endif
2356 
2357 	rth->rt_key_dst	= daddr;
2358 	rth->rt_key_src	= saddr;
2359 	rth->rt_genid = rt_genid(net);
2360 	rth->rt_flags 	= flags|RTCF_LOCAL;
2361 	rth->rt_type	= res.type;
2362 	rth->rt_key_tos	= tos;
2363 	rth->rt_dst	= daddr;
2364 	rth->rt_src	= saddr;
2368 	rth->rt_route_iif = dev->ifindex;
2369 	rth->rt_iif	= dev->ifindex;
2370 	rth->rt_oif	= 0;
2371 	rth->rt_mark    = skb->mark;
2372 	rth->rt_gateway	= daddr;
2373 	rth->rt_spec_dst= spec_dst;
2374 	rth->rt_peer_genid = 0;
2375 	rth->peer = NULL;
2376 	rth->fi = NULL;
2377 	if (res.type == RTN_UNREACHABLE) {
2378 		rth->dst.input= ip_error;
2379 		rth->dst.error= -err;
2380 		rth->rt_flags 	&= ~RTCF_LOCAL;
2381 	}
2382 	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2383 	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2384 	err = 0;
2385 	if (IS_ERR(rth))
2386 		err = PTR_ERR(rth);
2387 	goto out;
2388 
2389 no_route:
2390 	RT_CACHE_STAT_INC(in_no_route);
2391 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2392 	res.type = RTN_UNREACHABLE;
2393 	if (err == -ESRCH)
2394 		err = -ENETUNREACH;
2395 	goto local_input;
2396 
2397 	/*
2398 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2399 	 */
2400 martian_destination:
2401 	RT_CACHE_STAT_INC(in_martian_dst);
2402 #ifdef CONFIG_IP_ROUTE_VERBOSE
2403 	if (IN_DEV_LOG_MARTIANS(in_dev))
2404 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2405 				     &daddr, &saddr, dev->name);
2406 #endif
2407 
2408 e_hostunreach:
2409 	err = -EHOSTUNREACH;
2410 	goto out;
2411 
2412 e_inval:
2413 	err = -EINVAL;
2414 	goto out;
2415 
2416 e_nobufs:
2417 	err = -ENOBUFS;
2418 	goto out;
2419 
2420 martian_source:
2421 	err = -EINVAL;
2422 martian_source_keep_err:
2423 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2424 	goto out;
2425 }
2426 
2427 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2428 			   u8 tos, struct net_device *dev, bool noref)
2429 {
2430 	struct rtable	*rth;
2431 	unsigned int	hash;
2432 	int iif = dev->ifindex;
2433 	struct net *net;
2434 	int res;
2435 
2436 	net = dev_net(dev);
2437 
2438 	rcu_read_lock();
2439 
2440 	if (!rt_caching(net))
2441 		goto skip_cache;
2442 
2443 	tos &= IPTOS_RT_MASK;
2444 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2445 
2446 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2447 	     rth = rcu_dereference(rth->dst.rt_next)) {
2448 		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2449 		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2450 		     (rth->rt_route_iif ^ iif) |
2451 		     (rth->rt_key_tos ^ tos)) == 0 &&
2452 		    rth->rt_mark == skb->mark &&
2453 		    net_eq(dev_net(rth->dst.dev), net) &&
2454 		    !rt_is_expired(rth)) {
2455 			ipv4_validate_peer(rth);
2456 			if (noref) {
2457 				dst_use_noref(&rth->dst, jiffies);
2458 				skb_dst_set_noref(skb, &rth->dst);
2459 			} else {
2460 				dst_use(&rth->dst, jiffies);
2461 				skb_dst_set(skb, &rth->dst);
2462 			}
2463 			RT_CACHE_STAT_INC(in_hit);
2464 			rcu_read_unlock();
2465 			return 0;
2466 		}
2467 		RT_CACHE_STAT_INC(in_hlist_search);
2468 	}
2469 
2470 skip_cache:
2471 	/* Multicast recognition logic has moved from the route cache to here.
2472 	   The problem was that too many Ethernet cards have broken/missing
2473 	   hardware multicast filters :-( As a result, a host on a multicast
2474 	   network acquires a lot of useless route cache entries, e.g. from
2475 	   SDR messages from all over the world. Now we try to get rid of them.
2476 	   Really, provided the software IP multicast filter is organized
2477 	   reasonably (at least, hashed), it does not result in a slowdown
2478 	   compared with route cache reject entries.
2479 	   Note that multicast routers are not affected, because a
2480 	   route cache entry is created eventually.
2481 	 */
2482 	if (ipv4_is_multicast(daddr)) {
2483 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2484 
2485 		if (in_dev) {
2486 			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2487 						  ip_hdr(skb)->protocol);
2488 			if (our
2489 #ifdef CONFIG_IP_MROUTE
2490 				||
2491 			    (!ipv4_is_local_multicast(daddr) &&
2492 			     IN_DEV_MFORWARD(in_dev))
2493 #endif
2494 			   ) {
2495 				int res = ip_route_input_mc(skb, daddr, saddr,
2496 							    tos, dev, our);
2497 				rcu_read_unlock();
2498 				return res;
2499 			}
2500 		}
2501 		rcu_read_unlock();
2502 		return -EINVAL;
2503 	}
2504 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2505 	rcu_read_unlock();
2506 	return res;
2507 }
2508 EXPORT_SYMBOL(ip_route_input_common);
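/* The cache lookup above compares all four lookup keys without
 * branching: each XOR is zero iff the corresponding fields are equal,
 * and OR-ing the results lets one test reject a chain entry.  A minimal
 * sketch of the idiom, with hypothetical 32-bit keys:
 *
 *	u32 diff = (key.dst ^ cand.dst) | (key.src ^ cand.src) |
 *		   (key.iif ^ cand.iif) | (key.tos ^ cand.tos);
 *	if (diff == 0)
 *		... all four fields match ...
 */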
2509 
2510 /* called with rcu_read_lock() */
2511 static struct rtable *__mkroute_output(const struct fib_result *res,
2512 				       const struct flowi4 *fl4,
2513 				       __be32 orig_daddr, __be32 orig_saddr,
2514 				       int orig_oif, __u8 orig_rtos,
2515 				       struct net_device *dev_out,
2516 				       unsigned int flags)
2517 {
2518 	struct fib_info *fi = res->fi;
2519 	struct in_device *in_dev;
2520 	u16 type = res->type;
2521 	struct rtable *rth;
2522 
2523 	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2524 		return ERR_PTR(-EINVAL);
2525 
2526 	if (ipv4_is_lbcast(fl4->daddr))
2527 		type = RTN_BROADCAST;
2528 	else if (ipv4_is_multicast(fl4->daddr))
2529 		type = RTN_MULTICAST;
2530 	else if (ipv4_is_zeronet(fl4->daddr))
2531 		return ERR_PTR(-EINVAL);
2532 
2533 	if (dev_out->flags & IFF_LOOPBACK)
2534 		flags |= RTCF_LOCAL;
2535 
2536 	in_dev = __in_dev_get_rcu(dev_out);
2537 	if (!in_dev)
2538 		return ERR_PTR(-EINVAL);
2539 
2540 	if (type == RTN_BROADCAST) {
2541 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2542 		fi = NULL;
2543 	} else if (type == RTN_MULTICAST) {
2544 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2545 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2546 				     fl4->flowi4_proto))
2547 			flags &= ~RTCF_LOCAL;
2548 		/* If the multicast route does not exist, use the
2549 		 * default one, but do not use a gateway in this case.
2550 		 * Yes, it is a hack.
2551 		 */
2552 		if (fi && res->prefixlen < 4)
2553 			fi = NULL;
2554 	}
2555 
2556 	rth = rt_dst_alloc(dev_out,
2557 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2558 			   IN_DEV_CONF_GET(in_dev, NOXFRM));
2559 	if (!rth)
2560 		return ERR_PTR(-ENOBUFS);
2561 
2562 	rth->dst.output = ip_output;
2563 
2564 	rth->rt_key_dst	= orig_daddr;
2565 	rth->rt_key_src	= orig_saddr;
2566 	rth->rt_genid = rt_genid(dev_net(dev_out));
2567 	rth->rt_flags	= flags;
2568 	rth->rt_type	= type;
2569 	rth->rt_key_tos	= orig_rtos;
2570 	rth->rt_dst	= fl4->daddr;
2571 	rth->rt_src	= fl4->saddr;
2572 	rth->rt_route_iif = 0;
2573 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
2574 	rth->rt_oif	= orig_oif;
2575 	rth->rt_mark    = fl4->flowi4_mark;
2576 	rth->rt_gateway = fl4->daddr;
2577 	rth->rt_spec_dst= fl4->saddr;
2578 	rth->rt_peer_genid = 0;
2579 	rth->peer = NULL;
2580 	rth->fi = NULL;
2581 
2582 	RT_CACHE_STAT_INC(out_slow_tot);
2583 
2584 	if (flags & RTCF_LOCAL) {
2585 		rth->dst.input = ip_local_deliver;
2586 		rth->rt_spec_dst = fl4->daddr;
2587 	}
2588 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2589 		rth->rt_spec_dst = fl4->saddr;
2590 		if (flags & RTCF_LOCAL &&
2591 		    !(dev_out->flags & IFF_LOOPBACK)) {
2592 			rth->dst.output = ip_mc_output;
2593 			RT_CACHE_STAT_INC(out_slow_mc);
2594 		}
2595 #ifdef CONFIG_IP_MROUTE
2596 		if (type == RTN_MULTICAST) {
2597 			if (IN_DEV_MFORWARD(in_dev) &&
2598 			    !ipv4_is_local_multicast(fl4->daddr)) {
2599 				rth->dst.input = ip_mr_input;
2600 				rth->dst.output = ip_mc_output;
2601 			}
2602 		}
2603 #endif
2604 	}
2605 
2606 	rt_set_nexthop(rth, fl4, res, fi, type, 0);
2607 
2608 	return rth;
2609 }
2610 
2611 /*
2612  * Major route resolver routine.
2613  * called with rcu_read_lock();
2614  */
2615 
2616 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2617 {
2618 	struct net_device *dev_out = NULL;
2619 	__u8 tos = RT_FL_TOS(fl4);
2620 	unsigned int flags = 0;
2621 	struct fib_result res;
2622 	struct rtable *rth;
2623 	__be32 orig_daddr;
2624 	__be32 orig_saddr;
2625 	int orig_oif;
2626 
2627 	res.fi		= NULL;
2628 #ifdef CONFIG_IP_MULTIPLE_TABLES
2629 	res.r		= NULL;
2630 #endif
2631 
2632 	orig_daddr = fl4->daddr;
2633 	orig_saddr = fl4->saddr;
2634 	orig_oif = fl4->flowi4_oif;
2635 
2636 	fl4->flowi4_iif = net->loopback_dev->ifindex;
2637 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2638 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2639 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2640 
2641 	rcu_read_lock();
2642 	if (fl4->saddr) {
2643 		rth = ERR_PTR(-EINVAL);
2644 		if (ipv4_is_multicast(fl4->saddr) ||
2645 		    ipv4_is_lbcast(fl4->saddr) ||
2646 		    ipv4_is_zeronet(fl4->saddr))
2647 			goto out;
2648 
2649 		/* I removed check for oif == dev_out->oif here.
2650 		   It was wrong for two reasons:
2651 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2652 		      is assigned to multiple interfaces.
2653 		   2. Moreover, we are allowed to send packets with the saddr
2654 		      of another iface. --ANK
2655 		 */
2656 
2657 		if (fl4->flowi4_oif == 0 &&
2658 		    (ipv4_is_multicast(fl4->daddr) ||
2659 		     ipv4_is_lbcast(fl4->daddr))) {
2660 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2661 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2662 			if (dev_out == NULL)
2663 				goto out;
2664 
2665 			/* Special hack: the user can direct multicasts
2666 			   and limited broadcast via the necessary interface
2667 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2668 			   This hack is not just for fun; it allows
2669 			   vic, vat and friends to work.
2670 			   They bind a socket to loopback, set the ttl to zero
2671 			   and expect that it will work.
2672 			   From the viewpoint of the routing cache they are broken,
2673 			   because we are not allowed to build a multicast path
2674 			   with a loopback source addr (the routing cache
2675 			   cannot know that the ttl is zero, so the packet
2676 			   will not leave this host and the route is valid).
2677 			   Luckily, this hack is a good workaround.
2678 			 */
2679 
2680 			fl4->flowi4_oif = dev_out->ifindex;
2681 			goto make_route;
2682 		}
2683 
2684 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2685 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2686 			if (!__ip_dev_find(net, fl4->saddr, false))
2687 				goto out;
2688 		}
2689 	}
2690 
2691 
2692 	if (fl4->flowi4_oif) {
2693 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2694 		rth = ERR_PTR(-ENODEV);
2695 		if (dev_out == NULL)
2696 			goto out;
2697 
2698 		/* RACE: Check return value of inet_select_addr instead. */
2699 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2700 			rth = ERR_PTR(-ENETUNREACH);
2701 			goto out;
2702 		}
2703 		if (ipv4_is_local_multicast(fl4->daddr) ||
2704 		    ipv4_is_lbcast(fl4->daddr)) {
2705 			if (!fl4->saddr)
2706 				fl4->saddr = inet_select_addr(dev_out, 0,
2707 							      RT_SCOPE_LINK);
2708 			goto make_route;
2709 		}
2710 		if (fl4->saddr) {
2711 			if (ipv4_is_multicast(fl4->daddr))
2712 				fl4->saddr = inet_select_addr(dev_out, 0,
2713 							      fl4->flowi4_scope);
2714 			else if (!fl4->daddr)
2715 				fl4->saddr = inet_select_addr(dev_out, 0,
2716 							      RT_SCOPE_HOST);
2717 		}
2718 	}
2719 
2720 	if (!fl4->daddr) {
2721 		fl4->daddr = fl4->saddr;
2722 		if (!fl4->daddr)
2723 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2724 		dev_out = net->loopback_dev;
2725 		fl4->flowi4_oif = net->loopback_dev->ifindex;
2726 		res.type = RTN_LOCAL;
2727 		flags |= RTCF_LOCAL;
2728 		goto make_route;
2729 	}
2730 
2731 	if (fib_lookup(net, fl4, &res)) {
2732 		res.fi = NULL;
2733 		if (fl4->flowi4_oif) {
2734 			/* Apparently, the routing tables are wrong. Assume
2735 			   that the destination is on-link.
2736 
2737 			   WHY? DW.
2738 			   Because we are allowed to send to an iface
2739 			   even if it has NO routes and NO assigned
2740 			   addresses. When oif is specified, the routing
2741 			   tables are looked up with only one purpose:
2742 			   to catch whether the destination is gatewayed, rather
2743 			   than direct. Moreover, if MSG_DONTROUTE is set,
2744 			   we send the packet, ignoring both the routing tables
2745 			   and the ifaddr state. --ANK
2746 
2747 
2748 			   We could do this even when oif is unknown,
2749 			   as IPv6 likely does, but we do not.
2750 			 */
2751 
2752 			if (fl4->saddr == 0)
2753 				fl4->saddr = inet_select_addr(dev_out, 0,
2754 							      RT_SCOPE_LINK);
2755 			res.type = RTN_UNICAST;
2756 			goto make_route;
2757 		}
2758 		rth = ERR_PTR(-ENETUNREACH);
2759 		goto out;
2760 	}
2761 
2762 	if (res.type == RTN_LOCAL) {
2763 		if (!fl4->saddr) {
2764 			if (res.fi->fib_prefsrc)
2765 				fl4->saddr = res.fi->fib_prefsrc;
2766 			else
2767 				fl4->saddr = fl4->daddr;
2768 		}
2769 		dev_out = net->loopback_dev;
2770 		fl4->flowi4_oif = dev_out->ifindex;
2771 		res.fi = NULL;
2772 		flags |= RTCF_LOCAL;
2773 		goto make_route;
2774 	}
2775 
2776 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2777 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2778 		fib_select_multipath(&res);
2779 	else
2780 #endif
2781 	if (!res.prefixlen &&
2782 	    res.table->tb_num_default > 1 &&
2783 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2784 		fib_select_default(&res);
2785 
2786 	if (!fl4->saddr)
2787 		fl4->saddr = FIB_RES_PREFSRC(net, res);
2788 
2789 	dev_out = FIB_RES_DEV(res);
2790 	fl4->flowi4_oif = dev_out->ifindex;
2791 
2792 
2793 make_route:
2794 	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2795 			       tos, dev_out, flags);
2796 	if (!IS_ERR(rth)) {
2797 		unsigned int hash;
2798 
2799 		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2800 			       rt_genid(dev_net(dev_out)));
2801 		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2802 	}
2803 
2804 out:
2805 	rcu_read_unlock();
2806 	return rth;
2807 }
2808 
2809 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2810 {
2811 	struct rtable *rth;
2812 	unsigned int hash;
2813 
2814 	if (!rt_caching(net))
2815 		goto slow_output;
2816 
2817 	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2818 
2819 	rcu_read_lock_bh();
2820 	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2821 		rth = rcu_dereference_bh(rth->dst.rt_next)) {
2822 		if (rth->rt_key_dst == flp4->daddr &&
2823 		    rth->rt_key_src == flp4->saddr &&
2824 		    rt_is_output_route(rth) &&
2825 		    rth->rt_oif == flp4->flowi4_oif &&
2826 		    rth->rt_mark == flp4->flowi4_mark &&
2827 		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2828 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2829 		    net_eq(dev_net(rth->dst.dev), net) &&
2830 		    !rt_is_expired(rth)) {
2831 			ipv4_validate_peer(rth);
2832 			dst_use(&rth->dst, jiffies);
2833 			RT_CACHE_STAT_INC(out_hit);
2834 			rcu_read_unlock_bh();
2835 			if (!flp4->saddr)
2836 				flp4->saddr = rth->rt_src;
2837 			if (!flp4->daddr)
2838 				flp4->daddr = rth->rt_dst;
2839 			return rth;
2840 		}
2841 		RT_CACHE_STAT_INC(out_hlist_search);
2842 	}
2843 	rcu_read_unlock_bh();
2844 
2845 slow_output:
2846 	return ip_route_output_slow(net, flp4);
2847 }
2848 EXPORT_SYMBOL_GPL(__ip_route_output_key);
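/* Output lookups walk the cache under rcu_read_lock_bh(), since output
 * routes are created and used from both process and softirq context.
 * The TOS check masks with IPTOS_RT_MASK | RTO_ONLINK so that the
 * RTO_ONLINK pseudo-flag (a request for link-scope resolution carried
 * in the tos argument) takes part in the match as well.
 */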
2849 
2850 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2851 {
2852 	return NULL;
2853 }
2854 
2855 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2856 {
2857 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2858 
2859 	return mtu ? : dst->dev->mtu;
2860 }
2861 
2862 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2863 {
2864 }
2865 
2866 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2867 					  unsigned long old)
2868 {
2869 	return NULL;
2870 }
2871 
2872 static struct dst_ops ipv4_dst_blackhole_ops = {
2873 	.family			=	AF_INET,
2874 	.protocol		=	cpu_to_be16(ETH_P_IP),
2875 	.destroy		=	ipv4_dst_destroy,
2876 	.check			=	ipv4_blackhole_dst_check,
2877 	.mtu			=	ipv4_blackhole_mtu,
2878 	.default_advmss		=	ipv4_default_advmss,
2879 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2880 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2881 	.neigh_lookup		=	ipv4_neigh_lookup,
2882 };
2883 
2884 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2885 {
2886 	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2887 	struct rtable *ort = (struct rtable *) dst_orig;
2888 
2889 	if (rt) {
2890 		struct dst_entry *new = &rt->dst;
2891 
2892 		new->__use = 1;
2893 		new->input = dst_discard;
2894 		new->output = dst_discard;
2895 		dst_copy_metrics(new, &ort->dst);
2896 
2897 		new->dev = ort->dst.dev;
2898 		if (new->dev)
2899 			dev_hold(new->dev);
2900 
2901 		rt->rt_key_dst = ort->rt_key_dst;
2902 		rt->rt_key_src = ort->rt_key_src;
2903 		rt->rt_key_tos = ort->rt_key_tos;
2904 		rt->rt_route_iif = ort->rt_route_iif;
2905 		rt->rt_iif = ort->rt_iif;
2906 		rt->rt_oif = ort->rt_oif;
2907 		rt->rt_mark = ort->rt_mark;
2908 
2909 		rt->rt_genid = rt_genid(net);
2910 		rt->rt_flags = ort->rt_flags;
2911 		rt->rt_type = ort->rt_type;
2912 		rt->rt_dst = ort->rt_dst;
2913 		rt->rt_src = ort->rt_src;
2914 		rt->rt_gateway = ort->rt_gateway;
2915 		rt->rt_spec_dst = ort->rt_spec_dst;
2916 		rt->peer = ort->peer;
2917 		if (rt->peer)
2918 			atomic_inc(&rt->peer->refcnt);
2919 		rt->fi = ort->fi;
2920 		if (rt->fi)
2921 			atomic_inc(&rt->fi->fib_clntref);
2922 
2923 		dst_free(new);
2924 	}
2925 
2926 	dst_release(dst_orig);
2927 
2928 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2929 }
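/* A blackhole route is a frozen copy of an existing route whose input
 * and output handlers simply discard packets (dst_discard); xfrm uses
 * this, for example, while IPsec SAs are still being resolved.  The
 * immediate dst_free() looks odd but is deliberate: the copy lives in
 * no cache, so it is destroyed as soon as its last reference goes away.
 */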
2930 
2931 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2932 				    struct sock *sk)
2933 {
2934 	struct rtable *rt = __ip_route_output_key(net, flp4);
2935 
2936 	if (IS_ERR(rt))
2937 		return rt;
2938 
2939 	if (flp4->flowi4_proto)
2940 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2941 						   flowi4_to_flowi(flp4),
2942 						   sk, 0);
2943 
2944 	return rt;
2945 }
2946 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2947 
2948 static int rt_fill_info(struct net *net,
2949 			struct sk_buff *skb, u32 pid, u32 seq, int event,
2950 			int nowait, unsigned int flags)
2951 {
2952 	struct rtable *rt = skb_rtable(skb);
2953 	struct rtmsg *r;
2954 	struct nlmsghdr *nlh;
2955 	unsigned long expires = 0;
2956 	const struct inet_peer *peer = rt->peer;
2957 	u32 id = 0, ts = 0, tsage = 0, error;
2958 
2959 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2960 	if (nlh == NULL)
2961 		return -EMSGSIZE;
2962 
2963 	r = nlmsg_data(nlh);
2964 	r->rtm_family	 = AF_INET;
2965 	r->rtm_dst_len	= 32;
2966 	r->rtm_src_len	= 0;
2967 	r->rtm_tos	= rt->rt_key_tos;
2968 	r->rtm_table	= RT_TABLE_MAIN;
2969 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2970 		goto nla_put_failure;
2971 	r->rtm_type	= rt->rt_type;
2972 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2973 	r->rtm_protocol = RTPROT_UNSPEC;
2974 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2975 	if (rt->rt_flags & RTCF_NOTIFY)
2976 		r->rtm_flags |= RTM_F_NOTIFY;
2977 
2978 	if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2979 		goto nla_put_failure;
2980 	if (rt->rt_key_src) {
2981 		r->rtm_src_len = 32;
2982 		if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2983 			goto nla_put_failure;
2984 	}
2985 	if (rt->dst.dev &&
2986 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2987 		goto nla_put_failure;
2988 #ifdef CONFIG_IP_ROUTE_CLASSID
2989 	if (rt->dst.tclassid &&
2990 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2991 		goto nla_put_failure;
2992 #endif
2993 	if (rt_is_input_route(rt)) {
2994 		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst))
2995 			goto nla_put_failure;
2996 	} else if (rt->rt_src != rt->rt_key_src) {
2997 		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2998 			goto nla_put_failure;
2999 	}
3000 	if (rt->rt_dst != rt->rt_gateway &&
3001 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
3002 		goto nla_put_failure;
3003 
3004 	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3005 		goto nla_put_failure;
3006 
3007 	if (rt->rt_mark &&
3008 	    nla_put_be32(skb, RTA_MARK, rt->rt_mark))
3009 		goto nla_put_failure;
3010 
3011 	error = rt->dst.error;
3012 	if (peer) {
3013 		inet_peer_refcheck(rt->peer);
3014 		id = atomic_read(&peer->ip_id_count) & 0xffff;
3015 		if (peer->tcp_ts_stamp) {
3016 			ts = peer->tcp_ts;
3017 			tsage = get_seconds() - peer->tcp_ts_stamp;
3018 		}
3019 		expires = ACCESS_ONCE(peer->pmtu_expires);
3020 		if (expires) {
3021 			if (time_before(jiffies, expires))
3022 				expires -= jiffies;
3023 			else
3024 				expires = 0;
3025 		}
3026 	}
3027 
3028 	if (rt_is_input_route(rt)) {
3029 #ifdef CONFIG_IP_MROUTE
3030 		__be32 dst = rt->rt_dst;
3031 
3032 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3033 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3034 			int err = ipmr_get_route(net, skb,
3035 						 rt->rt_src, rt->rt_dst,
3036 						 r, nowait);
3037 			if (err <= 0) {
3038 				if (!nowait) {
3039 					if (err == 0)
3040 						return 0;
3041 					goto nla_put_failure;
3042 				} else {
3043 					if (err == -EMSGSIZE)
3044 						goto nla_put_failure;
3045 					error = err;
3046 				}
3047 			}
3048 		} else
3049 #endif
3050 			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3051 				goto nla_put_failure;
3052 	}
3053 
3054 	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3055 			       expires, error) < 0)
3056 		goto nla_put_failure;
3057 
3058 	return nlmsg_end(skb, nlh);
3059 
3060 nla_put_failure:
3061 	nlmsg_cancel(skb, nlh);
3062 	return -EMSGSIZE;
3063 }
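/* rt_fill_info() serializes one cached route as an RTM_NEWROUTE
 * message: a fixed struct rtmsg header first, then optional netlink
 * attributes.  Every failed nla_put_*() jumps to nla_put_failure, where
 * nlmsg_cancel() trims the partially built message so the caller never
 * emits a truncated route dump.
 */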
3064 
3065 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3066 {
3067 	struct net *net = sock_net(in_skb->sk);
3068 	struct rtmsg *rtm;
3069 	struct nlattr *tb[RTA_MAX+1];
3070 	struct rtable *rt = NULL;
3071 	__be32 dst = 0;
3072 	__be32 src = 0;
3073 	u32 iif;
3074 	int err;
3075 	int mark;
3076 	struct sk_buff *skb;
3077 
3078 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3079 	if (err < 0)
3080 		goto errout;
3081 
3082 	rtm = nlmsg_data(nlh);
3083 
3084 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3085 	if (skb == NULL) {
3086 		err = -ENOBUFS;
3087 		goto errout;
3088 	}
3089 
3090 	/* Reserve room for dummy headers; this skb can pass
3091 	   through a good chunk of the routing engine.
3092 	 */
3093 	skb_reset_mac_header(skb);
3094 	skb_reset_network_header(skb);
3095 
3096 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3097 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
3098 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3099 
3100 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3101 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3102 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3103 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3104 
3105 	if (iif) {
3106 		struct net_device *dev;
3107 
3108 		dev = __dev_get_by_index(net, iif);
3109 		if (dev == NULL) {
3110 			err = -ENODEV;
3111 			goto errout_free;
3112 		}
3113 
3114 		skb->protocol	= htons(ETH_P_IP);
3115 		skb->dev	= dev;
3116 		skb->mark	= mark;
3117 		local_bh_disable();
3118 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3119 		local_bh_enable();
3120 
3121 		rt = skb_rtable(skb);
3122 		if (err == 0 && rt->dst.error)
3123 			err = -rt->dst.error;
3124 	} else {
3125 		struct flowi4 fl4 = {
3126 			.daddr = dst,
3127 			.saddr = src,
3128 			.flowi4_tos = rtm->rtm_tos,
3129 			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3130 			.flowi4_mark = mark,
3131 		};
3132 		rt = ip_route_output_key(net, &fl4);
3133 
3134 		err = 0;
3135 		if (IS_ERR(rt))
3136 			err = PTR_ERR(rt);
3137 	}
3138 
3139 	if (err)
3140 		goto errout_free;
3141 
3142 	skb_dst_set(skb, &rt->dst);
3143 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3144 		rt->rt_flags |= RTCF_NOTIFY;
3145 
3146 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3147 			   RTM_NEWROUTE, 0, 0);
3148 	if (err <= 0)
3149 		goto errout_free;
3150 
3151 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3152 errout:
3153 	return err;
3154 
3155 errout_free:
3156 	kfree_skb(skb);
3157 	goto errout;
3158 }
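/* This is the handler behind "ip route get".  With RTA_IIF present the
 * kernel simulates input routing of a dummy ICMP skb on that device
 * (hence the fake IP header above); otherwise it performs an ordinary
 * output lookup.  Either way the resulting route is rendered with
 * rt_fill_info() and unicast back to the requester.
 */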
3159 
3160 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3161 {
3162 	struct rtable *rt;
3163 	int h, s_h;
3164 	int idx, s_idx;
3165 	struct net *net;
3166 
3167 	net = sock_net(skb->sk);
3168 
3169 	s_h = cb->args[0];
3170 	if (s_h < 0)
3171 		s_h = 0;
3172 	s_idx = idx = cb->args[1];
3173 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3174 		if (!rt_hash_table[h].chain)
3175 			continue;
3176 		rcu_read_lock_bh();
3177 		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3178 		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3179 			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3180 				continue;
3181 			if (rt_is_expired(rt))
3182 				continue;
3183 			skb_dst_set_noref(skb, &rt->dst);
3184 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3185 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3186 					 1, NLM_F_MULTI) <= 0) {
3187 				skb_dst_drop(skb);
3188 				rcu_read_unlock_bh();
3189 				goto done;
3190 			}
3191 			skb_dst_drop(skb);
3192 		}
3193 		rcu_read_unlock_bh();
3194 	}
3195 
3196 done:
3197 	cb->args[0] = h;
3198 	cb->args[1] = idx;
3199 	return skb->len;
3200 }
3201 
3202 void ip_rt_multicast_event(struct in_device *in_dev)
3203 {
3204 	rt_cache_flush(dev_net(in_dev->dev), 0);
3205 }
3206 
3207 #ifdef CONFIG_SYSCTL
3208 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3209 					void __user *buffer,
3210 					size_t *lenp, loff_t *ppos)
3211 {
3212 	if (write) {
3213 		int flush_delay;
3214 		ctl_table ctl;
3215 		struct net *net;
3216 
3217 		memcpy(&ctl, __ctl, sizeof(ctl));
3218 		ctl.data = &flush_delay;
3219 		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3220 
3221 		net = (struct net *)__ctl->extra1;
3222 		rt_cache_flush(net, flush_delay);
3223 		return 0;
3224 	}
3225 
3226 	return -EINVAL;
3227 }
3228 
3229 static ctl_table ipv4_route_table[] = {
3230 	{
3231 		.procname	= "gc_thresh",
3232 		.data		= &ipv4_dst_ops.gc_thresh,
3233 		.maxlen		= sizeof(int),
3234 		.mode		= 0644,
3235 		.proc_handler	= proc_dointvec,
3236 	},
3237 	{
3238 		.procname	= "max_size",
3239 		.data		= &ip_rt_max_size,
3240 		.maxlen		= sizeof(int),
3241 		.mode		= 0644,
3242 		.proc_handler	= proc_dointvec,
3243 	},
3244 	{
3245 		/* Deprecated. Use gc_min_interval_ms. */
3246 
3247 		.procname	= "gc_min_interval",
3248 		.data		= &ip_rt_gc_min_interval,
3249 		.maxlen		= sizeof(int),
3250 		.mode		= 0644,
3251 		.proc_handler	= proc_dointvec_jiffies,
3252 	},
3253 	{
3254 		.procname	= "gc_min_interval_ms",
3255 		.data		= &ip_rt_gc_min_interval,
3256 		.maxlen		= sizeof(int),
3257 		.mode		= 0644,
3258 		.proc_handler	= proc_dointvec_ms_jiffies,
3259 	},
3260 	{
3261 		.procname	= "gc_timeout",
3262 		.data		= &ip_rt_gc_timeout,
3263 		.maxlen		= sizeof(int),
3264 		.mode		= 0644,
3265 		.proc_handler	= proc_dointvec_jiffies,
3266 	},
3267 	{
3268 		.procname	= "gc_interval",
3269 		.data		= &ip_rt_gc_interval,
3270 		.maxlen		= sizeof(int),
3271 		.mode		= 0644,
3272 		.proc_handler	= proc_dointvec_jiffies,
3273 	},
3274 	{
3275 		.procname	= "redirect_load",
3276 		.data		= &ip_rt_redirect_load,
3277 		.maxlen		= sizeof(int),
3278 		.mode		= 0644,
3279 		.proc_handler	= proc_dointvec,
3280 	},
3281 	{
3282 		.procname	= "redirect_number",
3283 		.data		= &ip_rt_redirect_number,
3284 		.maxlen		= sizeof(int),
3285 		.mode		= 0644,
3286 		.proc_handler	= proc_dointvec,
3287 	},
3288 	{
3289 		.procname	= "redirect_silence",
3290 		.data		= &ip_rt_redirect_silence,
3291 		.maxlen		= sizeof(int),
3292 		.mode		= 0644,
3293 		.proc_handler	= proc_dointvec,
3294 	},
3295 	{
3296 		.procname	= "error_cost",
3297 		.data		= &ip_rt_error_cost,
3298 		.maxlen		= sizeof(int),
3299 		.mode		= 0644,
3300 		.proc_handler	= proc_dointvec,
3301 	},
3302 	{
3303 		.procname	= "error_burst",
3304 		.data		= &ip_rt_error_burst,
3305 		.maxlen		= sizeof(int),
3306 		.mode		= 0644,
3307 		.proc_handler	= proc_dointvec,
3308 	},
3309 	{
3310 		.procname	= "gc_elasticity",
3311 		.data		= &ip_rt_gc_elasticity,
3312 		.maxlen		= sizeof(int),
3313 		.mode		= 0644,
3314 		.proc_handler	= proc_dointvec,
3315 	},
3316 	{
3317 		.procname	= "mtu_expires",
3318 		.data		= &ip_rt_mtu_expires,
3319 		.maxlen		= sizeof(int),
3320 		.mode		= 0644,
3321 		.proc_handler	= proc_dointvec_jiffies,
3322 	},
3323 	{
3324 		.procname	= "min_pmtu",
3325 		.data		= &ip_rt_min_pmtu,
3326 		.maxlen		= sizeof(int),
3327 		.mode		= 0644,
3328 		.proc_handler	= proc_dointvec,
3329 	},
3330 	{
3331 		.procname	= "min_adv_mss",
3332 		.data		= &ip_rt_min_advmss,
3333 		.maxlen		= sizeof(int),
3334 		.mode		= 0644,
3335 		.proc_handler	= proc_dointvec,
3336 	},
3337 	{ }
3338 };
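/* These entries surface as /proc/sys/net/ipv4/route/<procname> and as
 * sysctl keys under net.ipv4.route.  For example, inspecting the learned
 * PMTU lifetime and tuning the PMTU floor used elsewhere in this file:
 *
 *	# sysctl net.ipv4.route.mtu_expires
 *	# sysctl -w net.ipv4.route.min_pmtu=552
 */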
3339 
3340 static struct ctl_table ipv4_route_flush_table[] = {
3341 	{
3342 		.procname	= "flush",
3343 		.maxlen		= sizeof(int),
3344 		.mode		= 0200,
3345 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3346 	},
3347 	{ },
3348 };
3349 
3350 static __net_init int sysctl_route_net_init(struct net *net)
3351 {
3352 	struct ctl_table *tbl;
3353 
3354 	tbl = ipv4_route_flush_table;
3355 	if (!net_eq(net, &init_net)) {
3356 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3357 		if (tbl == NULL)
3358 			goto err_dup;
3359 	}
3360 	tbl[0].extra1 = net;
3361 
3362 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3363 	if (net->ipv4.route_hdr == NULL)
3364 		goto err_reg;
3365 	return 0;
3366 
3367 err_reg:
3368 	if (tbl != ipv4_route_flush_table)
3369 		kfree(tbl);
3370 err_dup:
3371 	return -ENOMEM;
3372 }
3373 
3374 static __net_exit void sysctl_route_net_exit(struct net *net)
3375 {
3376 	struct ctl_table *tbl;
3377 
3378 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3379 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3380 	BUG_ON(tbl == ipv4_route_flush_table);
3381 	kfree(tbl);
3382 }
3383 
3384 static __net_initdata struct pernet_operations sysctl_route_ops = {
3385 	.init = sysctl_route_net_init,
3386 	.exit = sysctl_route_net_exit,
3387 };
3388 #endif
3389 
3390 static __net_init int rt_genid_init(struct net *net)
3391 {
3392 	get_random_bytes(&net->ipv4.rt_genid,
3393 			 sizeof(net->ipv4.rt_genid));
3394 	get_random_bytes(&net->ipv4.dev_addr_genid,
3395 			 sizeof(net->ipv4.dev_addr_genid));
3396 	return 0;
3397 }
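/* Each namespace starts with a random generation ID, and flushing the
 * cache is just a bump of rt_genid: entries whose stored rt_genid no
 * longer matches fail rt_is_expired() and are skipped and reclaimed
 * lazily, so a flush never has to walk the whole hash table up front.
 */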
3398 
3399 static __net_initdata struct pernet_operations rt_genid_ops = {
3400 	.init = rt_genid_init,
3401 };
3402 
3403 
3404 #ifdef CONFIG_IP_ROUTE_CLASSID
3405 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3406 #endif /* CONFIG_IP_ROUTE_CLASSID */
3407 
3408 static __initdata unsigned long rhash_entries;
3409 static int __init set_rhash_entries(char *str)
3410 {
3411 	ssize_t ret;
3412 
3413 	if (!str)
3414 		return 0;
3415 
3416 	ret = kstrtoul(str, 0, &rhash_entries);
3417 	if (ret)
3418 		return 0;
3419 
3420 	return 1;
3421 }
3422 __setup("rhash_entries=", set_rhash_entries);
3423 
3424 int __init ip_rt_init(void)
3425 {
3426 	int rc = 0;
3427 
3428 #ifdef CONFIG_IP_ROUTE_CLASSID
3429 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3430 	if (!ip_rt_acct)
3431 		panic("IP: failed to allocate ip_rt_acct\n");
3432 #endif
3433 
3434 	ipv4_dst_ops.kmem_cachep =
3435 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3436 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3437 
3438 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3439 
3440 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3441 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3442 
3443 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3444 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3445 
3446 	rt_hash_table = (struct rt_hash_bucket *)
3447 		alloc_large_system_hash("IP route cache",
3448 					sizeof(struct rt_hash_bucket),
3449 					rhash_entries,
3450 					(totalram_pages >= 128 * 1024) ?
3451 					15 : 17,
3452 					0,
3453 					&rt_hash_log,
3454 					&rt_hash_mask,
3455 					0,
3456 					rhash_entries ? 0 : 512 * 1024);
3457 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3458 	rt_hash_lock_init();
3459 
3460 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3461 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3462 
3463 	devinet_init();
3464 	ip_fib_init();
3465 
3466 	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3467 	expires_ljiffies = jiffies;
3468 	schedule_delayed_work(&expires_work,
3469 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3470 
3471 	if (ip_rt_proc_init())
3472 		pr_err("Unable to create route proc files\n");
3473 #ifdef CONFIG_XFRM
3474 	xfrm_init();
3475 	xfrm4_init(ip_rt_max_size);
3476 #endif
3477 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3478 
3479 #ifdef CONFIG_SYSCTL
3480 	register_pernet_subsys(&sysctl_route_ops);
3481 #endif
3482 	register_pernet_subsys(&rt_genid_ops);
3483 	return rc;
3484 }
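/* Sizing note: alloc_large_system_hash() picks the table size from
 * available memory unless "rhash_entries=" was given on the kernel
 * command line, in which case the final argument (the upper limit)
 * becomes 0 so the user's value wins.  The GC threshold then tracks the
 * bucket count and ip_rt_max_size allows 16 entries per bucket.
 */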
3485 
3486 #ifdef CONFIG_SYSCTL
3487 /*
3488  * We really need to sanitize the damn ipv4 init order, then all
3489  * this nonsense will go away.
3490  */
3491 void __init ip_static_sysctl_init(void)
3492 {
3493 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3494 }
3495 #endif
3496