xref: /linux/net/ipv4/route.c (revision 25aee3debe0464f6c680173041fa3de30ec9ff54)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD,
35  *					though our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
78 #include <linux/in.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/workqueue.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <linux/slab.h>
95 #include <linux/prefetch.h>
96 #include <net/dst.h>
97 #include <net/net_namespace.h>
98 #include <net/protocol.h>
99 #include <net/ip.h>
100 #include <net/route.h>
101 #include <net/inetpeer.h>
102 #include <net/sock.h>
103 #include <net/ip_fib.h>
104 #include <net/arp.h>
105 #include <net/tcp.h>
106 #include <net/icmp.h>
107 #include <net/xfrm.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
110 #ifdef CONFIG_SYSCTL
111 #include <linux/sysctl.h>
112 #include <linux/kmemleak.h>
113 #endif
114 #include <net/secure_seq.h>
115 
116 #define RT_FL_TOS(oldflp4) \
117 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118 
119 #define IP_MAX_MTU	0xFFF0
120 
121 #define RT_GC_TIMEOUT (300*HZ)
122 
123 static int ip_rt_max_size;
124 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
126 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
127 static int ip_rt_redirect_number __read_mostly	= 9;
128 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
129 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
130 static int ip_rt_error_cost __read_mostly	= HZ;
131 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
132 static int ip_rt_gc_elasticity __read_mostly	= 8;
133 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
134 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
135 static int ip_rt_min_advmss __read_mostly	= 256;
136 
137 /*
138  *	Interface to generic destination cache.
139  */
140 
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
143 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145 static void		 ipv4_link_failure(struct sk_buff *skb);
146 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
147 					   struct sk_buff *skb, u32 mtu);
148 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
149 					struct sk_buff *skb);
150 static void		ipv4_dst_destroy(struct dst_entry *dst);
151 
152 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
153 			    int how)
154 {
155 }
156 
157 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
158 {
159 	WARN_ON(1);
160 	return NULL;
161 }
162 
163 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
164 					   struct sk_buff *skb,
165 					   const void *daddr);
166 
167 static struct dst_ops ipv4_dst_ops = {
168 	.family =		AF_INET,
169 	.protocol =		cpu_to_be16(ETH_P_IP),
170 	.check =		ipv4_dst_check,
171 	.default_advmss =	ipv4_default_advmss,
172 	.mtu =			ipv4_mtu,
173 	.cow_metrics =		ipv4_cow_metrics,
174 	.destroy =		ipv4_dst_destroy,
175 	.ifdown =		ipv4_dst_ifdown,
176 	.negative_advice =	ipv4_negative_advice,
177 	.link_failure =		ipv4_link_failure,
178 	.update_pmtu =		ip_rt_update_pmtu,
179 	.redirect =		ip_do_redirect,
180 	.local_out =		__ip_local_out,
181 	.neigh_lookup =		ipv4_neigh_lookup,
182 };
183 
184 #define ECN_OR_COST(class)	TC_PRIO_##class
185 
186 const __u8 ip_tos2prio[16] = {
187 	TC_PRIO_BESTEFFORT,
188 	ECN_OR_COST(BESTEFFORT),
189 	TC_PRIO_BESTEFFORT,
190 	ECN_OR_COST(BESTEFFORT),
191 	TC_PRIO_BULK,
192 	ECN_OR_COST(BULK),
193 	TC_PRIO_BULK,
194 	ECN_OR_COST(BULK),
195 	TC_PRIO_INTERACTIVE,
196 	ECN_OR_COST(INTERACTIVE),
197 	TC_PRIO_INTERACTIVE,
198 	ECN_OR_COST(INTERACTIVE),
199 	TC_PRIO_INTERACTIVE_BULK,
200 	ECN_OR_COST(INTERACTIVE_BULK),
201 	TC_PRIO_INTERACTIVE_BULK,
202 	ECN_OR_COST(INTERACTIVE_BULK)
203 };
204 EXPORT_SYMBOL(ip_tos2prio);
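
/* Editor's note: an illustrative sketch (not part of this file) of how
 * the table above is consumed.  The rt_tos2priority() helper in
 * include/net/route.h indexes it with the TOS bits shifted down past
 * the low bit:
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * so, for example, IPTOS_LOWDELAY (0x10) maps to TC_PRIO_INTERACTIVE.
 */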
205 
206 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
207 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
208 
209 static inline int rt_genid(struct net *net)
210 {
211 	return atomic_read(&net->ipv4.rt_genid);
212 }
213 
214 #ifdef CONFIG_PROC_FS
215 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
216 {
217 	if (*pos)
218 		return NULL;
219 	return SEQ_START_TOKEN;
220 }
221 
222 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
223 {
224 	++*pos;
225 	return NULL;
226 }
227 
228 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
229 {
230 }
231 
232 static int rt_cache_seq_show(struct seq_file *seq, void *v)
233 {
234 	if (v == SEQ_START_TOKEN)
235 		seq_printf(seq, "%-127s\n",
236 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
237 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
238 			   "HHUptod\tSpecDst");
239 	return 0;
240 }
241 
242 static const struct seq_operations rt_cache_seq_ops = {
243 	.start  = rt_cache_seq_start,
244 	.next   = rt_cache_seq_next,
245 	.stop   = rt_cache_seq_stop,
246 	.show   = rt_cache_seq_show,
247 };
248 
249 static int rt_cache_seq_open(struct inode *inode, struct file *file)
250 {
251 	return seq_open(file, &rt_cache_seq_ops);
252 }
253 
254 static const struct file_operations rt_cache_seq_fops = {
255 	.owner	 = THIS_MODULE,
256 	.open	 = rt_cache_seq_open,
257 	.read	 = seq_read,
258 	.llseek	 = seq_lseek,
259 	.release = seq_release,
260 };
261 
262 
263 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
264 {
265 	int cpu;
266 
267 	if (*pos == 0)
268 		return SEQ_START_TOKEN;
269 
270 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
271 		if (!cpu_possible(cpu))
272 			continue;
273 		*pos = cpu+1;
274 		return &per_cpu(rt_cache_stat, cpu);
275 	}
276 	return NULL;
277 }
278 
279 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
280 {
281 	int cpu;
282 
283 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
284 		if (!cpu_possible(cpu))
285 			continue;
286 		*pos = cpu+1;
287 		return &per_cpu(rt_cache_stat, cpu);
288 	}
289 	return NULL;
290 
291 }
292 
293 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
294 {
295 
296 }
297 
298 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
299 {
300 	struct rt_cache_stat *st = v;
301 
302 	if (v == SEQ_START_TOKEN) {
303 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
304 		return 0;
305 	}
306 
307 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
308 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
309 		   dst_entries_get_slow(&ipv4_dst_ops),
310 		   st->in_hit,
311 		   st->in_slow_tot,
312 		   st->in_slow_mc,
313 		   st->in_no_route,
314 		   st->in_brd,
315 		   st->in_martian_dst,
316 		   st->in_martian_src,
317 
318 		   st->out_hit,
319 		   st->out_slow_tot,
320 		   st->out_slow_mc,
321 
322 		   st->gc_total,
323 		   st->gc_ignored,
324 		   st->gc_goal_miss,
325 		   st->gc_dst_overflow,
326 		   st->in_hlist_search,
327 		   st->out_hlist_search
328 		);
329 	return 0;
330 }
331 
332 static const struct seq_operations rt_cpu_seq_ops = {
333 	.start  = rt_cpu_seq_start,
334 	.next   = rt_cpu_seq_next,
335 	.stop   = rt_cpu_seq_stop,
336 	.show   = rt_cpu_seq_show,
337 };
338 
339 
340 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
341 {
342 	return seq_open(file, &rt_cpu_seq_ops);
343 }
344 
345 static const struct file_operations rt_cpu_seq_fops = {
346 	.owner	 = THIS_MODULE,
347 	.open	 = rt_cpu_seq_open,
348 	.read	 = seq_read,
349 	.llseek	 = seq_lseek,
350 	.release = seq_release,
351 };
352 
353 #ifdef CONFIG_IP_ROUTE_CLASSID
354 static int rt_acct_proc_show(struct seq_file *m, void *v)
355 {
356 	struct ip_rt_acct *dst, *src;
357 	unsigned int i, j;
358 
359 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
360 	if (!dst)
361 		return -ENOMEM;
362 
363 	for_each_possible_cpu(i) {
364 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
365 		for (j = 0; j < 256; j++) {
366 			dst[j].o_bytes   += src[j].o_bytes;
367 			dst[j].o_packets += src[j].o_packets;
368 			dst[j].i_bytes   += src[j].i_bytes;
369 			dst[j].i_packets += src[j].i_packets;
370 		}
371 	}
372 
373 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
374 	kfree(dst);
375 	return 0;
376 }
377 
378 static int rt_acct_proc_open(struct inode *inode, struct file *file)
379 {
380 	return single_open(file, rt_acct_proc_show, NULL);
381 }
382 
383 static const struct file_operations rt_acct_proc_fops = {
384 	.owner		= THIS_MODULE,
385 	.open		= rt_acct_proc_open,
386 	.read		= seq_read,
387 	.llseek		= seq_lseek,
388 	.release	= single_release,
389 };
390 #endif
391 
392 static int __net_init ip_rt_do_proc_init(struct net *net)
393 {
394 	struct proc_dir_entry *pde;
395 
396 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
397 			&rt_cache_seq_fops);
398 	if (!pde)
399 		goto err1;
400 
401 	pde = proc_create("rt_cache", S_IRUGO,
402 			  net->proc_net_stat, &rt_cpu_seq_fops);
403 	if (!pde)
404 		goto err2;
405 
406 #ifdef CONFIG_IP_ROUTE_CLASSID
407 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
408 	if (!pde)
409 		goto err3;
410 #endif
411 	return 0;
412 
413 #ifdef CONFIG_IP_ROUTE_CLASSID
414 err3:
415 	remove_proc_entry("rt_cache", net->proc_net_stat);
416 #endif
417 err2:
418 	remove_proc_entry("rt_cache", net->proc_net);
419 err1:
420 	return -ENOMEM;
421 }
422 
423 static void __net_exit ip_rt_do_proc_exit(struct net *net)
424 {
425 	remove_proc_entry("rt_cache", net->proc_net_stat);
426 	remove_proc_entry("rt_cache", net->proc_net);
427 #ifdef CONFIG_IP_ROUTE_CLASSID
428 	remove_proc_entry("rt_acct", net->proc_net);
429 #endif
430 }
431 
432 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
433 	.init = ip_rt_do_proc_init,
434 	.exit = ip_rt_do_proc_exit,
435 };
436 
437 static int __init ip_rt_proc_init(void)
438 {
439 	return register_pernet_subsys(&ip_rt_proc_ops);
440 }
441 
442 #else
443 static inline int ip_rt_proc_init(void)
444 {
445 	return 0;
446 }
447 #endif /* CONFIG_PROC_FS */
448 
449 static inline bool rt_is_expired(const struct rtable *rth)
450 {
451 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
452 }
453 
454 /*
455  * Perturbation of rt_genid by a small quantity [1..256]
456  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
457  * many times (2^24) without reusing a recent rt_genid.
458  * Jenkins hash is strong enough that little changes of rt_genid are OK.
459  */
460 static void rt_cache_invalidate(struct net *net)
461 {
462 	unsigned char shuffle;
463 
464 	get_random_bytes(&shuffle, sizeof(shuffle));
465 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
466 }
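
/* Editor's note: a worked example of the perturbation above.  Each
 * invalidation advances rt_genid by shuffle + 1, a value in [1..256].
 * Since a single step never exceeds 2^8 and the counter is 32 bits
 * wide, at least 2^32 / 2^8 = 2^24 invalidations must happen before
 * the counter can wrap back to a recently handed-out value, so stale
 * rtables keep failing the rt_is_expired() comparison.
 */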
467 
468 /*
469  * delay < 0  : invalidate cache (fast : entries will be deleted later)
470  * delay >= 0 : invalidate & flush cache (can be long)
471  */
472 void rt_cache_flush(struct net *net, int delay)
473 {
474 	rt_cache_invalidate(net);
475 }
476 
477 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
478 					   struct sk_buff *skb,
479 					   const void *daddr)
480 {
481 	struct net_device *dev = dst->dev;
482 	const __be32 *pkey = daddr;
483 	const struct rtable *rt;
484 	struct neighbour *n;
485 
486 	rt = (const struct rtable *) dst;
487 	if (rt->rt_gateway)
488 		pkey = (const __be32 *) &rt->rt_gateway;
489 	else if (skb)
490 		pkey = &ip_hdr(skb)->daddr;
491 
492 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
493 	if (n)
494 		return n;
495 	return neigh_create(&arp_tbl, pkey, dev);
496 }
497 
498 /*
499  * Peer allocation may fail only in serious out-of-memory conditions.  However,
500  * we can still generate some output.
501  * Random ID selection looks a bit dangerous because we have no chance of
502  * selecting an ID that stays unique for a reasonable period of time.
503  * But a broken packet identifier may be better than no packet at all.
504  */
505 static void ip_select_fb_ident(struct iphdr *iph)
506 {
507 	static DEFINE_SPINLOCK(ip_fb_id_lock);
508 	static u32 ip_fallback_id;
509 	u32 salt;
510 
511 	spin_lock_bh(&ip_fb_id_lock);
512 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
513 	iph->id = htons(salt & 0xFFFF);
514 	ip_fallback_id = salt;
515 	spin_unlock_bh(&ip_fb_id_lock);
516 }
517 
518 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
519 {
520 	struct net *net = dev_net(dst->dev);
521 	struct inet_peer *peer;
522 
523 	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
524 	if (peer) {
525 		iph->id = htons(inet_getid(peer, more));
526 		inet_putpeer(peer);
527 		return;
528 	}
529 
530 	ip_select_fb_ident(iph);
531 }
532 EXPORT_SYMBOL(__ip_select_ident);
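
/* Editor's note: a hedged sketch of the usual entry point.  Callers
 * normally reach the function above through the ip_select_ident()
 * inline in include/net/ip.h, which (roughly) skips the peer lookup
 * when the ID does not matter because DF is set:
 *
 *	if (iph->frag_off & htons(IP_DF))
 *		iph->id = (sk && inet_sk(sk)->inet_daddr) ?
 *			  htons(inet_sk(sk)->inet_id++) : 0;
 *	else
 *		__ip_select_ident(iph, dst, 0);
 */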
533 
534 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
535 			     const struct iphdr *iph,
536 			     int oif, u8 tos,
537 			     u8 prot, u32 mark, int flow_flags)
538 {
539 	if (sk) {
540 		const struct inet_sock *inet = inet_sk(sk);
541 
542 		oif = sk->sk_bound_dev_if;
543 		mark = sk->sk_mark;
544 		tos = RT_CONN_FLAGS(sk);
545 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
546 	}
547 	flowi4_init_output(fl4, oif, mark, tos,
548 			   RT_SCOPE_UNIVERSE, prot,
549 			   flow_flags,
550 			   iph->daddr, iph->saddr, 0, 0);
551 }
552 
553 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
554 			       const struct sock *sk)
555 {
556 	const struct iphdr *iph = ip_hdr(skb);
557 	int oif = skb->dev->ifindex;
558 	u8 tos = RT_TOS(iph->tos);
559 	u8 prot = iph->protocol;
560 	u32 mark = skb->mark;
561 
562 	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
563 }
564 
565 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
566 {
567 	const struct inet_sock *inet = inet_sk(sk);
568 	const struct ip_options_rcu *inet_opt;
569 	__be32 daddr = inet->inet_daddr;
570 
571 	rcu_read_lock();
572 	inet_opt = rcu_dereference(inet->inet_opt);
573 	if (inet_opt && inet_opt->opt.srr)
574 		daddr = inet_opt->opt.faddr;
575 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
576 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
577 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
578 			   inet_sk_flowi_flags(sk),
579 			   daddr, inet->inet_saddr, 0, 0);
580 	rcu_read_unlock();
581 }
582 
583 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
584 				 const struct sk_buff *skb)
585 {
586 	if (skb)
587 		build_skb_flow_key(fl4, skb, sk);
588 	else
589 		build_sk_flow_key(fl4, sk);
590 }
591 
592 static inline void rt_free(struct rtable *rt)
593 {
594 	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
595 }
596 
597 static DEFINE_SPINLOCK(fnhe_lock);
598 
599 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
600 {
601 	struct fib_nh_exception *fnhe, *oldest;
602 	struct rtable *orig;
603 
604 	oldest = rcu_dereference(hash->chain);
605 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
606 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
607 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
608 			oldest = fnhe;
609 	}
610 	orig = rcu_dereference(oldest->fnhe_rth);
611 	if (orig) {
612 		RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
613 		rt_free(orig);
614 	}
615 	return oldest;
616 }
617 
618 static inline u32 fnhe_hashfun(__be32 daddr)
619 {
620 	u32 hval;
621 
622 	hval = (__force u32) daddr;
623 	hval ^= (hval >> 11) ^ (hval >> 22);
624 
625 	return hval & (FNHE_HASH_SIZE - 1);
626 }
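
/* Editor's note: the fold above is deliberately cheap rather than a
 * strong hash; it XORs the upper bits of daddr down so that all 32
 * bits of the address can influence the low-order bits kept by the
 * FNHE_HASH_SIZE - 1 mask.  Two addresses differing only in their top
 * octet typically still land in different buckets.
 */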
627 
628 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
629 				  u32 pmtu, unsigned long expires)
630 {
631 	struct fnhe_hash_bucket *hash;
632 	struct fib_nh_exception *fnhe;
633 	int depth;
634 	u32 hval = fnhe_hashfun(daddr);
635 
636 	spin_lock_bh(&fnhe_lock);
637 
638 	hash = nh->nh_exceptions;
639 	if (!hash) {
640 		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
641 		if (!hash)
642 			goto out_unlock;
643 		nh->nh_exceptions = hash;
644 	}
645 
646 	hash += hval;
647 
648 	depth = 0;
649 	for (fnhe = rcu_dereference(hash->chain); fnhe;
650 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
651 		if (fnhe->fnhe_daddr == daddr)
652 			break;
653 		depth++;
654 	}
655 
656 	if (fnhe) {
657 		if (gw)
658 			fnhe->fnhe_gw = gw;
659 		if (pmtu) {
660 			fnhe->fnhe_pmtu = pmtu;
661 			fnhe->fnhe_expires = expires;
662 		}
663 	} else {
664 		if (depth > FNHE_RECLAIM_DEPTH)
665 			fnhe = fnhe_oldest(hash);
666 		else {
667 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
668 			if (!fnhe)
669 				goto out_unlock;
670 
671 			fnhe->fnhe_next = hash->chain;
672 			rcu_assign_pointer(hash->chain, fnhe);
673 		}
674 		fnhe->fnhe_daddr = daddr;
675 		fnhe->fnhe_gw = gw;
676 		fnhe->fnhe_pmtu = pmtu;
677 		fnhe->fnhe_expires = expires;
678 	}
679 
680 	fnhe->fnhe_stamp = jiffies;
681 
682 out_unlock:
683 	spin_unlock_bh(&fnhe_lock);
684 	return;
685 }
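
/* Editor's note: both consumers of this helper appear below.
 * __ip_do_redirect() records a learned gateway via
 * update_or_create_fnhe(nh, fl4->daddr, new_gw, 0, 0), while
 * __ip_rt_update_pmtu() records a learned path MTU via
 * update_or_create_fnhe(nh, fl4->daddr, 0, mtu, jiffies + ip_rt_mtu_expires).
 * For an existing exception, a zero gw or pmtu argument leaves the
 * corresponding field untouched.
 */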
686 
687 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
688 			     bool kill_route)
689 {
690 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
691 	__be32 old_gw = ip_hdr(skb)->saddr;
692 	struct net_device *dev = skb->dev;
693 	struct in_device *in_dev;
694 	struct fib_result res;
695 	struct neighbour *n;
696 	struct net *net;
697 
698 	switch (icmp_hdr(skb)->code & 7) {
699 	case ICMP_REDIR_NET:
700 	case ICMP_REDIR_NETTOS:
701 	case ICMP_REDIR_HOST:
702 	case ICMP_REDIR_HOSTTOS:
703 		break;
704 
705 	default:
706 		return;
707 	}
708 
709 	if (rt->rt_gateway != old_gw)
710 		return;
711 
712 	in_dev = __in_dev_get_rcu(dev);
713 	if (!in_dev)
714 		return;
715 
716 	net = dev_net(dev);
717 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
718 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
719 	    ipv4_is_zeronet(new_gw))
720 		goto reject_redirect;
721 
722 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
723 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
724 			goto reject_redirect;
725 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
726 			goto reject_redirect;
727 	} else {
728 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
729 			goto reject_redirect;
730 	}
731 
732 	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
733 	if (n) {
734 		if (!(n->nud_state & NUD_VALID)) {
735 			neigh_event_send(n, NULL);
736 		} else {
737 			if (fib_lookup(net, fl4, &res) == 0) {
738 				struct fib_nh *nh = &FIB_RES_NH(res);
739 
740 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
741 						      0, 0);
742 			}
743 			if (kill_route)
744 				rt->dst.obsolete = DST_OBSOLETE_KILL;
745 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
746 		}
747 		neigh_release(n);
748 	}
749 	return;
750 
751 reject_redirect:
752 #ifdef CONFIG_IP_ROUTE_VERBOSE
753 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
754 		const struct iphdr *iph = (const struct iphdr *) skb->data;
755 		__be32 daddr = iph->daddr;
756 		__be32 saddr = iph->saddr;
757 
758 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
759 				     "  Advised path = %pI4 -> %pI4\n",
760 				     &old_gw, dev->name, &new_gw,
761 				     &saddr, &daddr);
762 	}
763 #endif
764 	;
765 }
766 
767 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
768 {
769 	struct rtable *rt;
770 	struct flowi4 fl4;
771 
772 	rt = (struct rtable *) dst;
773 
774 	ip_rt_build_flow_key(&fl4, sk, skb);
775 	__ip_do_redirect(rt, skb, &fl4, true);
776 }
777 
778 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
779 {
780 	struct rtable *rt = (struct rtable *)dst;
781 	struct dst_entry *ret = dst;
782 
783 	if (rt) {
784 		if (dst->obsolete > 0) {
785 			ip_rt_put(rt);
786 			ret = NULL;
787 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
788 			   rt->dst.expires) {
789 			ip_rt_put(rt);
790 			ret = NULL;
791 		}
792 	}
793 	return ret;
794 }
795 
796 /*
797  * Algorithm:
798  *	1. The first ip_rt_redirect_number redirects are sent
799  *	   with exponential backoff, then we stop sending them altogether,
800  *	   assuming that the host ignores our redirects.
801  *	2. If we did not see packets requiring redirects
802  *	   during ip_rt_redirect_silence, we assume that the host
803  *	   has forgotten the redirected route and start to send redirects again.
804  *
805  * This algorithm is much cheaper and more intelligent than dumb load limiting
806  * in icmp.c.
807  *
808  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
809  * and "frag. need" (breaks PMTU discovery) in icmp.c.
810  */
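/* Editor's note: a worked example with the defaults above, assuming
 * HZ = 1000.  The first redirect is sent immediately (rate_tokens is
 * 0); before the (k+1)-th one, ip_rt_send_redirect() below enforces a
 * gap of ip_rt_redirect_load << k = 20 << k jiffies (40ms, 80ms,
 * 160ms, ...), and after ip_rt_redirect_number = 9 redirects it goes
 * silent.  If no redirect-worthy packet is seen for
 * ip_rt_redirect_silence = 20480 jiffies (~20s), rate_tokens is reset
 * and the cycle restarts.
 */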
811 
812 void ip_rt_send_redirect(struct sk_buff *skb)
813 {
814 	struct rtable *rt = skb_rtable(skb);
815 	struct in_device *in_dev;
816 	struct inet_peer *peer;
817 	struct net *net;
818 	int log_martians;
819 
820 	rcu_read_lock();
821 	in_dev = __in_dev_get_rcu(rt->dst.dev);
822 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
823 		rcu_read_unlock();
824 		return;
825 	}
826 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
827 	rcu_read_unlock();
828 
829 	net = dev_net(rt->dst.dev);
830 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
831 	if (!peer) {
832 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
833 		return;
834 	}
835 
836 	/* No redirected packets during ip_rt_redirect_silence;
837 	 * reset the algorithm.
838 	 */
839 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
840 		peer->rate_tokens = 0;
841 
842 	/* Too many ignored redirects; do not send anything and
843 	 * set peer->rate_last to the time of the last seen redirected packet.
844 	 */
845 	if (peer->rate_tokens >= ip_rt_redirect_number) {
846 		peer->rate_last = jiffies;
847 		goto out_put_peer;
848 	}
849 
850 	/* Check for load limit; set rate_last to the latest sent
851 	 * redirect.
852 	 */
853 	if (peer->rate_tokens == 0 ||
854 	    time_after(jiffies,
855 		       (peer->rate_last +
856 			(ip_rt_redirect_load << peer->rate_tokens)))) {
857 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
858 		peer->rate_last = jiffies;
859 		++peer->rate_tokens;
860 #ifdef CONFIG_IP_ROUTE_VERBOSE
861 		if (log_martians &&
862 		    peer->rate_tokens == ip_rt_redirect_number)
863 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
864 					     &ip_hdr(skb)->saddr, inet_iif(skb),
865 					     &ip_hdr(skb)->daddr, &rt->rt_gateway);
866 #endif
867 	}
868 out_put_peer:
869 	inet_putpeer(peer);
870 }
871 
872 static int ip_error(struct sk_buff *skb)
873 {
874 	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
875 	struct rtable *rt = skb_rtable(skb);
876 	struct inet_peer *peer;
877 	unsigned long now;
878 	struct net *net;
879 	bool send;
880 	int code;
881 
882 	net = dev_net(rt->dst.dev);
883 	if (!IN_DEV_FORWARD(in_dev)) {
884 		switch (rt->dst.error) {
885 		case EHOSTUNREACH:
886 			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
887 			break;
888 
889 		case ENETUNREACH:
890 			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
891 			break;
892 		}
893 		goto out;
894 	}
895 
896 	switch (rt->dst.error) {
897 	case EINVAL:
898 	default:
899 		goto out;
900 	case EHOSTUNREACH:
901 		code = ICMP_HOST_UNREACH;
902 		break;
903 	case ENETUNREACH:
904 		code = ICMP_NET_UNREACH;
905 		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
906 		break;
907 	case EACCES:
908 		code = ICMP_PKT_FILTERED;
909 		break;
910 	}
911 
912 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
913 
914 	send = true;
915 	if (peer) {
916 		now = jiffies;
917 		peer->rate_tokens += now - peer->rate_last;
918 		if (peer->rate_tokens > ip_rt_error_burst)
919 			peer->rate_tokens = ip_rt_error_burst;
920 		peer->rate_last = now;
921 		if (peer->rate_tokens >= ip_rt_error_cost)
922 			peer->rate_tokens -= ip_rt_error_cost;
923 		else
924 			send = false;
925 		inet_putpeer(peer);
926 	}
927 	if (send)
928 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
929 
930 out:	kfree_skb(skb);
931 	return 0;
932 }
933 
934 static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
935 {
936 	struct fib_result res;
937 
938 	if (mtu < ip_rt_min_pmtu)
939 		mtu = ip_rt_min_pmtu;
940 
941 	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
942 		struct fib_nh *nh = &FIB_RES_NH(res);
943 
944 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
945 				      jiffies + ip_rt_mtu_expires);
946 	}
947 	return mtu;
948 }
949 
950 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
951 			      struct sk_buff *skb, u32 mtu)
952 {
953 	struct rtable *rt = (struct rtable *) dst;
954 	struct flowi4 fl4;
955 
956 	ip_rt_build_flow_key(&fl4, sk, skb);
957 	mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
958 
959 	if (!rt->rt_pmtu) {
960 		dst->obsolete = DST_OBSOLETE_KILL;
961 	} else {
962 		rt->rt_pmtu = mtu;
963 		dst_set_expires(&rt->dst, ip_rt_mtu_expires);
964 	}
965 }
966 
967 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
968 		      int oif, u32 mark, u8 protocol, int flow_flags)
969 {
970 	const struct iphdr *iph = (const struct iphdr *) skb->data;
971 	struct flowi4 fl4;
972 	struct rtable *rt;
973 
974 	__build_flow_key(&fl4, NULL, iph, oif,
975 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
976 	rt = __ip_route_output_key(net, &fl4);
977 	if (!IS_ERR(rt)) {
978 		__ip_rt_update_pmtu(rt, &fl4, mtu);
979 		ip_rt_put(rt);
980 	}
981 }
982 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
983 
984 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
985 {
986 	const struct iphdr *iph = (const struct iphdr *) skb->data;
987 	struct flowi4 fl4;
988 	struct rtable *rt;
989 
990 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
991 	rt = __ip_route_output_key(sock_net(sk), &fl4);
992 	if (!IS_ERR(rt)) {
993 		__ip_rt_update_pmtu(rt, &fl4, mtu);
994 		ip_rt_put(rt);
995 	}
996 }
997 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
998 
999 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1000 		   int oif, u32 mark, u8 protocol, int flow_flags)
1001 {
1002 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1003 	struct flowi4 fl4;
1004 	struct rtable *rt;
1005 
1006 	__build_flow_key(&fl4, NULL, iph, oif,
1007 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1008 	rt = __ip_route_output_key(net, &fl4);
1009 	if (!IS_ERR(rt)) {
1010 		__ip_do_redirect(rt, skb, &fl4, false);
1011 		ip_rt_put(rt);
1012 	}
1013 }
1014 EXPORT_SYMBOL_GPL(ipv4_redirect);
1015 
1016 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1017 {
1018 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1019 	struct flowi4 fl4;
1020 	struct rtable *rt;
1021 
1022 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1023 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1024 	if (!IS_ERR(rt)) {
1025 		__ip_do_redirect(rt, skb, &fl4, false);
1026 		ip_rt_put(rt);
1027 	}
1028 }
1029 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1030 
1031 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1032 {
1033 	struct rtable *rt = (struct rtable *) dst;
1034 
1035 	/* All IPv4 dsts are created with ->obsolete set to the value
1036 	 * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
1037 	 * down into this function.
1038 	 *
1039 	 * When a PMTU/redirect information update invalidates a
1040 	 * route, this is indicated by setting obsolete to
1041 	 * DST_OBSOLETE_KILL.
1042 	 */
1043 	if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1044 		return NULL;
1045 	return dst;
1046 }
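
/* Editor's note: an illustrative sketch of the caller side.  Socket
 * code revalidates a cached route through helpers such as
 * __sk_dst_check(), which only invoke the ->check callback above when
 * ->obsolete is non-zero -- hence DST_OBSOLETE_FORCE_CHK on every
 * IPv4 route:
 *
 *	struct dst_entry *dst = __sk_dst_check(sk, 0);
 *
 *	if (!dst)
 *		... the route was invalidated, redo the lookup ...
 */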
1047 
1048 static void ipv4_link_failure(struct sk_buff *skb)
1049 {
1050 	struct rtable *rt;
1051 
1052 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1053 
1054 	rt = skb_rtable(skb);
1055 	if (rt)
1056 		dst_set_expires(&rt->dst, 0);
1057 }
1058 
1059 static int ip_rt_bug(struct sk_buff *skb)
1060 {
1061 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1062 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1063 		 skb->dev ? skb->dev->name : "?");
1064 	kfree_skb(skb);
1065 	WARN_ON(1);
1066 	return 0;
1067 }
1068 
1069 /*
1070    We do not cache the source address of the outgoing interface,
1071    because it is used only by the IP RR, TS and SRR options,
1072    so it is out of the fast path.
1073 
1074    BTW, remember: "addr" is allowed to be unaligned
1075    in IP options!
1076  */
1077 
1078 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1079 {
1080 	__be32 src;
1081 
1082 	if (rt_is_output_route(rt))
1083 		src = ip_hdr(skb)->saddr;
1084 	else {
1085 		struct fib_result res;
1086 		struct flowi4 fl4;
1087 		struct iphdr *iph;
1088 
1089 		iph = ip_hdr(skb);
1090 
1091 		memset(&fl4, 0, sizeof(fl4));
1092 		fl4.daddr = iph->daddr;
1093 		fl4.saddr = iph->saddr;
1094 		fl4.flowi4_tos = RT_TOS(iph->tos);
1095 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1096 		fl4.flowi4_iif = skb->dev->ifindex;
1097 		fl4.flowi4_mark = skb->mark;
1098 
1099 		rcu_read_lock();
1100 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1101 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1102 		else
1103 			src = inet_select_addr(rt->dst.dev,
1104 					       rt_nexthop(rt, iph->daddr),
1105 					       RT_SCOPE_UNIVERSE);
1106 		rcu_read_unlock();
1107 	}
1108 	memcpy(addr, &src, 4);
1109 }
1110 
1111 #ifdef CONFIG_IP_ROUTE_CLASSID
1112 static void set_class_tag(struct rtable *rt, u32 tag)
1113 {
1114 	if (!(rt->dst.tclassid & 0xFFFF))
1115 		rt->dst.tclassid |= tag & 0xFFFF;
1116 	if (!(rt->dst.tclassid & 0xFFFF0000))
1117 		rt->dst.tclassid |= tag & 0xFFFF0000;
1118 }
1119 #endif
1120 
1121 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1122 {
1123 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1124 
1125 	if (advmss == 0) {
1126 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1127 			       ip_rt_min_advmss);
1128 		if (advmss > 65535 - 40)
1129 			advmss = 65535 - 40;
1130 	}
1131 	return advmss;
1132 }
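
/* Editor's note: the constant 40 above is the option-less IPv4 header
 * (20 bytes) plus the option-less TCP header (20 bytes), so the
 * advertised MSS defaults to the device MTU minus those headers; the
 * clamp to 65535 - 40 keeps the resulting segment within the 16-bit
 * IP total-length limit.
 */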
1133 
1134 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1135 {
1136 	const struct rtable *rt = (const struct rtable *) dst;
1137 	unsigned int mtu = rt->rt_pmtu;
1138 
1139 	if (mtu && time_after_eq(jiffies, rt->dst.expires))
1140 		mtu = 0;
1141 
1142 	if (!mtu)
1143 		mtu = dst_metric_raw(dst, RTAX_MTU);
1144 
1145 	if (mtu && rt_is_output_route(rt))
1146 		return mtu;
1147 
1148 	mtu = dst->dev->mtu;
1149 
1150 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1151 		if (rt->rt_gateway && mtu > 576)
1152 			mtu = 576;
1153 	}
1154 
1155 	if (mtu > IP_MAX_MTU)
1156 		mtu = IP_MAX_MTU;
1157 
1158 	return mtu;
1159 }
1160 
1161 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1162 {
1163 	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1164 	struct fib_nh_exception *fnhe;
1165 	u32 hval;
1166 
1167 	if (!hash)
1168 		return NULL;
1169 
1170 	hval = fnhe_hashfun(daddr);
1171 
1172 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1173 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1174 		if (fnhe->fnhe_daddr == daddr)
1175 			return fnhe;
1176 	}
1177 	return NULL;
1178 }
1179 
1180 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1181 			      __be32 daddr)
1182 {
1183 	bool ret = false;
1184 
1185 	spin_lock_bh(&fnhe_lock);
1186 
1187 	if (daddr == fnhe->fnhe_daddr) {
1188 		struct rtable *orig;
1189 
1190 		if (fnhe->fnhe_pmtu) {
1191 			unsigned long expires = fnhe->fnhe_expires;
1192 			unsigned long diff = expires - jiffies;
1193 
1194 			if (time_before(jiffies, expires)) {
1195 				rt->rt_pmtu = fnhe->fnhe_pmtu;
1196 				dst_set_expires(&rt->dst, diff);
1197 			}
1198 		}
1199 		if (fnhe->fnhe_gw) {
1200 			rt->rt_flags |= RTCF_REDIRECTED;
1201 			rt->rt_gateway = fnhe->fnhe_gw;
1202 		}
1203 
1204 		orig = rcu_dereference(fnhe->fnhe_rth);
1205 		rcu_assign_pointer(fnhe->fnhe_rth, rt);
1206 		if (orig)
1207 			rt_free(orig);
1208 
1209 		fnhe->fnhe_stamp = jiffies;
1210 		ret = true;
1211 	} else {
1212 		/* Routes we intend to cache in nexthop exception have
1213 		 * the DST_NOCACHE bit clear.  However, if we are
1214 		 * unsuccessful at storing this route into the cache
1215 		 * we really need to set it.
1216 		 */
1217 		rt->dst.flags |= DST_NOCACHE;
1218 	}
1219 	spin_unlock_bh(&fnhe_lock);
1220 
1221 	return ret;
1222 }
1223 
1224 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1225 {
1226 	struct rtable *orig, *prev, **p;
1227 	bool ret = true;
1228 
1229 	if (rt_is_input_route(rt)) {
1230 		p = (struct rtable **)&nh->nh_rth_input;
1231 	} else {
1232 		if (!nh->nh_pcpu_rth_output)
1233 			goto nocache;
1234 		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1235 	}
1236 	orig = *p;
1237 
1238 	prev = cmpxchg(p, orig, rt);
1239 	if (prev == orig) {
1240 		if (orig)
1241 			rt_free(orig);
1242 	} else {
1243 		/* Routes we intend to cache in the FIB nexthop have
1244 		 * the DST_NOCACHE bit clear.  However, if we are
1245 		 * unsuccessful at storing this route into the cache
1246 		 * we really need to set it.
1247 		 */
1248 nocache:
1249 		rt->dst.flags |= DST_NOCACHE;
1250 		ret = false;
1251 	}
1252 
1253 	return ret;
1254 }
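
/* Editor's note: the cmpxchg above publishes the route lock-free: the
 * CPU that still observes the old pointer wins and frees the previous
 * route via RCU, while a loser keeps its route private by setting
 * DST_NOCACHE, which also makes rt_set_nexthop() put it on the
 * uncached list so ipv4_dst_destroy() can unlink it later.  Readers
 * pair with this via rcu_dereference(), as in __mkroute_input().
 */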
1255 
1256 static DEFINE_SPINLOCK(rt_uncached_lock);
1257 static LIST_HEAD(rt_uncached_list);
1258 
1259 static void rt_add_uncached_list(struct rtable *rt)
1260 {
1261 	spin_lock_bh(&rt_uncached_lock);
1262 	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1263 	spin_unlock_bh(&rt_uncached_lock);
1264 }
1265 
1266 static void ipv4_dst_destroy(struct dst_entry *dst)
1267 {
1268 	struct rtable *rt = (struct rtable *) dst;
1269 
1270 	if (dst->flags & DST_NOCACHE) {
1271 		spin_lock_bh(&rt_uncached_lock);
1272 		list_del(&rt->rt_uncached);
1273 		spin_unlock_bh(&rt_uncached_lock);
1274 	}
1275 }
1276 
1277 void rt_flush_dev(struct net_device *dev)
1278 {
1279 	if (!list_empty(&rt_uncached_list)) {
1280 		struct net *net = dev_net(dev);
1281 		struct rtable *rt;
1282 
1283 		spin_lock_bh(&rt_uncached_lock);
1284 		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1285 			if (rt->dst.dev != dev)
1286 				continue;
1287 			rt->dst.dev = net->loopback_dev;
1288 			dev_hold(rt->dst.dev);
1289 			dev_put(dev);
1290 		}
1291 		spin_unlock_bh(&rt_uncached_lock);
1292 	}
1293 }
1294 
1295 static bool rt_cache_valid(const struct rtable *rt)
1296 {
1297 	return	rt &&
1298 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1299 		!rt_is_expired(rt);
1300 }
1301 
1302 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1303 			   const struct fib_result *res,
1304 			   struct fib_nh_exception *fnhe,
1305 			   struct fib_info *fi, u16 type, u32 itag)
1306 {
1307 	bool cached = false;
1308 
1309 	if (fi) {
1310 		struct fib_nh *nh = &FIB_RES_NH(*res);
1311 
1312 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1313 			rt->rt_gateway = nh->nh_gw;
1314 		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1315 #ifdef CONFIG_IP_ROUTE_CLASSID
1316 		rt->dst.tclassid = nh->nh_tclassid;
1317 #endif
1318 		if (unlikely(fnhe))
1319 			cached = rt_bind_exception(rt, fnhe, daddr);
1320 		else if (!(rt->dst.flags & DST_NOCACHE))
1321 			cached = rt_cache_route(nh, rt);
1322 	}
1323 	if (unlikely(!cached))
1324 		rt_add_uncached_list(rt);
1325 
1326 #ifdef CONFIG_IP_ROUTE_CLASSID
1327 #ifdef CONFIG_IP_MULTIPLE_TABLES
1328 	set_class_tag(rt, res->tclassid);
1329 #endif
1330 	set_class_tag(rt, itag);
1331 #endif
1332 }
1333 
1334 static struct rtable *rt_dst_alloc(struct net_device *dev,
1335 				   bool nopolicy, bool noxfrm, bool will_cache)
1336 {
1337 	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1338 			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1339 			 (nopolicy ? DST_NOPOLICY : 0) |
1340 			 (noxfrm ? DST_NOXFRM : 0));
1341 }
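
/* Editor's note: when will_cache is set, the route starts without
 * DST_HOST and DST_NOCACHE, leaving rt_cache_route() or
 * rt_bind_exception() free to set DST_NOCACHE later if publishing
 * fails.  The output path passes its fib_info pointer as will_cache,
 * as in __mkroute_output() below:
 *
 *	rth = rt_dst_alloc(dev_out,
 *			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
 *			   IN_DEV_CONF_GET(in_dev, NOXFRM), fi);
 */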
1342 
1343 /* called in rcu_read_lock() section */
1344 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1345 				u8 tos, struct net_device *dev, int our)
1346 {
1347 	struct rtable *rth;
1348 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1349 	u32 itag = 0;
1350 	int err;
1351 
1352 	/* Primary sanity checks. */
1353 
1354 	if (in_dev == NULL)
1355 		return -EINVAL;
1356 
1357 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1358 	    skb->protocol != htons(ETH_P_IP))
1359 		goto e_inval;
1360 
1361 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1362 		if (ipv4_is_loopback(saddr))
1363 			goto e_inval;
1364 
1365 	if (ipv4_is_zeronet(saddr)) {
1366 		if (!ipv4_is_local_multicast(daddr))
1367 			goto e_inval;
1368 	} else {
1369 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1370 					  in_dev, &itag);
1371 		if (err < 0)
1372 			goto e_err;
1373 	}
1374 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1375 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1376 	if (!rth)
1377 		goto e_nobufs;
1378 
1379 #ifdef CONFIG_IP_ROUTE_CLASSID
1380 	rth->dst.tclassid = itag;
1381 #endif
1382 	rth->dst.output = ip_rt_bug;
1383 
1384 	rth->rt_genid	= rt_genid(dev_net(dev));
1385 	rth->rt_flags	= RTCF_MULTICAST;
1386 	rth->rt_type	= RTN_MULTICAST;
1387 	rth->rt_is_input= 1;
1388 	rth->rt_iif	= 0;
1389 	rth->rt_pmtu	= 0;
1390 	rth->rt_gateway	= 0;
1391 	INIT_LIST_HEAD(&rth->rt_uncached);
1392 	if (our) {
1393 		rth->dst.input= ip_local_deliver;
1394 		rth->rt_flags |= RTCF_LOCAL;
1395 	}
1396 
1397 #ifdef CONFIG_IP_MROUTE
1398 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1399 		rth->dst.input = ip_mr_input;
1400 #endif
1401 	RT_CACHE_STAT_INC(in_slow_mc);
1402 
1403 	skb_dst_set(skb, &rth->dst);
1404 	return 0;
1405 
1406 e_nobufs:
1407 	return -ENOBUFS;
1408 e_inval:
1409 	return -EINVAL;
1410 e_err:
1411 	return err;
1412 }
1413 
1414 
1415 static void ip_handle_martian_source(struct net_device *dev,
1416 				     struct in_device *in_dev,
1417 				     struct sk_buff *skb,
1418 				     __be32 daddr,
1419 				     __be32 saddr)
1420 {
1421 	RT_CACHE_STAT_INC(in_martian_src);
1422 #ifdef CONFIG_IP_ROUTE_VERBOSE
1423 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1424 		/*
1425 		 *	RFC1812 recommendation: if the source is martian,
1426 		 *	the only hint is the MAC header.
1427 		 */
1428 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1429 			&daddr, &saddr, dev->name);
1430 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1431 			print_hex_dump(KERN_WARNING, "ll header: ",
1432 				       DUMP_PREFIX_OFFSET, 16, 1,
1433 				       skb_mac_header(skb),
1434 				       dev->hard_header_len, true);
1435 		}
1436 	}
1437 #endif
1438 }
1439 
1440 /* called in rcu_read_lock() section */
1441 static int __mkroute_input(struct sk_buff *skb,
1442 			   const struct fib_result *res,
1443 			   struct in_device *in_dev,
1444 			   __be32 daddr, __be32 saddr, u32 tos)
1445 {
1446 	struct rtable *rth;
1447 	int err;
1448 	struct in_device *out_dev;
1449 	unsigned int flags = 0;
1450 	bool do_cache;
1451 	u32 itag;
1452 
1453 	/* get a working reference to the output device */
1454 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1455 	if (out_dev == NULL) {
1456 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1457 		return -EINVAL;
1458 	}
1459 
1460 
1461 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1462 				  in_dev->dev, in_dev, &itag);
1463 	if (err < 0) {
1464 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1465 					 saddr);
1466 
1467 		goto cleanup;
1468 	}
1469 
1470 	if (out_dev == in_dev && err &&
1471 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1472 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1473 		flags |= RTCF_DOREDIRECT;
1474 
1475 	if (skb->protocol != htons(ETH_P_IP)) {
1476 		/* Not IP (i.e. ARP). Do not create a route if it is
1477 		 * invalid for proxy arp. DNAT routes are always valid.
1478 		 *
1479 		 * The proxy arp feature has been extended to allow ARP
1480 		 * replies back out the same interface, to support
1481 		 * Private VLAN switch technologies. See arp.c.
1482 		 */
1483 		if (out_dev == in_dev &&
1484 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1485 			err = -EINVAL;
1486 			goto cleanup;
1487 		}
1488 	}
1489 
1490 	do_cache = false;
1491 	if (res->fi) {
1492 		if (!itag) {
1493 			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1494 			if (rt_cache_valid(rth)) {
1495 				skb_dst_set_noref(skb, &rth->dst);
1496 				goto out;
1497 			}
1498 			do_cache = true;
1499 		}
1500 	}
1501 
1502 	rth = rt_dst_alloc(out_dev->dev,
1503 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1504 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1505 	if (!rth) {
1506 		err = -ENOBUFS;
1507 		goto cleanup;
1508 	}
1509 
1510 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1511 	rth->rt_flags = flags;
1512 	rth->rt_type = res->type;
1513 	rth->rt_is_input = 1;
1514 	rth->rt_iif 	= 0;
1515 	rth->rt_pmtu	= 0;
1516 	rth->rt_gateway	= 0;
1517 	INIT_LIST_HEAD(&rth->rt_uncached);
1518 
1519 	rth->dst.input = ip_forward;
1520 	rth->dst.output = ip_output;
1521 
1522 	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1523 	skb_dst_set(skb, &rth->dst);
1524 out:
1525 	err = 0;
1526  cleanup:
1527 	return err;
1528 }
1529 
1530 static int ip_mkroute_input(struct sk_buff *skb,
1531 			    struct fib_result *res,
1532 			    const struct flowi4 *fl4,
1533 			    struct in_device *in_dev,
1534 			    __be32 daddr, __be32 saddr, u32 tos)
1535 {
1536 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1537 	if (res->fi && res->fi->fib_nhs > 1)
1538 		fib_select_multipath(res);
1539 #endif
1540 
1541 	/* create a routing cache entry */
1542 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1543 }
1544 
1545 /*
1546  *	NOTE. We drop all packets that have a local source
1547  *	address, because every properly looped-back packet
1548  *	must have the correct destination already attached by the output routine.
1549  *
1550  *	This approach solves two big problems:
1551  *	1. Non-simplex devices are handled properly.
1552  *	2. IP spoofing attempts are filtered with 100% guarantee.
1553  *	Called with rcu_read_lock().
1554  */
1555 
1556 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1557 			       u8 tos, struct net_device *dev)
1558 {
1559 	struct fib_result res;
1560 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1561 	struct flowi4	fl4;
1562 	unsigned int	flags = 0;
1563 	u32		itag = 0;
1564 	struct rtable	*rth;
1565 	int		err = -EINVAL;
1566 	struct net    *net = dev_net(dev);
1567 	bool do_cache;
1568 
1569 	/* IP on this device is disabled. */
1570 
1571 	if (!in_dev)
1572 		goto out;
1573 
1574 	/* Check for the weirdest martians, which cannot be detected
1575 	   by fib_lookup.
1576 	 */
1577 
1578 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1579 		goto martian_source;
1580 
1581 	res.fi = NULL;
1582 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1583 		goto brd_input;
1584 
1585 	/* Accept zero addresses only for limited broadcast;
1586 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1587 	 */
1588 	if (ipv4_is_zeronet(saddr))
1589 		goto martian_source;
1590 
1591 	if (ipv4_is_zeronet(daddr))
1592 		goto martian_destination;
1593 
1594 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
1595 		if (ipv4_is_loopback(daddr))
1596 			goto martian_destination;
1597 
1598 		if (ipv4_is_loopback(saddr))
1599 			goto martian_source;
1600 	}
1601 
1602 	/*
1603 	 *	Now we are ready to route packet.
1604 	 */
1605 	fl4.flowi4_oif = 0;
1606 	fl4.flowi4_iif = dev->ifindex;
1607 	fl4.flowi4_mark = skb->mark;
1608 	fl4.flowi4_tos = tos;
1609 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1610 	fl4.daddr = daddr;
1611 	fl4.saddr = saddr;
1612 	err = fib_lookup(net, &fl4, &res);
1613 	if (err != 0)
1614 		goto no_route;
1615 
1616 	RT_CACHE_STAT_INC(in_slow_tot);
1617 
1618 	if (res.type == RTN_BROADCAST)
1619 		goto brd_input;
1620 
1621 	if (res.type == RTN_LOCAL) {
1622 		err = fib_validate_source(skb, saddr, daddr, tos,
1623 					  net->loopback_dev->ifindex,
1624 					  dev, in_dev, &itag);
1625 		if (err < 0)
1626 			goto martian_source_keep_err;
1627 		goto local_input;
1628 	}
1629 
1630 	if (!IN_DEV_FORWARD(in_dev))
1631 		goto no_route;
1632 	if (res.type != RTN_UNICAST)
1633 		goto martian_destination;
1634 
1635 	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1636 out:	return err;
1637 
1638 brd_input:
1639 	if (skb->protocol != htons(ETH_P_IP))
1640 		goto e_inval;
1641 
1642 	if (!ipv4_is_zeronet(saddr)) {
1643 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1644 					  in_dev, &itag);
1645 		if (err < 0)
1646 			goto martian_source_keep_err;
1647 	}
1648 	flags |= RTCF_BROADCAST;
1649 	res.type = RTN_BROADCAST;
1650 	RT_CACHE_STAT_INC(in_brd);
1651 
1652 local_input:
1653 	do_cache = false;
1654 	if (res.fi) {
1655 		if (!itag) {
1656 			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1657 			if (rt_cache_valid(rth)) {
1658 				skb_dst_set_noref(skb, &rth->dst);
1659 				err = 0;
1660 				goto out;
1661 			}
1662 			do_cache = true;
1663 		}
1664 	}
1665 
1666 	rth = rt_dst_alloc(net->loopback_dev,
1667 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1668 	if (!rth)
1669 		goto e_nobufs;
1670 
1671 	rth->dst.input= ip_local_deliver;
1672 	rth->dst.output= ip_rt_bug;
1673 #ifdef CONFIG_IP_ROUTE_CLASSID
1674 	rth->dst.tclassid = itag;
1675 #endif
1676 
1677 	rth->rt_genid = rt_genid(net);
1678 	rth->rt_flags 	= flags|RTCF_LOCAL;
1679 	rth->rt_type	= res.type;
1680 	rth->rt_is_input = 1;
1681 	rth->rt_iif	= 0;
1682 	rth->rt_pmtu	= 0;
1683 	rth->rt_gateway	= 0;
1684 	INIT_LIST_HEAD(&rth->rt_uncached);
1685 	if (res.type == RTN_UNREACHABLE) {
1686 		rth->dst.input= ip_error;
1687 		rth->dst.error= -err;
1688 		rth->rt_flags 	&= ~RTCF_LOCAL;
1689 	}
1690 	if (do_cache)
1691 		rt_cache_route(&FIB_RES_NH(res), rth);
1692 	skb_dst_set(skb, &rth->dst);
1693 	err = 0;
1694 	goto out;
1695 
1696 no_route:
1697 	RT_CACHE_STAT_INC(in_no_route);
1698 	res.type = RTN_UNREACHABLE;
1699 	if (err == -ESRCH)
1700 		err = -ENETUNREACH;
1701 	goto local_input;
1702 
1703 	/*
1704 	 *	Do not cache martian addresses: they should be logged (RFC1812)
1705 	 */
1706 martian_destination:
1707 	RT_CACHE_STAT_INC(in_martian_dst);
1708 #ifdef CONFIG_IP_ROUTE_VERBOSE
1709 	if (IN_DEV_LOG_MARTIANS(in_dev))
1710 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1711 				     &daddr, &saddr, dev->name);
1712 #endif
1713 
1714 e_inval:
1715 	err = -EINVAL;
1716 	goto out;
1717 
1718 e_nobufs:
1719 	err = -ENOBUFS;
1720 	goto out;
1721 
1722 martian_source:
1723 	err = -EINVAL;
1724 martian_source_keep_err:
1725 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1726 	goto out;
1727 }
1728 
1729 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1730 			 u8 tos, struct net_device *dev)
1731 {
1732 	int res;
1733 
1734 	rcu_read_lock();
1735 
1736 	/* Multicast recognition logic is moved from the route cache to here.
1737 	   The problem was that too many Ethernet cards have broken/missing
1738 	   hardware multicast filters :-( As a result, a host on a multicast
1739 	   network acquires a lot of useless route cache entries, e.g. from
1740 	   SDR messages from all over the world. Now we try to get rid of them.
1741 	   Really, provided the software IP multicast filter is organized
1742 	   reasonably (at least, hashed), it does not result in a slowdown
1743 	   compared with route cache reject entries.
1744 	   Note that multicast routers are not affected, because a
1745 	   route cache entry is created eventually.
1746 	 */
1747 	if (ipv4_is_multicast(daddr)) {
1748 		struct in_device *in_dev = __in_dev_get_rcu(dev);
1749 
1750 		if (in_dev) {
1751 			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1752 						  ip_hdr(skb)->protocol);
1753 			if (our
1754 #ifdef CONFIG_IP_MROUTE
1755 				||
1756 			    (!ipv4_is_local_multicast(daddr) &&
1757 			     IN_DEV_MFORWARD(in_dev))
1758 #endif
1759 			   ) {
1760 				int res = ip_route_input_mc(skb, daddr, saddr,
1761 							    tos, dev, our);
1762 				rcu_read_unlock();
1763 				return res;
1764 			}
1765 		}
1766 		rcu_read_unlock();
1767 		return -EINVAL;
1768 	}
1769 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1770 	rcu_read_unlock();
1771 	return res;
1772 }
1773 EXPORT_SYMBOL(ip_route_input_noref);
1774 
1775 /* called with rcu_read_lock() */
1776 static struct rtable *__mkroute_output(const struct fib_result *res,
1777 				       const struct flowi4 *fl4, int orig_oif,
1778 				       struct net_device *dev_out,
1779 				       unsigned int flags)
1780 {
1781 	struct fib_info *fi = res->fi;
1782 	struct fib_nh_exception *fnhe;
1783 	struct in_device *in_dev;
1784 	u16 type = res->type;
1785 	struct rtable *rth;
1786 
1787 	in_dev = __in_dev_get_rcu(dev_out);
1788 	if (!in_dev)
1789 		return ERR_PTR(-EINVAL);
1790 
1791 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1792 		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1793 			return ERR_PTR(-EINVAL);
1794 
1795 	if (ipv4_is_lbcast(fl4->daddr))
1796 		type = RTN_BROADCAST;
1797 	else if (ipv4_is_multicast(fl4->daddr))
1798 		type = RTN_MULTICAST;
1799 	else if (ipv4_is_zeronet(fl4->daddr))
1800 		return ERR_PTR(-EINVAL);
1801 
1802 	if (dev_out->flags & IFF_LOOPBACK)
1803 		flags |= RTCF_LOCAL;
1804 
1805 	if (type == RTN_BROADCAST) {
1806 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
1807 		fi = NULL;
1808 	} else if (type == RTN_MULTICAST) {
1809 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
1810 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1811 				     fl4->flowi4_proto))
1812 			flags &= ~RTCF_LOCAL;
1813 		/* If a multicast route does not exist, use
1814 		 * the default one, but do not gateway in this case.
1815 		 * Yes, it is a hack.
1816 		 */
1817 		if (fi && res->prefixlen < 4)
1818 			fi = NULL;
1819 	}
1820 
1821 	fnhe = NULL;
1822 	if (fi) {
1823 		struct rtable __rcu **prth;
1824 
1825 		fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1826 		if (fnhe)
1827 			prth = &fnhe->fnhe_rth;
1828 		else
1829 			prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
1830 		rth = rcu_dereference(*prth);
1831 		if (rt_cache_valid(rth)) {
1832 			dst_hold(&rth->dst);
1833 			return rth;
1834 		}
1835 	}
1836 	rth = rt_dst_alloc(dev_out,
1837 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1838 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
1839 			   fi);
1840 	if (!rth)
1841 		return ERR_PTR(-ENOBUFS);
1842 
1843 	rth->dst.output = ip_output;
1844 
1845 	rth->rt_genid = rt_genid(dev_net(dev_out));
1846 	rth->rt_flags	= flags;
1847 	rth->rt_type	= type;
1848 	rth->rt_is_input = 0;
1849 	rth->rt_iif	= orig_oif ? : 0;
1850 	rth->rt_pmtu	= 0;
1851 	rth->rt_gateway = 0;
1852 	INIT_LIST_HEAD(&rth->rt_uncached);
1853 
1854 	RT_CACHE_STAT_INC(out_slow_tot);
1855 
1856 	if (flags & RTCF_LOCAL)
1857 		rth->dst.input = ip_local_deliver;
1858 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1859 		if (flags & RTCF_LOCAL &&
1860 		    !(dev_out->flags & IFF_LOOPBACK)) {
1861 			rth->dst.output = ip_mc_output;
1862 			RT_CACHE_STAT_INC(out_slow_mc);
1863 		}
1864 #ifdef CONFIG_IP_MROUTE
1865 		if (type == RTN_MULTICAST) {
1866 			if (IN_DEV_MFORWARD(in_dev) &&
1867 			    !ipv4_is_local_multicast(fl4->daddr)) {
1868 				rth->dst.input = ip_mr_input;
1869 				rth->dst.output = ip_mc_output;
1870 			}
1871 		}
1872 #endif
1873 	}
1874 
1875 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1876 
1877 	return rth;
1878 }
1879 
1880 /*
1881  * Major route resolver routine.
1882  */
1883 
1884 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1885 {
1886 	struct net_device *dev_out = NULL;
1887 	__u8 tos = RT_FL_TOS(fl4);
1888 	unsigned int flags = 0;
1889 	struct fib_result res;
1890 	struct rtable *rth;
1891 	int orig_oif;
1892 
1893 	res.tclassid	= 0;
1894 	res.fi		= NULL;
1895 	res.table	= NULL;
1896 
1897 	orig_oif = fl4->flowi4_oif;
1898 
1899 	fl4->flowi4_iif = net->loopback_dev->ifindex;
1900 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1901 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1902 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1903 
1904 	rcu_read_lock();
1905 	if (fl4->saddr) {
1906 		rth = ERR_PTR(-EINVAL);
1907 		if (ipv4_is_multicast(fl4->saddr) ||
1908 		    ipv4_is_lbcast(fl4->saddr) ||
1909 		    ipv4_is_zeronet(fl4->saddr))
1910 			goto out;
1911 
1912 		/* I removed the check for oif == dev_out->oif here.
1913 		   It was wrong for two reasons:
1914 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
1915 		      is assigned to multiple interfaces.
1916 		   2. Moreover, we are allowed to send packets with the saddr
1917 		      of another iface. --ANK
1918 		 */
1919 
1920 		if (fl4->flowi4_oif == 0 &&
1921 		    (ipv4_is_multicast(fl4->daddr) ||
1922 		     ipv4_is_lbcast(fl4->daddr))) {
1923 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1924 			dev_out = __ip_dev_find(net, fl4->saddr, false);
1925 			if (dev_out == NULL)
1926 				goto out;
1927 
1928 			/* Special hack: the user can direct multicasts
1929 			   and limited broadcast via the necessary interface
1930 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1931 			   This hack is not just for fun; it allows
1932 			   vic, vat and friends to work.
1933 			   They bind a socket to loopback, set the ttl to zero
1934 			   and expect that it will work.
1935 			   From the viewpoint of the routing cache they are broken,
1936 			   because we are not allowed to build a multicast path
1937 			   with a loopback source addr (the routing cache
1938 			   cannot know that the ttl is zero, so the packet
1939 			   will not leave this host and the route looks valid).
1940 			   Luckily, this hack is a good workaround.
1941 			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}

	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		/* Pick a source address only if the caller did not
		 * supply one. */
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		res.table = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, the routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface even
			   if it has NO routes and NO assigned addresses.
			   When oif is specified, the routing tables are
			   looked up with only one purpose: to catch whether
			   the destination is gatewayed rather than direct.
			   Moreover, if MSG_DONTROUTE is set, we send the
			   packet ignoring both the routing tables and the
			   ifaddr state. --ANK

			   We could do this even if oif is unknown, as IPv6
			   likely does, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;

make_route:
	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);

out:
	rcu_read_unlock();
	return rth;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
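
/* Illustrative sketch only (this helper is hypothetical and not used
 * anywhere in this file): how an in-kernel caller might resolve an
 * output route with the resolver above, letting it choose the source
 * address and output device.
 */
static inline int example_output_lookup(struct net *net, __be32 daddr)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;	/* everything else left for the resolver */

	rt = __ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	/* On success, fl4.saddr and fl4.flowi4_oif have been filled in. */
	ip_rt_put(rt);		/* drop the reference we were handed */
	return 0;
}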

static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}

static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.redirect		=	ipv4_rt_blackhole_redirect,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};

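/* Clone an existing route into a "blackhole" dst: the copy keeps the
 * original's identifying fields, but silently discards every packet
 * sent through it (the ops above and dst_discard below are no-ops).
 * Callers such as the xfrm layer use this when they must hand back a
 * valid dst even though the real route cannot currently be used.
 */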
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gateway = ort->rt_gateway;

		INIT_LIST_HEAD(&rt->rt_uncached);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}

struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
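
/* Illustrative sketch only (hypothetical helper, not used in this
 * file): how a connection-oriented caller might resolve its flow via
 * ip_route_output_flow().  Because flowi4_proto is set, the xfrm
 * lookup above may replace the plain route with an IPsec-transformed
 * dst.  The parameter choices here are assumptions for the example.
 */
static inline struct rtable *example_connect_route(struct sock *sk,
						   __be32 daddr, __be16 dport,
						   __be16 sport)
{
	struct flowi4 fl4;

	flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   0 /* tos */, RT_SCOPE_UNIVERSE, IPPROTO_TCP,
			   0 /* flags */, daddr, 0 /* saddr: let it pick */,
			   dport, sport);
	return ip_route_output_flow(sock_net(sk), &fl4, sk);
}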

static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
			struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
			u32 seq, int event, int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	if (nla_put_be32(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_be32(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_gateway &&
	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	error = rt->dst.error;
	expires = rt->dst.expires;
	if (expires) {
		if (time_before(jiffies, expires))
			expires -= jiffies;
		else
			expires = 0;
	}

	if (rt_is_input_route(rt)) {
		if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
			goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

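/* Handler for RTM_GETROUTE requests (this is what "ip route get"
 * sends): perform a one-shot input or output route lookup and return
 * the result to the requesting socket via rt_fill_info().
 */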
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, dst, src, &fl4, skb,
			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}

#ifdef CONFIG_SYSCTL
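/* Writing an integer to /proc/sys/net/ipv4/route/flush flushes the
 * route cache; the value written is passed to rt_cache_flush() as the
 * delay, e.g. "echo 0 > /proc/sys/net/ipv4/route/flush" for an
 * immediate flush.  Reads fail, since the file is write-only (0200).
 */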
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}

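/* Tunables exported under /proc/sys/net/ipv4/route/.  All are plain
 * integers; the proc_dointvec_jiffies handlers convert between
 * seconds and jiffies (milliseconds for gc_min_interval_ms).
 */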
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

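/* Each namespace gets random generation ids at init time; bumping
 * rt_genid later (rt_cache_flush() does this) invalidates every
 * cached route in that namespace in one step.
 */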
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif