xref: /linux/net/ipv4/route.c (revision d0b73b488c55df905ea8faaad079f8535629ed26)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD
35  *					though our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <net/dst.h>
93 #include <net/net_namespace.h>
94 #include <net/protocol.h>
95 #include <net/ip.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
98 #include <net/sock.h>
99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #include <linux/kmemleak.h>
109 #endif
110 #include <net/secure_seq.h>
111 
112 #define RT_FL_TOS(oldflp4) \
113 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114 
115 #define IP_MAX_MTU	0xFFF0
116 
117 #define RT_GC_TIMEOUT (300*HZ)
118 
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
123 static int ip_rt_redirect_number __read_mostly	= 9;
124 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly	= HZ;
127 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly	= 8;
129 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly	= 256;
132 
133 /*
134  *	Interface to generic destination cache.
135  */
136 
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void		 ipv4_link_failure(struct sk_buff *skb);
142 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 					   struct sk_buff *skb, u32 mtu);
144 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145 					struct sk_buff *skb);
146 static void		ipv4_dst_destroy(struct dst_entry *dst);
147 
148 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
149 			    int how)
150 {
151 }
152 
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155 	WARN_ON(1);
156 	return NULL;
157 }
158 
159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
160 					   struct sk_buff *skb,
161 					   const void *daddr);
162 
163 static struct dst_ops ipv4_dst_ops = {
164 	.family =		AF_INET,
165 	.protocol =		cpu_to_be16(ETH_P_IP),
166 	.check =		ipv4_dst_check,
167 	.default_advmss =	ipv4_default_advmss,
168 	.mtu =			ipv4_mtu,
169 	.cow_metrics =		ipv4_cow_metrics,
170 	.destroy =		ipv4_dst_destroy,
171 	.ifdown =		ipv4_dst_ifdown,
172 	.negative_advice =	ipv4_negative_advice,
173 	.link_failure =		ipv4_link_failure,
174 	.update_pmtu =		ip_rt_update_pmtu,
175 	.redirect =		ip_do_redirect,
176 	.local_out =		__ip_local_out,
177 	.neigh_lookup =		ipv4_neigh_lookup,
178 };
179 
180 #define ECN_OR_COST(class)	TC_PRIO_##class
181 
182 const __u8 ip_tos2prio[16] = {
183 	TC_PRIO_BESTEFFORT,
184 	ECN_OR_COST(BESTEFFORT),
185 	TC_PRIO_BESTEFFORT,
186 	ECN_OR_COST(BESTEFFORT),
187 	TC_PRIO_BULK,
188 	ECN_OR_COST(BULK),
189 	TC_PRIO_BULK,
190 	ECN_OR_COST(BULK),
191 	TC_PRIO_INTERACTIVE,
192 	ECN_OR_COST(INTERACTIVE),
193 	TC_PRIO_INTERACTIVE,
194 	ECN_OR_COST(INTERACTIVE),
195 	TC_PRIO_INTERACTIVE_BULK,
196 	ECN_OR_COST(INTERACTIVE_BULK),
197 	TC_PRIO_INTERACTIVE_BULK,
198 	ECN_OR_COST(INTERACTIVE_BULK)
199 };
200 EXPORT_SYMBOL(ip_tos2prio);
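
/*
 * A minimal usage sketch (hypothetical helper, not called anywhere in
 * this file): the four TOS bits select one of the 16 entries above.
 * rt_tos2priority() in <net/route.h> performs the same indexing on the
 * real output path.
 */
static inline __u8 __maybe_unused tos2prio_example(u8 tos)
{
	/* e.g. TOS 0x10 (low delay) yields index 8, TC_PRIO_INTERACTIVE */
	return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}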
201 
202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
204 
205 #ifdef CONFIG_PROC_FS
206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207 {
208 	if (*pos)
209 		return NULL;
210 	return SEQ_START_TOKEN;
211 }
212 
213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214 {
215 	++*pos;
216 	return NULL;
217 }
218 
219 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
220 {
221 }
222 
223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
224 {
225 	if (v == SEQ_START_TOKEN)
226 		seq_printf(seq, "%-127s\n",
227 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229 			   "HHUptod\tSpecDst");
230 	return 0;
231 }
232 
233 static const struct seq_operations rt_cache_seq_ops = {
234 	.start  = rt_cache_seq_start,
235 	.next   = rt_cache_seq_next,
236 	.stop   = rt_cache_seq_stop,
237 	.show   = rt_cache_seq_show,
238 };
239 
240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
241 {
242 	return seq_open(file, &rt_cache_seq_ops);
243 }
244 
245 static const struct file_operations rt_cache_seq_fops = {
246 	.owner	 = THIS_MODULE,
247 	.open	 = rt_cache_seq_open,
248 	.read	 = seq_read,
249 	.llseek	 = seq_lseek,
250 	.release = seq_release,
251 };
252 
253 
254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255 {
256 	int cpu;
257 
258 	if (*pos == 0)
259 		return SEQ_START_TOKEN;
260 
261 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
262 		if (!cpu_possible(cpu))
263 			continue;
264 		*pos = cpu+1;
265 		return &per_cpu(rt_cache_stat, cpu);
266 	}
267 	return NULL;
268 }
269 
270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271 {
272 	int cpu;
273 
274 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
275 		if (!cpu_possible(cpu))
276 			continue;
277 		*pos = cpu+1;
278 		return &per_cpu(rt_cache_stat, cpu);
279 	}
280 	return NULL;
281 
282 }
283 
284 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
285 {
286 
287 }
288 
289 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
290 {
291 	struct rt_cache_stat *st = v;
292 
293 	if (v == SEQ_START_TOKEN) {
294 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
295 		return 0;
296 	}
297 
298 	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
299 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
300 		   dst_entries_get_slow(&ipv4_dst_ops),
301 		   st->in_hit,
302 		   st->in_slow_tot,
303 		   st->in_slow_mc,
304 		   st->in_no_route,
305 		   st->in_brd,
306 		   st->in_martian_dst,
307 		   st->in_martian_src,
308 
309 		   st->out_hit,
310 		   st->out_slow_tot,
311 		   st->out_slow_mc,
312 
313 		   st->gc_total,
314 		   st->gc_ignored,
315 		   st->gc_goal_miss,
316 		   st->gc_dst_overflow,
317 		   st->in_hlist_search,
318 		   st->out_hlist_search
319 		);
320 	return 0;
321 }
322 
323 static const struct seq_operations rt_cpu_seq_ops = {
324 	.start  = rt_cpu_seq_start,
325 	.next   = rt_cpu_seq_next,
326 	.stop   = rt_cpu_seq_stop,
327 	.show   = rt_cpu_seq_show,
328 };
329 
330 
331 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
332 {
333 	return seq_open(file, &rt_cpu_seq_ops);
334 }
335 
336 static const struct file_operations rt_cpu_seq_fops = {
337 	.owner	 = THIS_MODULE,
338 	.open	 = rt_cpu_seq_open,
339 	.read	 = seq_read,
340 	.llseek	 = seq_lseek,
341 	.release = seq_release,
342 };
343 
344 #ifdef CONFIG_IP_ROUTE_CLASSID
345 static int rt_acct_proc_show(struct seq_file *m, void *v)
346 {
347 	struct ip_rt_acct *dst, *src;
348 	unsigned int i, j;
349 
350 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351 	if (!dst)
352 		return -ENOMEM;
353 
354 	for_each_possible_cpu(i) {
355 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356 		for (j = 0; j < 256; j++) {
357 			dst[j].o_bytes   += src[j].o_bytes;
358 			dst[j].o_packets += src[j].o_packets;
359 			dst[j].i_bytes   += src[j].i_bytes;
360 			dst[j].i_packets += src[j].i_packets;
361 		}
362 	}
363 
364 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365 	kfree(dst);
366 	return 0;
367 }
368 
369 static int rt_acct_proc_open(struct inode *inode, struct file *file)
370 {
371 	return single_open(file, rt_acct_proc_show, NULL);
372 }
373 
374 static const struct file_operations rt_acct_proc_fops = {
375 	.owner		= THIS_MODULE,
376 	.open		= rt_acct_proc_open,
377 	.read		= seq_read,
378 	.llseek		= seq_lseek,
379 	.release	= single_release,
380 };
381 #endif
382 
383 static int __net_init ip_rt_do_proc_init(struct net *net)
384 {
385 	struct proc_dir_entry *pde;
386 
387 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
388 			&rt_cache_seq_fops);
389 	if (!pde)
390 		goto err1;
391 
392 	pde = proc_create("rt_cache", S_IRUGO,
393 			  net->proc_net_stat, &rt_cpu_seq_fops);
394 	if (!pde)
395 		goto err2;
396 
397 #ifdef CONFIG_IP_ROUTE_CLASSID
398 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
399 	if (!pde)
400 		goto err3;
401 #endif
402 	return 0;
403 
404 #ifdef CONFIG_IP_ROUTE_CLASSID
405 err3:
406 	remove_proc_entry("rt_cache", net->proc_net_stat);
407 #endif
408 err2:
409 	remove_proc_entry("rt_cache", net->proc_net);
410 err1:
411 	return -ENOMEM;
412 }
413 
414 static void __net_exit ip_rt_do_proc_exit(struct net *net)
415 {
416 	remove_proc_entry("rt_cache", net->proc_net_stat);
417 	remove_proc_entry("rt_cache", net->proc_net);
418 #ifdef CONFIG_IP_ROUTE_CLASSID
419 	remove_proc_entry("rt_acct", net->proc_net);
420 #endif
421 }
422 
423 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
424 	.init = ip_rt_do_proc_init,
425 	.exit = ip_rt_do_proc_exit,
426 };
427 
428 static int __init ip_rt_proc_init(void)
429 {
430 	return register_pernet_subsys(&ip_rt_proc_ops);
431 }
432 
433 #else
434 static inline int ip_rt_proc_init(void)
435 {
436 	return 0;
437 }
438 #endif /* CONFIG_PROC_FS */
439 
440 static inline bool rt_is_expired(const struct rtable *rth)
441 {
442 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
443 }
444 
445 void rt_cache_flush(struct net *net)
446 {
447 	rt_genid_bump(net);
448 }
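
/*
 * An illustrative sketch (hypothetical helper, not part of this file):
 * flushing works purely by bumping the namespace generation counter, so
 * any rtable cached before the flush stops matching rt_genid() and is
 * rejected lazily by rt_is_expired()/ipv4_dst_check().
 */
static bool __maybe_unused flush_invalidates_example(struct net *net,
						     const struct rtable *rt)
{
	rt_cache_flush(net);
	return rt_is_expired(rt);	/* true for any pre-flush route of this net */
}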
449 
450 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451 					   struct sk_buff *skb,
452 					   const void *daddr)
453 {
454 	struct net_device *dev = dst->dev;
455 	const __be32 *pkey = daddr;
456 	const struct rtable *rt;
457 	struct neighbour *n;
458 
459 	rt = (const struct rtable *) dst;
460 	if (rt->rt_gateway)
461 		pkey = (const __be32 *) &rt->rt_gateway;
462 	else if (skb)
463 		pkey = &ip_hdr(skb)->daddr;
464 
465 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
466 	if (n)
467 		return n;
468 	return neigh_create(&arp_tbl, pkey, dev);
469 }
470 
471 /*
472  * Peer allocation may fail only in serious out-of-memory conditions.  However,
473  * we can still generate some output.
474  * Random ID selection looks a bit dangerous because we have no chance of
475  * selecting an ID that is unique within a reasonable period of time.
476  * But a broken packet identifier may be better than no packet at all.
477  */
478 static void ip_select_fb_ident(struct iphdr *iph)
479 {
480 	static DEFINE_SPINLOCK(ip_fb_id_lock);
481 	static u32 ip_fallback_id;
482 	u32 salt;
483 
484 	spin_lock_bh(&ip_fb_id_lock);
485 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
486 	iph->id = htons(salt & 0xFFFF);
487 	ip_fallback_id = salt;
488 	spin_unlock_bh(&ip_fb_id_lock);
489 }
490 
491 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
492 {
493 	struct net *net = dev_net(dst->dev);
494 	struct inet_peer *peer;
495 
496 	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
497 	if (peer) {
498 		iph->id = htons(inet_getid(peer, more));
499 		inet_putpeer(peer);
500 		return;
501 	}
502 
503 	ip_select_fb_ident(iph);
504 }
505 EXPORT_SYMBOL(__ip_select_ident);
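
/*
 * A hedged usage sketch (hypothetical caller): fill in the IP ID for a
 * datagram whose route is already attached to the skb.  more == 0 asks
 * for a single identifier; callers emitting several segments at once
 * can reserve a run of consecutive IDs by passing a larger value.
 */
static void __maybe_unused select_ident_example(struct sk_buff *skb)
{
	__ip_select_ident(ip_hdr(skb), skb_dst(skb), 0);
}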
506 
507 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
508 			     const struct iphdr *iph,
509 			     int oif, u8 tos,
510 			     u8 prot, u32 mark, int flow_flags)
511 {
512 	if (sk) {
513 		const struct inet_sock *inet = inet_sk(sk);
514 
515 		oif = sk->sk_bound_dev_if;
516 		mark = sk->sk_mark;
517 		tos = RT_CONN_FLAGS(sk);
518 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
519 	}
520 	flowi4_init_output(fl4, oif, mark, tos,
521 			   RT_SCOPE_UNIVERSE, prot,
522 			   flow_flags,
523 			   iph->daddr, iph->saddr, 0, 0);
524 }
525 
526 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
527 			       const struct sock *sk)
528 {
529 	const struct iphdr *iph = ip_hdr(skb);
530 	int oif = skb->dev->ifindex;
531 	u8 tos = RT_TOS(iph->tos);
532 	u8 prot = iph->protocol;
533 	u32 mark = skb->mark;
534 
535 	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
536 }
537 
538 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
539 {
540 	const struct inet_sock *inet = inet_sk(sk);
541 	const struct ip_options_rcu *inet_opt;
542 	__be32 daddr = inet->inet_daddr;
543 
544 	rcu_read_lock();
545 	inet_opt = rcu_dereference(inet->inet_opt);
546 	if (inet_opt && inet_opt->opt.srr)
547 		daddr = inet_opt->opt.faddr;
548 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
549 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
550 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
551 			   inet_sk_flowi_flags(sk),
552 			   daddr, inet->inet_saddr, 0, 0);
553 	rcu_read_unlock();
554 }
555 
556 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
557 				 const struct sk_buff *skb)
558 {
559 	if (skb)
560 		build_skb_flow_key(fl4, skb, sk);
561 	else
562 		build_sk_flow_key(fl4, sk);
563 }
564 
565 static inline void rt_free(struct rtable *rt)
566 {
567 	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
568 }
569 
570 static DEFINE_SPINLOCK(fnhe_lock);
571 
572 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
573 {
574 	struct fib_nh_exception *fnhe, *oldest;
575 	struct rtable *orig;
576 
577 	oldest = rcu_dereference(hash->chain);
578 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
579 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
580 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
581 			oldest = fnhe;
582 	}
583 	orig = rcu_dereference(oldest->fnhe_rth);
584 	if (orig) {
585 		RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
586 		rt_free(orig);
587 	}
588 	return oldest;
589 }
590 
591 static inline u32 fnhe_hashfun(__be32 daddr)
592 {
593 	u32 hval;
594 
595 	hval = (__force u32) daddr;
596 	hval ^= (hval >> 11) ^ (hval >> 22);
597 
598 	return hval & (FNHE_HASH_SIZE - 1);
599 }
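
/*
 * Worked example (illustrative, not used by the code): the 32-bit
 * destination is folded onto itself so every byte influences the bucket
 * index, then masked down to FNHE_HASH_SIZE buckets (2048 at this
 * revision, matching the 11-bit shifts above).
 */
static u32 __maybe_unused fnhe_hash_example(void)
{
	__be32 daddr = htonl(0xc0000201);	/* 192.0.2.1 (TEST-NET-1) */

	return fnhe_hashfun(daddr);		/* index in [0, FNHE_HASH_SIZE) */
}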
600 
601 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
602 				  u32 pmtu, unsigned long expires)
603 {
604 	struct fnhe_hash_bucket *hash;
605 	struct fib_nh_exception *fnhe;
606 	int depth;
607 	u32 hval = fnhe_hashfun(daddr);
608 
609 	spin_lock_bh(&fnhe_lock);
610 
611 	hash = nh->nh_exceptions;
612 	if (!hash) {
613 		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
614 		if (!hash)
615 			goto out_unlock;
616 		nh->nh_exceptions = hash;
617 	}
618 
619 	hash += hval;
620 
621 	depth = 0;
622 	for (fnhe = rcu_dereference(hash->chain); fnhe;
623 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
624 		if (fnhe->fnhe_daddr == daddr)
625 			break;
626 		depth++;
627 	}
628 
629 	if (fnhe) {
630 		if (gw)
631 			fnhe->fnhe_gw = gw;
632 		if (pmtu) {
633 			fnhe->fnhe_pmtu = pmtu;
634 			fnhe->fnhe_expires = expires;
635 		}
636 	} else {
637 		if (depth > FNHE_RECLAIM_DEPTH)
638 			fnhe = fnhe_oldest(hash);
639 		else {
640 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
641 			if (!fnhe)
642 				goto out_unlock;
643 
644 			fnhe->fnhe_next = hash->chain;
645 			rcu_assign_pointer(hash->chain, fnhe);
646 		}
647 		fnhe->fnhe_daddr = daddr;
648 		fnhe->fnhe_gw = gw;
649 		fnhe->fnhe_pmtu = pmtu;
650 		fnhe->fnhe_expires = expires;
651 	}
652 
653 	fnhe->fnhe_stamp = jiffies;
654 
655 out_unlock:
656 	spin_unlock_bh(&fnhe_lock);
657 	return;
658 }
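
/*
 * A minimal caller sketch (hypothetical helper), mirroring what
 * __ip_rt_update_pmtu() below does: record a learned PMTU of 1400 for
 * daddr on this nexthop.  gw == 0 leaves any previously learned gateway
 * untouched, and the entry ages out after ip_rt_mtu_expires (10 minutes
 * by default).
 */
static void __maybe_unused fnhe_learn_example(struct fib_nh *nh, __be32 daddr)
{
	update_or_create_fnhe(nh, daddr, 0, 1400,
			      jiffies + ip_rt_mtu_expires);
}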
659 
660 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
661 			     bool kill_route)
662 {
663 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
664 	__be32 old_gw = ip_hdr(skb)->saddr;
665 	struct net_device *dev = skb->dev;
666 	struct in_device *in_dev;
667 	struct fib_result res;
668 	struct neighbour *n;
669 	struct net *net;
670 
671 	switch (icmp_hdr(skb)->code & 7) {
672 	case ICMP_REDIR_NET:
673 	case ICMP_REDIR_NETTOS:
674 	case ICMP_REDIR_HOST:
675 	case ICMP_REDIR_HOSTTOS:
676 		break;
677 
678 	default:
679 		return;
680 	}
681 
682 	if (rt->rt_gateway != old_gw)
683 		return;
684 
685 	in_dev = __in_dev_get_rcu(dev);
686 	if (!in_dev)
687 		return;
688 
689 	net = dev_net(dev);
690 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
691 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
692 	    ipv4_is_zeronet(new_gw))
693 		goto reject_redirect;
694 
695 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
696 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
697 			goto reject_redirect;
698 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
699 			goto reject_redirect;
700 	} else {
701 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
702 			goto reject_redirect;
703 	}
704 
705 	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
706 	if (n) {
707 		if (!(n->nud_state & NUD_VALID)) {
708 			neigh_event_send(n, NULL);
709 		} else {
710 			if (fib_lookup(net, fl4, &res) == 0) {
711 				struct fib_nh *nh = &FIB_RES_NH(res);
712 
713 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
714 						      0, 0);
715 			}
716 			if (kill_route)
717 				rt->dst.obsolete = DST_OBSOLETE_KILL;
718 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
719 		}
720 		neigh_release(n);
721 	}
722 	return;
723 
724 reject_redirect:
725 #ifdef CONFIG_IP_ROUTE_VERBOSE
726 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
727 		const struct iphdr *iph = (const struct iphdr *) skb->data;
728 		__be32 daddr = iph->daddr;
729 		__be32 saddr = iph->saddr;
730 
731 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
732 				     "  Advised path = %pI4 -> %pI4\n",
733 				     &old_gw, dev->name, &new_gw,
734 				     &saddr, &daddr);
735 	}
736 #endif
737 	;
738 }
739 
740 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
741 {
742 	struct rtable *rt;
743 	struct flowi4 fl4;
744 
745 	rt = (struct rtable *) dst;
746 
747 	ip_rt_build_flow_key(&fl4, sk, skb);
748 	__ip_do_redirect(rt, skb, &fl4, true);
749 }
750 
751 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
752 {
753 	struct rtable *rt = (struct rtable *)dst;
754 	struct dst_entry *ret = dst;
755 
756 	if (rt) {
757 		if (dst->obsolete > 0) {
758 			ip_rt_put(rt);
759 			ret = NULL;
760 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
761 			   rt->dst.expires) {
762 			ip_rt_put(rt);
763 			ret = NULL;
764 		}
765 	}
766 	return ret;
767 }
768 
769 /*
770  * Algorithm:
771  *	1. The first ip_rt_redirect_number redirects are sent
772  *	   with exponential backoff, then we stop sending them at all,
773  *	   assuming that the host ignores our redirects.
774  *	2. If we did not see packets requiring redirects
775  *	   during ip_rt_redirect_silence, we assume that the host
776  *	   has forgotten the redirected route, and we start sending redirects again.
777  *
778  * This algorithm is much cheaper and more intelligent than dumb load limiting
779  * in icmp.c.
780  *
781  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
782  * and "frag. need" (breaks PMTU discovery) in icmp.c.
783  */
784 
785 void ip_rt_send_redirect(struct sk_buff *skb)
786 {
787 	struct rtable *rt = skb_rtable(skb);
788 	struct in_device *in_dev;
789 	struct inet_peer *peer;
790 	struct net *net;
791 	int log_martians;
792 
793 	rcu_read_lock();
794 	in_dev = __in_dev_get_rcu(rt->dst.dev);
795 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
796 		rcu_read_unlock();
797 		return;
798 	}
799 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
800 	rcu_read_unlock();
801 
802 	net = dev_net(rt->dst.dev);
803 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
804 	if (!peer) {
805 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
806 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
807 		return;
808 	}
809 
810 	/* No redirected packets during ip_rt_redirect_silence;
811 	 * reset the algorithm.
812 	 */
813 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
814 		peer->rate_tokens = 0;
815 
816 	/* Too many ignored redirects; do not send anything.
817 	 * Set peer->rate_last to the last seen redirected packet.
818 	 */
819 	if (peer->rate_tokens >= ip_rt_redirect_number) {
820 		peer->rate_last = jiffies;
821 		goto out_put_peer;
822 	}
823 
824 	/* Check for load limit; set rate_last to the latest sent
825 	 * redirect.
826 	 */
827 	if (peer->rate_tokens == 0 ||
828 	    time_after(jiffies,
829 		       (peer->rate_last +
830 			(ip_rt_redirect_load << peer->rate_tokens)))) {
831 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
832 
833 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
834 		peer->rate_last = jiffies;
835 		++peer->rate_tokens;
836 #ifdef CONFIG_IP_ROUTE_VERBOSE
837 		if (log_martians &&
838 		    peer->rate_tokens == ip_rt_redirect_number)
839 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
840 					     &ip_hdr(skb)->saddr, inet_iif(skb),
841 					     &ip_hdr(skb)->daddr, &gw);
842 #endif
843 	}
844 out_put_peer:
845 	inet_putpeer(peer);
846 }
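
/*
 * Illustrative arithmetic for the backoff used above (hypothetical
 * helper): with the defaults, redirect k+1 is only sent once
 * (HZ/50) << k jiffies have passed since the last one -- 20 ms, 40 ms,
 * ... about 5.1 s before the 9th -- after which transmission stops
 * until ip_rt_redirect_silence of quiet resets rate_tokens.
 */
static unsigned long __maybe_unused redirect_backoff_example(u32 rate_tokens)
{
	return ip_rt_redirect_load << rate_tokens;
}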
847 
848 static int ip_error(struct sk_buff *skb)
849 {
850 	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
851 	struct rtable *rt = skb_rtable(skb);
852 	struct inet_peer *peer;
853 	unsigned long now;
854 	struct net *net;
855 	bool send;
856 	int code;
857 
858 	net = dev_net(rt->dst.dev);
859 	if (!IN_DEV_FORWARD(in_dev)) {
860 		switch (rt->dst.error) {
861 		case EHOSTUNREACH:
862 			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
863 			break;
864 
865 		case ENETUNREACH:
866 			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
867 			break;
868 		}
869 		goto out;
870 	}
871 
872 	switch (rt->dst.error) {
873 	case EINVAL:
874 	default:
875 		goto out;
876 	case EHOSTUNREACH:
877 		code = ICMP_HOST_UNREACH;
878 		break;
879 	case ENETUNREACH:
880 		code = ICMP_NET_UNREACH;
881 		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
882 		break;
883 	case EACCES:
884 		code = ICMP_PKT_FILTERED;
885 		break;
886 	}
887 
888 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
889 
890 	send = true;
891 	if (peer) {
892 		now = jiffies;
893 		peer->rate_tokens += now - peer->rate_last;
894 		if (peer->rate_tokens > ip_rt_error_burst)
895 			peer->rate_tokens = ip_rt_error_burst;
896 		peer->rate_last = now;
897 		if (peer->rate_tokens >= ip_rt_error_cost)
898 			peer->rate_tokens -= ip_rt_error_cost;
899 		else
900 			send = false;
901 		inet_putpeer(peer);
902 	}
903 	if (send)
904 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
905 
906 out:	kfree_skb(skb);
907 	return 0;
908 }
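
/*
 * The rate limit in ip_error() above is a classic token bucket; a
 * standalone restatement (hypothetical helper, jiffies-based units):
 * tokens accrue with elapsed time, are capped at ip_rt_error_burst
 * (five seconds' worth), and each ICMP error spends ip_rt_error_cost
 * (one second's worth) -- i.e. one error per second sustained, with
 * bursts of up to five.
 */
static bool __maybe_unused error_bucket_consume(unsigned long *tokens,
						unsigned long *last)
{
	unsigned long now = jiffies;

	*tokens = min_t(unsigned long, *tokens + (now - *last),
			(unsigned long)ip_rt_error_burst);
	*last = now;
	if (*tokens < ip_rt_error_cost)
		return false;
	*tokens -= ip_rt_error_cost;
	return true;
}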
909 
910 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
911 {
912 	struct dst_entry *dst = &rt->dst;
913 	struct fib_result res;
914 
915 	if (dst_metric_locked(dst, RTAX_MTU))
916 		return;
917 
918 	if (dst->dev->mtu < mtu)
919 		return;
920 
921 	if (mtu < ip_rt_min_pmtu)
922 		mtu = ip_rt_min_pmtu;
923 
924 	if (!rt->rt_pmtu) {
925 		dst->obsolete = DST_OBSOLETE_KILL;
926 	} else {
927 		rt->rt_pmtu = mtu;
928 		dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
929 	}
930 
931 	rcu_read_lock();
932 	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
933 		struct fib_nh *nh = &FIB_RES_NH(res);
934 
935 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
936 				      jiffies + ip_rt_mtu_expires);
937 	}
938 	rcu_read_unlock();
939 }
940 
941 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
942 			      struct sk_buff *skb, u32 mtu)
943 {
944 	struct rtable *rt = (struct rtable *) dst;
945 	struct flowi4 fl4;
946 
947 	ip_rt_build_flow_key(&fl4, sk, skb);
948 	__ip_rt_update_pmtu(rt, &fl4, mtu);
949 }
950 
951 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
952 		      int oif, u32 mark, u8 protocol, int flow_flags)
953 {
954 	const struct iphdr *iph = (const struct iphdr *) skb->data;
955 	struct flowi4 fl4;
956 	struct rtable *rt;
957 
958 	__build_flow_key(&fl4, NULL, iph, oif,
959 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
960 	rt = __ip_route_output_key(net, &fl4);
961 	if (!IS_ERR(rt)) {
962 		__ip_rt_update_pmtu(rt, &fl4, mtu);
963 		ip_rt_put(rt);
964 	}
965 }
966 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
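
/*
 * A hedged caller sketch (oif, mark and protocol are placeholders):
 * roughly how a tunnel or protocol error handler reacts to an ICMP
 * FRAG_NEEDED message carrying a next-hop MTU -- rebuild the flow from
 * the embedded header and let __ip_rt_update_pmtu() record it.
 */
static void __maybe_unused pmtu_event_example(struct sk_buff *icmp_skb,
					      struct net *net, u32 new_mtu)
{
	ipv4_update_pmtu(icmp_skb, net, new_mtu, 0 /* oif */, 0 /* mark */,
			 IPPROTO_UDP /* placeholder */, 0);
}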
967 
968 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
969 {
970 	const struct iphdr *iph = (const struct iphdr *) skb->data;
971 	struct flowi4 fl4;
972 	struct rtable *rt;
973 
974 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
975 	rt = __ip_route_output_key(sock_net(sk), &fl4);
976 	if (!IS_ERR(rt)) {
977 		__ip_rt_update_pmtu(rt, &fl4, mtu);
978 		ip_rt_put(rt);
979 	}
980 }
981 
982 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
983 {
984 	const struct iphdr *iph = (const struct iphdr *) skb->data;
985 	struct flowi4 fl4;
986 	struct rtable *rt;
987 	struct dst_entry *dst;
988 	bool new = false;
989 
990 	bh_lock_sock(sk);
991 	rt = (struct rtable *) __sk_dst_get(sk);
992 
993 	if (sock_owned_by_user(sk) || !rt) {
994 		__ipv4_sk_update_pmtu(skb, sk, mtu);
995 		goto out;
996 	}
997 
998 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
999 
1000 	if (!__sk_dst_check(sk, 0)) {
1001 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1002 		if (IS_ERR(rt))
1003 			goto out;
1004 
1005 		new = true;
1006 	}
1007 
1008 	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1009 
1010 	dst = dst_check(&rt->dst, 0);
1011 	if (!dst) {
1012 		if (new)
1013 			dst_release(&rt->dst);
1014 
1015 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1016 		if (IS_ERR(rt))
1017 			goto out;
1018 
1019 		new = true;
1020 	}
1021 
1022 	if (new)
1023 		__sk_dst_set(sk, &rt->dst);
1024 
1025 out:
1026 	bh_unlock_sock(sk);
1027 }
1028 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1029 
1030 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1031 		   int oif, u32 mark, u8 protocol, int flow_flags)
1032 {
1033 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1034 	struct flowi4 fl4;
1035 	struct rtable *rt;
1036 
1037 	__build_flow_key(&fl4, NULL, iph, oif,
1038 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1039 	rt = __ip_route_output_key(net, &fl4);
1040 	if (!IS_ERR(rt)) {
1041 		__ip_do_redirect(rt, skb, &fl4, false);
1042 		ip_rt_put(rt);
1043 	}
1044 }
1045 EXPORT_SYMBOL_GPL(ipv4_redirect);
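
/*
 * A hedged caller sketch (placeholder protocol/mark): on an
 * ICMP_REDIRECT aimed at encapsulated traffic, re-resolve the route for
 * the embedded header and let __ip_do_redirect() record the new
 * gateway.  kill_route is false on this path, so the route is updated
 * rather than discarded.
 */
static void __maybe_unused redirect_event_example(struct sk_buff *icmp_skb,
						  struct net *net)
{
	ipv4_redirect(icmp_skb, net, 0 /* oif */, 0 /* mark */,
		      IPPROTO_IPIP /* placeholder */, 0);
}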
1046 
1047 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1048 {
1049 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1050 	struct flowi4 fl4;
1051 	struct rtable *rt;
1052 
1053 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1054 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1055 	if (!IS_ERR(rt)) {
1056 		__ip_do_redirect(rt, skb, &fl4, false);
1057 		ip_rt_put(rt);
1058 	}
1059 }
1060 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1061 
1062 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1063 {
1064 	struct rtable *rt = (struct rtable *) dst;
1065 
1066 	/* All IPV4 dsts are created with ->obsolete set to the value
1067 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1068 	 * into this function always.
1069 	 *
1070 	 * When a PMTU/redirect information update invalidates a
1071 	 * route, this is indicated by setting obsolete to
1072 	 * DST_OBSOLETE_KILL.
1073 	 */
1074 	if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1075 		return NULL;
1076 	return dst;
1077 }
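
/*
 * A usage sketch (hypothetical helper): because ->obsolete is always
 * non-zero for these routes, dst_check() from <net/dst.h> funnels into
 * ipv4_dst_check() above; a NULL result tells the caller to drop its
 * reference and perform a fresh route lookup.
 */
static struct rtable *__maybe_unused revalidate_example(struct rtable *rt)
{
	return (struct rtable *)dst_check(&rt->dst, 0);
}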
1078 
1079 static void ipv4_link_failure(struct sk_buff *skb)
1080 {
1081 	struct rtable *rt;
1082 
1083 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1084 
1085 	rt = skb_rtable(skb);
1086 	if (rt)
1087 		dst_set_expires(&rt->dst, 0);
1088 }
1089 
1090 static int ip_rt_bug(struct sk_buff *skb)
1091 {
1092 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1093 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1094 		 skb->dev ? skb->dev->name : "?");
1095 	kfree_skb(skb);
1096 	WARN_ON(1);
1097 	return 0;
1098 }
1099 
1100 /*
1101    We do not cache the source address of the outgoing interface,
1102    because it is used only by IP RR, TS and SRR options,
1103    so it is out of the fast path.
1104 
1105    BTW remember: "addr" is allowed to be unaligned
1106    in IP options!
1107  */
1108 
1109 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1110 {
1111 	__be32 src;
1112 
1113 	if (rt_is_output_route(rt))
1114 		src = ip_hdr(skb)->saddr;
1115 	else {
1116 		struct fib_result res;
1117 		struct flowi4 fl4;
1118 		struct iphdr *iph;
1119 
1120 		iph = ip_hdr(skb);
1121 
1122 		memset(&fl4, 0, sizeof(fl4));
1123 		fl4.daddr = iph->daddr;
1124 		fl4.saddr = iph->saddr;
1125 		fl4.flowi4_tos = RT_TOS(iph->tos);
1126 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1127 		fl4.flowi4_iif = skb->dev->ifindex;
1128 		fl4.flowi4_mark = skb->mark;
1129 
1130 		rcu_read_lock();
1131 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1132 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1133 		else
1134 			src = inet_select_addr(rt->dst.dev,
1135 					       rt_nexthop(rt, iph->daddr),
1136 					       RT_SCOPE_UNIVERSE);
1137 		rcu_read_unlock();
1138 	}
1139 	memcpy(addr, &src, 4);
1140 }
1141 
1142 #ifdef CONFIG_IP_ROUTE_CLASSID
1143 static void set_class_tag(struct rtable *rt, u32 tag)
1144 {
1145 	if (!(rt->dst.tclassid & 0xFFFF))
1146 		rt->dst.tclassid |= tag & 0xFFFF;
1147 	if (!(rt->dst.tclassid & 0xFFFF0000))
1148 		rt->dst.tclassid |= tag & 0xFFFF0000;
1149 }
1150 #endif
1151 
1152 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1153 {
1154 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1155 
1156 	if (advmss == 0) {
1157 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1158 			       ip_rt_min_advmss);
1159 		if (advmss > 65535 - 40)
1160 			advmss = 65535 - 40;
1161 	}
1162 	return advmss;
1163 }
1164 
1165 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1166 {
1167 	const struct rtable *rt = (const struct rtable *) dst;
1168 	unsigned int mtu = rt->rt_pmtu;
1169 
1170 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1171 		mtu = dst_metric_raw(dst, RTAX_MTU);
1172 
1173 	if (mtu)
1174 		return mtu;
1175 
1176 	mtu = dst->dev->mtu;
1177 
1178 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1179 		if (rt->rt_uses_gateway && mtu > 576)
1180 			mtu = 576;
1181 	}
1182 
1183 	if (mtu > IP_MAX_MTU)
1184 		mtu = IP_MAX_MTU;
1185 
1186 	return mtu;
1187 }
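
/*
 * A minimal sketch of how callers see the result (hypothetical helper):
 * dst_mtu() dispatches here via ipv4_dst_ops.mtu, so the precedence is
 * an unexpired learned PMTU, then the RTAX_MTU metric, then the device
 * MTU (clamped to 576 for locked-metric gatewayed routes and to
 * IP_MAX_MTU overall).
 */
static unsigned int __maybe_unused path_mtu_example(const struct rtable *rt)
{
	return dst_mtu(&rt->dst);
}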
1188 
1189 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1190 {
1191 	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1192 	struct fib_nh_exception *fnhe;
1193 	u32 hval;
1194 
1195 	if (!hash)
1196 		return NULL;
1197 
1198 	hval = fnhe_hashfun(daddr);
1199 
1200 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1201 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1202 		if (fnhe->fnhe_daddr == daddr)
1203 			return fnhe;
1204 	}
1205 	return NULL;
1206 }
1207 
1208 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1209 			      __be32 daddr)
1210 {
1211 	bool ret = false;
1212 
1213 	spin_lock_bh(&fnhe_lock);
1214 
1215 	if (daddr == fnhe->fnhe_daddr) {
1216 		struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
1217 		if (orig && rt_is_expired(orig)) {
1218 			fnhe->fnhe_gw = 0;
1219 			fnhe->fnhe_pmtu = 0;
1220 			fnhe->fnhe_expires = 0;
1221 		}
1222 		if (fnhe->fnhe_pmtu) {
1223 			unsigned long expires = fnhe->fnhe_expires;
1224 			unsigned long diff = expires - jiffies;
1225 
1226 			if (time_before(jiffies, expires)) {
1227 				rt->rt_pmtu = fnhe->fnhe_pmtu;
1228 				dst_set_expires(&rt->dst, diff);
1229 			}
1230 		}
1231 		if (fnhe->fnhe_gw) {
1232 			rt->rt_flags |= RTCF_REDIRECTED;
1233 			rt->rt_gateway = fnhe->fnhe_gw;
1234 			rt->rt_uses_gateway = 1;
1235 		} else if (!rt->rt_gateway)
1236 			rt->rt_gateway = daddr;
1237 
1238 		rcu_assign_pointer(fnhe->fnhe_rth, rt);
1239 		if (orig)
1240 			rt_free(orig);
1241 
1242 		fnhe->fnhe_stamp = jiffies;
1243 		ret = true;
1244 	}
1245 	spin_unlock_bh(&fnhe_lock);
1246 
1247 	return ret;
1248 }
1249 
1250 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1251 {
1252 	struct rtable *orig, *prev, **p;
1253 	bool ret = true;
1254 
1255 	if (rt_is_input_route(rt)) {
1256 		p = (struct rtable **)&nh->nh_rth_input;
1257 	} else {
1258 		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1259 	}
1260 	orig = *p;
1261 
1262 	prev = cmpxchg(p, orig, rt);
1263 	if (prev == orig) {
1264 		if (orig)
1265 			rt_free(orig);
1266 	} else
1267 		ret = false;
1268 
1269 	return ret;
1270 }
1271 
1272 static DEFINE_SPINLOCK(rt_uncached_lock);
1273 static LIST_HEAD(rt_uncached_list);
1274 
1275 static void rt_add_uncached_list(struct rtable *rt)
1276 {
1277 	spin_lock_bh(&rt_uncached_lock);
1278 	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1279 	spin_unlock_bh(&rt_uncached_lock);
1280 }
1281 
1282 static void ipv4_dst_destroy(struct dst_entry *dst)
1283 {
1284 	struct rtable *rt = (struct rtable *) dst;
1285 
1286 	if (!list_empty(&rt->rt_uncached)) {
1287 		spin_lock_bh(&rt_uncached_lock);
1288 		list_del(&rt->rt_uncached);
1289 		spin_unlock_bh(&rt_uncached_lock);
1290 	}
1291 }
1292 
1293 void rt_flush_dev(struct net_device *dev)
1294 {
1295 	if (!list_empty(&rt_uncached_list)) {
1296 		struct net *net = dev_net(dev);
1297 		struct rtable *rt;
1298 
1299 		spin_lock_bh(&rt_uncached_lock);
1300 		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1301 			if (rt->dst.dev != dev)
1302 				continue;
1303 			rt->dst.dev = net->loopback_dev;
1304 			dev_hold(rt->dst.dev);
1305 			dev_put(dev);
1306 		}
1307 		spin_unlock_bh(&rt_uncached_lock);
1308 	}
1309 }
1310 
1311 static bool rt_cache_valid(const struct rtable *rt)
1312 {
1313 	return	rt &&
1314 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1315 		!rt_is_expired(rt);
1316 }
1317 
1318 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1319 			   const struct fib_result *res,
1320 			   struct fib_nh_exception *fnhe,
1321 			   struct fib_info *fi, u16 type, u32 itag)
1322 {
1323 	bool cached = false;
1324 
1325 	if (fi) {
1326 		struct fib_nh *nh = &FIB_RES_NH(*res);
1327 
1328 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1329 			rt->rt_gateway = nh->nh_gw;
1330 			rt->rt_uses_gateway = 1;
1331 		}
1332 		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1333 #ifdef CONFIG_IP_ROUTE_CLASSID
1334 		rt->dst.tclassid = nh->nh_tclassid;
1335 #endif
1336 		if (unlikely(fnhe))
1337 			cached = rt_bind_exception(rt, fnhe, daddr);
1338 		else if (!(rt->dst.flags & DST_NOCACHE))
1339 			cached = rt_cache_route(nh, rt);
1340 		if (unlikely(!cached)) {
1341 			/* Routes we intend to cache in nexthop exception or
1342 			 * FIB nexthop have the DST_NOCACHE bit clear.
1343 			 * However, if we are unsuccessful at storing this
1344 			 * route into the cache we really need to set it.
1345 			 */
1346 			rt->dst.flags |= DST_NOCACHE;
1347 			if (!rt->rt_gateway)
1348 				rt->rt_gateway = daddr;
1349 			rt_add_uncached_list(rt);
1350 		}
1351 	} else
1352 		rt_add_uncached_list(rt);
1353 
1354 #ifdef CONFIG_IP_ROUTE_CLASSID
1355 #ifdef CONFIG_IP_MULTIPLE_TABLES
1356 	set_class_tag(rt, res->tclassid);
1357 #endif
1358 	set_class_tag(rt, itag);
1359 #endif
1360 }
1361 
1362 static struct rtable *rt_dst_alloc(struct net_device *dev,
1363 				   bool nopolicy, bool noxfrm, bool will_cache)
1364 {
1365 	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1366 			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1367 			 (nopolicy ? DST_NOPOLICY : 0) |
1368 			 (noxfrm ? DST_NOXFRM : 0));
1369 }
1370 
1371 /* called in rcu_read_lock() section */
1372 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1373 				u8 tos, struct net_device *dev, int our)
1374 {
1375 	struct rtable *rth;
1376 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1377 	u32 itag = 0;
1378 	int err;
1379 
1380 	/* Primary sanity checks. */
1381 
1382 	if (in_dev == NULL)
1383 		return -EINVAL;
1384 
1385 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1386 	    skb->protocol != htons(ETH_P_IP))
1387 		goto e_inval;
1388 
1389 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1390 		if (ipv4_is_loopback(saddr))
1391 			goto e_inval;
1392 
1393 	if (ipv4_is_zeronet(saddr)) {
1394 		if (!ipv4_is_local_multicast(daddr))
1395 			goto e_inval;
1396 	} else {
1397 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1398 					  in_dev, &itag);
1399 		if (err < 0)
1400 			goto e_err;
1401 	}
1402 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1403 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1404 	if (!rth)
1405 		goto e_nobufs;
1406 
1407 #ifdef CONFIG_IP_ROUTE_CLASSID
1408 	rth->dst.tclassid = itag;
1409 #endif
1410 	rth->dst.output = ip_rt_bug;
1411 
1412 	rth->rt_genid	= rt_genid(dev_net(dev));
1413 	rth->rt_flags	= RTCF_MULTICAST;
1414 	rth->rt_type	= RTN_MULTICAST;
1415 	rth->rt_is_input= 1;
1416 	rth->rt_iif	= 0;
1417 	rth->rt_pmtu	= 0;
1418 	rth->rt_gateway	= 0;
1419 	rth->rt_uses_gateway = 0;
1420 	INIT_LIST_HEAD(&rth->rt_uncached);
1421 	if (our) {
1422 		rth->dst.input= ip_local_deliver;
1423 		rth->rt_flags |= RTCF_LOCAL;
1424 	}
1425 
1426 #ifdef CONFIG_IP_MROUTE
1427 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1428 		rth->dst.input = ip_mr_input;
1429 #endif
1430 	RT_CACHE_STAT_INC(in_slow_mc);
1431 
1432 	skb_dst_set(skb, &rth->dst);
1433 	return 0;
1434 
1435 e_nobufs:
1436 	return -ENOBUFS;
1437 e_inval:
1438 	return -EINVAL;
1439 e_err:
1440 	return err;
1441 }
1442 
1443 
1444 static void ip_handle_martian_source(struct net_device *dev,
1445 				     struct in_device *in_dev,
1446 				     struct sk_buff *skb,
1447 				     __be32 daddr,
1448 				     __be32 saddr)
1449 {
1450 	RT_CACHE_STAT_INC(in_martian_src);
1451 #ifdef CONFIG_IP_ROUTE_VERBOSE
1452 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1453 		/*
1454 		 *	RFC1812 recommendation, if source is martian,
1455 		 *	the only hint is MAC header.
1456 		 */
1457 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1458 			&daddr, &saddr, dev->name);
1459 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1460 			print_hex_dump(KERN_WARNING, "ll header: ",
1461 				       DUMP_PREFIX_OFFSET, 16, 1,
1462 				       skb_mac_header(skb),
1463 				       dev->hard_header_len, true);
1464 		}
1465 	}
1466 #endif
1467 }
1468 
1469 /* called in rcu_read_lock() section */
1470 static int __mkroute_input(struct sk_buff *skb,
1471 			   const struct fib_result *res,
1472 			   struct in_device *in_dev,
1473 			   __be32 daddr, __be32 saddr, u32 tos)
1474 {
1475 	struct rtable *rth;
1476 	int err;
1477 	struct in_device *out_dev;
1478 	unsigned int flags = 0;
1479 	bool do_cache;
1480 	u32 itag;
1481 
1482 	/* get a working reference to the output device */
1483 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1484 	if (out_dev == NULL) {
1485 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1486 		return -EINVAL;
1487 	}
1488 
1489 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1490 				  in_dev->dev, in_dev, &itag);
1491 	if (err < 0) {
1492 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1493 					 saddr);
1494 
1495 		goto cleanup;
1496 	}
1497 
1498 	do_cache = res->fi && !itag;
1499 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1500 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1501 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1502 		flags |= RTCF_DOREDIRECT;
1503 		do_cache = false;
1504 	}
1505 
1506 	if (skb->protocol != htons(ETH_P_IP)) {
1507 		/* Not IP (i.e. ARP). Do not create a route if it is
1508 		 * invalid for proxy arp. DNAT routes are always valid.
1509 		 *
1510 		 * The proxy arp feature has been extended to allow ARP
1511 		 * replies back out the same interface, to support
1512 		 * Private VLAN switch technologies. See arp.c.
1513 		 */
1514 		if (out_dev == in_dev &&
1515 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1516 			err = -EINVAL;
1517 			goto cleanup;
1518 		}
1519 	}
1520 
1521 	if (do_cache) {
1522 		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1523 		if (rt_cache_valid(rth)) {
1524 			skb_dst_set_noref(skb, &rth->dst);
1525 			goto out;
1526 		}
1527 	}
1528 
1529 	rth = rt_dst_alloc(out_dev->dev,
1530 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1531 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1532 	if (!rth) {
1533 		err = -ENOBUFS;
1534 		goto cleanup;
1535 	}
1536 
1537 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1538 	rth->rt_flags = flags;
1539 	rth->rt_type = res->type;
1540 	rth->rt_is_input = 1;
1541 	rth->rt_iif 	= 0;
1542 	rth->rt_pmtu	= 0;
1543 	rth->rt_gateway	= 0;
1544 	rth->rt_uses_gateway = 0;
1545 	INIT_LIST_HEAD(&rth->rt_uncached);
1546 
1547 	rth->dst.input = ip_forward;
1548 	rth->dst.output = ip_output;
1549 
1550 	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1551 	skb_dst_set(skb, &rth->dst);
1552 out:
1553 	err = 0;
1554  cleanup:
1555 	return err;
1556 }
1557 
1558 static int ip_mkroute_input(struct sk_buff *skb,
1559 			    struct fib_result *res,
1560 			    const struct flowi4 *fl4,
1561 			    struct in_device *in_dev,
1562 			    __be32 daddr, __be32 saddr, u32 tos)
1563 {
1564 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1565 	if (res->fi && res->fi->fib_nhs > 1)
1566 		fib_select_multipath(res);
1567 #endif
1568 
1569 	/* create a routing cache entry */
1570 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1571 }
1572 
1573 /*
1574  *	NOTE. We drop all packets that have a local source
1575  *	address, because every properly looped-back packet
1576  *	must already have the correct destination attached by the output routine.
1577  *
1578  *	Such an approach solves two big problems:
1579  *	1. Non-simplex devices are handled properly.
1580  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1581  *	Called with rcu_read_lock().
1582  */
1583 
1584 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1585 			       u8 tos, struct net_device *dev)
1586 {
1587 	struct fib_result res;
1588 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1589 	struct flowi4	fl4;
1590 	unsigned int	flags = 0;
1591 	u32		itag = 0;
1592 	struct rtable	*rth;
1593 	int		err = -EINVAL;
1594 	struct net    *net = dev_net(dev);
1595 	bool do_cache;
1596 
1597 	/* IP on this device is disabled. */
1598 
1599 	if (!in_dev)
1600 		goto out;
1601 
1602 	/* Check for the most weird martians, which cannot be detected
1603 	   by fib_lookup.
1604 	 */
1605 
1606 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1607 		goto martian_source;
1608 
1609 	res.fi = NULL;
1610 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1611 		goto brd_input;
1612 
1613 	/* Accept zero source addresses only for limited broadcast;
1614 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
1615 	 */
1616 	if (ipv4_is_zeronet(saddr))
1617 		goto martian_source;
1618 
1619 	if (ipv4_is_zeronet(daddr))
1620 		goto martian_destination;
1621 
1622 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1623 	 * more than once, invoking it only when daddr and/or saddr is a loopback address.
1624 	 */
1625 	if (ipv4_is_loopback(daddr)) {
1626 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1627 			goto martian_destination;
1628 	} else if (ipv4_is_loopback(saddr)) {
1629 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1630 			goto martian_source;
1631 	}
1632 
1633 	/*
1634 	 *	Now we are ready to route packet.
1635 	 */
1636 	fl4.flowi4_oif = 0;
1637 	fl4.flowi4_iif = dev->ifindex;
1638 	fl4.flowi4_mark = skb->mark;
1639 	fl4.flowi4_tos = tos;
1640 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1641 	fl4.daddr = daddr;
1642 	fl4.saddr = saddr;
1643 	err = fib_lookup(net, &fl4, &res);
1644 	if (err != 0)
1645 		goto no_route;
1646 
1647 	RT_CACHE_STAT_INC(in_slow_tot);
1648 
1649 	if (res.type == RTN_BROADCAST)
1650 		goto brd_input;
1651 
1652 	if (res.type == RTN_LOCAL) {
1653 		err = fib_validate_source(skb, saddr, daddr, tos,
1654 					  LOOPBACK_IFINDEX,
1655 					  dev, in_dev, &itag);
1656 		if (err < 0)
1657 			goto martian_source_keep_err;
1658 		goto local_input;
1659 	}
1660 
1661 	if (!IN_DEV_FORWARD(in_dev))
1662 		goto no_route;
1663 	if (res.type != RTN_UNICAST)
1664 		goto martian_destination;
1665 
1666 	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1667 out:	return err;
1668 
1669 brd_input:
1670 	if (skb->protocol != htons(ETH_P_IP))
1671 		goto e_inval;
1672 
1673 	if (!ipv4_is_zeronet(saddr)) {
1674 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1675 					  in_dev, &itag);
1676 		if (err < 0)
1677 			goto martian_source_keep_err;
1678 	}
1679 	flags |= RTCF_BROADCAST;
1680 	res.type = RTN_BROADCAST;
1681 	RT_CACHE_STAT_INC(in_brd);
1682 
1683 local_input:
1684 	do_cache = false;
1685 	if (res.fi) {
1686 		if (!itag) {
1687 			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1688 			if (rt_cache_valid(rth)) {
1689 				skb_dst_set_noref(skb, &rth->dst);
1690 				err = 0;
1691 				goto out;
1692 			}
1693 			do_cache = true;
1694 		}
1695 	}
1696 
1697 	rth = rt_dst_alloc(net->loopback_dev,
1698 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1699 	if (!rth)
1700 		goto e_nobufs;
1701 
1702 	rth->dst.input= ip_local_deliver;
1703 	rth->dst.output= ip_rt_bug;
1704 #ifdef CONFIG_IP_ROUTE_CLASSID
1705 	rth->dst.tclassid = itag;
1706 #endif
1707 
1708 	rth->rt_genid = rt_genid(net);
1709 	rth->rt_flags 	= flags|RTCF_LOCAL;
1710 	rth->rt_type	= res.type;
1711 	rth->rt_is_input = 1;
1712 	rth->rt_iif	= 0;
1713 	rth->rt_pmtu	= 0;
1714 	rth->rt_gateway	= 0;
1715 	rth->rt_uses_gateway = 0;
1716 	INIT_LIST_HEAD(&rth->rt_uncached);
1717 	if (res.type == RTN_UNREACHABLE) {
1718 		rth->dst.input= ip_error;
1719 		rth->dst.error= -err;
1720 		rth->rt_flags 	&= ~RTCF_LOCAL;
1721 	}
1722 	if (do_cache)
1723 		rt_cache_route(&FIB_RES_NH(res), rth);
1724 	skb_dst_set(skb, &rth->dst);
1725 	err = 0;
1726 	goto out;
1727 
1728 no_route:
1729 	RT_CACHE_STAT_INC(in_no_route);
1730 	res.type = RTN_UNREACHABLE;
1731 	if (err == -ESRCH)
1732 		err = -ENETUNREACH;
1733 	goto local_input;
1734 
1735 	/*
1736 	 *	Do not cache martian addresses: they should be logged (RFC1812)
1737 	 */
1738 martian_destination:
1739 	RT_CACHE_STAT_INC(in_martian_dst);
1740 #ifdef CONFIG_IP_ROUTE_VERBOSE
1741 	if (IN_DEV_LOG_MARTIANS(in_dev))
1742 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1743 				     &daddr, &saddr, dev->name);
1744 #endif
1745 
1746 e_inval:
1747 	err = -EINVAL;
1748 	goto out;
1749 
1750 e_nobufs:
1751 	err = -ENOBUFS;
1752 	goto out;
1753 
1754 martian_source:
1755 	err = -EINVAL;
1756 martian_source_keep_err:
1757 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1758 	goto out;
1759 }
1760 
1761 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1762 			 u8 tos, struct net_device *dev)
1763 {
1764 	int res;
1765 
1766 	rcu_read_lock();
1767 
1768 	/* Multicast recognition logic is moved from the route cache to here.
1769 	   The problem was that too many Ethernet cards have broken/missing
1770 	   hardware multicast filters :-( As a result, a host on a multicast
1771 	   network acquires a lot of useless route cache entries, e.g. for
1772 	   SDR messages from all over the world. Now we try to get rid of them.
1773 	   Really, provided the software IP multicast filter is organized
1774 	   reasonably (at least, hashed), it does not result in a slowdown
1775 	   compared with route cache reject entries.
1776 	   Note that multicast routers are not affected, because
1777 	   a route cache entry is created eventually.
1778 	 */
1779 	if (ipv4_is_multicast(daddr)) {
1780 		struct in_device *in_dev = __in_dev_get_rcu(dev);
1781 
1782 		if (in_dev) {
1783 			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1784 						  ip_hdr(skb)->protocol);
1785 			if (our
1786 #ifdef CONFIG_IP_MROUTE
1787 				||
1788 			    (!ipv4_is_local_multicast(daddr) &&
1789 			     IN_DEV_MFORWARD(in_dev))
1790 #endif
1791 			   ) {
1792 				int res = ip_route_input_mc(skb, daddr, saddr,
1793 							    tos, dev, our);
1794 				rcu_read_unlock();
1795 				return res;
1796 			}
1797 		}
1798 		rcu_read_unlock();
1799 		return -EINVAL;
1800 	}
1801 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1802 	rcu_read_unlock();
1803 	return res;
1804 }
1805 EXPORT_SYMBOL(ip_route_input_noref);
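
/*
 * A hedged caller sketch: this mirrors what ip_rcv_finish() does for
 * every received packet without a route -- on success a (possibly
 * refcount-less) dst is attached to the skb, and skb_dst(skb)->input()
 * then decides its fate.
 */
static int __maybe_unused input_route_example(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	return ip_route_input_noref(skb, iph->daddr, iph->saddr,
				    iph->tos, skb->dev);
}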
1806 
1807 /* called with rcu_read_lock() */
1808 static struct rtable *__mkroute_output(const struct fib_result *res,
1809 				       const struct flowi4 *fl4, int orig_oif,
1810 				       struct net_device *dev_out,
1811 				       unsigned int flags)
1812 {
1813 	struct fib_info *fi = res->fi;
1814 	struct fib_nh_exception *fnhe;
1815 	struct in_device *in_dev;
1816 	u16 type = res->type;
1817 	struct rtable *rth;
1818 	bool do_cache;
1819 
1820 	in_dev = __in_dev_get_rcu(dev_out);
1821 	if (!in_dev)
1822 		return ERR_PTR(-EINVAL);
1823 
1824 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1825 		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1826 			return ERR_PTR(-EINVAL);
1827 
1828 	if (ipv4_is_lbcast(fl4->daddr))
1829 		type = RTN_BROADCAST;
1830 	else if (ipv4_is_multicast(fl4->daddr))
1831 		type = RTN_MULTICAST;
1832 	else if (ipv4_is_zeronet(fl4->daddr))
1833 		return ERR_PTR(-EINVAL);
1834 
1835 	if (dev_out->flags & IFF_LOOPBACK)
1836 		flags |= RTCF_LOCAL;
1837 
1838 	do_cache = true;
1839 	if (type == RTN_BROADCAST) {
1840 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
1841 		fi = NULL;
1842 	} else if (type == RTN_MULTICAST) {
1843 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
1844 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1845 				     fl4->flowi4_proto))
1846 			flags &= ~RTCF_LOCAL;
1847 		else
1848 			do_cache = false;
1849 		/* If no multicast route exists, use the
1850 		 * default one, but do not use a gateway in this case.
1851 		 * Yes, it is a hack.
1852 		 */
1853 		if (fi && res->prefixlen < 4)
1854 			fi = NULL;
1855 	}
1856 
1857 	fnhe = NULL;
1858 	do_cache &= fi != NULL;
1859 	if (do_cache) {
1860 		struct rtable __rcu **prth;
1861 		struct fib_nh *nh = &FIB_RES_NH(*res);
1862 
1863 		fnhe = find_exception(nh, fl4->daddr);
1864 		if (fnhe)
1865 			prth = &fnhe->fnhe_rth;
1866 		else {
1867 			if (unlikely(fl4->flowi4_flags &
1868 				     FLOWI_FLAG_KNOWN_NH &&
1869 				     !(nh->nh_gw &&
1870 				       nh->nh_scope == RT_SCOPE_LINK))) {
1871 				do_cache = false;
1872 				goto add;
1873 			}
1874 			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1875 		}
1876 		rth = rcu_dereference(*prth);
1877 		if (rt_cache_valid(rth)) {
1878 			dst_hold(&rth->dst);
1879 			return rth;
1880 		}
1881 	}
1882 
1883 add:
1884 	rth = rt_dst_alloc(dev_out,
1885 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1886 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
1887 			   do_cache);
1888 	if (!rth)
1889 		return ERR_PTR(-ENOBUFS);
1890 
1891 	rth->dst.output = ip_output;
1892 
1893 	rth->rt_genid = rt_genid(dev_net(dev_out));
1894 	rth->rt_flags	= flags;
1895 	rth->rt_type	= type;
1896 	rth->rt_is_input = 0;
1897 	rth->rt_iif	= orig_oif ? : 0;
1898 	rth->rt_pmtu	= 0;
1899 	rth->rt_gateway = 0;
1900 	rth->rt_uses_gateway = 0;
1901 	INIT_LIST_HEAD(&rth->rt_uncached);
1902 
1903 	RT_CACHE_STAT_INC(out_slow_tot);
1904 
1905 	if (flags & RTCF_LOCAL)
1906 		rth->dst.input = ip_local_deliver;
1907 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1908 		if (flags & RTCF_LOCAL &&
1909 		    !(dev_out->flags & IFF_LOOPBACK)) {
1910 			rth->dst.output = ip_mc_output;
1911 			RT_CACHE_STAT_INC(out_slow_mc);
1912 		}
1913 #ifdef CONFIG_IP_MROUTE
1914 		if (type == RTN_MULTICAST) {
1915 			if (IN_DEV_MFORWARD(in_dev) &&
1916 			    !ipv4_is_local_multicast(fl4->daddr)) {
1917 				rth->dst.input = ip_mr_input;
1918 				rth->dst.output = ip_mc_output;
1919 			}
1920 		}
1921 #endif
1922 	}
1923 
1924 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1925 
1926 	return rth;
1927 }
1928 
1929 /*
1930  * Major route resolver routine.
1931  */
1932 
1933 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1934 {
1935 	struct net_device *dev_out = NULL;
1936 	__u8 tos = RT_FL_TOS(fl4);
1937 	unsigned int flags = 0;
1938 	struct fib_result res;
1939 	struct rtable *rth;
1940 	int orig_oif;
1941 
1942 	res.tclassid	= 0;
1943 	res.fi		= NULL;
1944 	res.table	= NULL;
1945 
1946 	orig_oif = fl4->flowi4_oif;
1947 
1948 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
1949 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1950 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1951 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1952 
1953 	rcu_read_lock();
1954 	if (fl4->saddr) {
1955 		rth = ERR_PTR(-EINVAL);
1956 		if (ipv4_is_multicast(fl4->saddr) ||
1957 		    ipv4_is_lbcast(fl4->saddr) ||
1958 		    ipv4_is_zeronet(fl4->saddr))
1959 			goto out;
1960 
		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface
		      if saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with a
		      saddr of another iface. --ANK
		 */
1968 
1969 		if (fl4->flowi4_oif == 0 &&
1970 		    (ipv4_is_multicast(fl4->daddr) ||
1971 		     ipv4_is_lbcast(fl4->daddr))) {
1972 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1973 			dev_out = __ip_dev_find(net, fl4->saddr, false);
1974 			if (dev_out == NULL)
1975 				goto out;
1976 
			/* Special hack: the user can direct multicasts
			   and limited broadcasts via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun; it allows
			   vic, vat and friends to work.
			   They bind a socket to the loopback device, set the
			   ttl to zero and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source address
			   (the routing cache cannot know that the ttl is
			   zero and hence that the packet will never leave
			   this host, which would make the route valid).
			   Luckily, this hack is a good workaround.
			 */
1991 
1992 			fl4->flowi4_oif = dev_out->ifindex;
1993 			goto make_route;
1994 		}
1995 
1996 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1997 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1998 			if (!__ip_dev_find(net, fl4->saddr, false))
1999 				goto out;
2000 		}
2001 	}
2002 
2003 
2004 	if (fl4->flowi4_oif) {
2005 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2006 		rth = ERR_PTR(-ENODEV);
2007 		if (dev_out == NULL)
2008 			goto out;
2009 
2010 		/* RACE: Check return value of inet_select_addr instead. */
2011 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2012 			rth = ERR_PTR(-ENETUNREACH);
2013 			goto out;
2014 		}
2015 		if (ipv4_is_local_multicast(fl4->daddr) ||
2016 		    ipv4_is_lbcast(fl4->daddr)) {
2017 			if (!fl4->saddr)
2018 				fl4->saddr = inet_select_addr(dev_out, 0,
2019 							      RT_SCOPE_LINK);
2020 			goto make_route;
2021 		}
2022 		if (fl4->saddr) {
2023 			if (ipv4_is_multicast(fl4->daddr))
2024 				fl4->saddr = inet_select_addr(dev_out, 0,
2025 							      fl4->flowi4_scope);
2026 			else if (!fl4->daddr)
2027 				fl4->saddr = inet_select_addr(dev_out, 0,
2028 							      RT_SCOPE_HOST);
2029 		}
2030 	}
2031 
2032 	if (!fl4->daddr) {
2033 		fl4->daddr = fl4->saddr;
2034 		if (!fl4->daddr)
2035 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2036 		dev_out = net->loopback_dev;
2037 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2038 		res.type = RTN_LOCAL;
2039 		flags |= RTCF_LOCAL;
2040 		goto make_route;
2041 	}
2042 
2043 	if (fib_lookup(net, fl4, &res)) {
2044 		res.fi = NULL;
2045 		res.table = NULL;
2046 		if (fl4->flowi4_oif) {
			/* Apparently, the routing tables are wrong.  Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface even if
			   it has NO routes and NO assigned addresses.  When
			   oif is specified, the routing tables are looked up
			   with only one purpose: to catch whether the
			   destination is gatewayed rather than direct.
			   Moreover, if MSG_DONTROUTE is set, we send the
			   packet, ignoring both the routing tables and the
			   ifaddr state. --ANK

			   We could do the same even when oif is unknown
			   (as IPv6 likely does), but we do not.
			 */
2064 
2065 			if (fl4->saddr == 0)
2066 				fl4->saddr = inet_select_addr(dev_out, 0,
2067 							      RT_SCOPE_LINK);
2068 			res.type = RTN_UNICAST;
2069 			goto make_route;
2070 		}
2071 		rth = ERR_PTR(-ENETUNREACH);
2072 		goto out;
2073 	}
2074 
2075 	if (res.type == RTN_LOCAL) {
2076 		if (!fl4->saddr) {
2077 			if (res.fi->fib_prefsrc)
2078 				fl4->saddr = res.fi->fib_prefsrc;
2079 			else
2080 				fl4->saddr = fl4->daddr;
2081 		}
2082 		dev_out = net->loopback_dev;
2083 		fl4->flowi4_oif = dev_out->ifindex;
2084 		flags |= RTCF_LOCAL;
2085 		goto make_route;
2086 	}
2087 
2088 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2089 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2090 		fib_select_multipath(&res);
2091 	else
2092 #endif
2093 	if (!res.prefixlen &&
2094 	    res.table->tb_num_default > 1 &&
2095 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2096 		fib_select_default(&res);
2097 
2098 	if (!fl4->saddr)
2099 		fl4->saddr = FIB_RES_PREFSRC(net, res);
2100 
2101 	dev_out = FIB_RES_DEV(res);
2102 	fl4->flowi4_oif = dev_out->ifindex;
2103 
2104 
2105 make_route:
2106 	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2107 
2108 out:
2109 	rcu_read_unlock();
2110 	return rth;
2111 }
2112 EXPORT_SYMBOL_GPL(__ip_route_output_key);
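
/* A minimal sketch of a hypothetical caller (illustration only; daddr and
 * tos stand in for real values):
 *
 *	struct flowi4 fl4 = {
 *		.daddr      = daddr,
 *		.flowi4_tos = RT_TOS(tos),
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 *
 * ip_route_output_key() is the header wrapper around the function above.
 */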
2113 
2114 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2115 {
2116 	return NULL;
2117 }
2118 
2119 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2120 {
2121 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2122 
2123 	return mtu ? : dst->dev->mtu;
2124 }
2125 
2126 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2127 					  struct sk_buff *skb, u32 mtu)
2128 {
2129 }
2130 
2131 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2132 				       struct sk_buff *skb)
2133 {
2134 }
2135 
2136 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2137 					  unsigned long old)
2138 {
2139 	return NULL;
2140 }
2141 
2142 static struct dst_ops ipv4_dst_blackhole_ops = {
2143 	.family			=	AF_INET,
2144 	.protocol		=	cpu_to_be16(ETH_P_IP),
2145 	.check			=	ipv4_blackhole_dst_check,
2146 	.mtu			=	ipv4_blackhole_mtu,
2147 	.default_advmss		=	ipv4_default_advmss,
2148 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2149 	.redirect		=	ipv4_rt_blackhole_redirect,
2150 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2151 	.neigh_lookup		=	ipv4_neigh_lookup,
2152 };
2153 
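/* A blackhole dst is deliberately inert: it reports no PMTU or redirect
 * events, never COWs metrics, and fails every dst_check().  It is handed
 * out (e.g. by the xfrm code while security associations are still being
 * negotiated) so that callers hold a harmless dst whose packets are
 * silently discarded.
 */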
2154 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2155 {
2156 	struct rtable *ort = (struct rtable *) dst_orig;
2157 	struct rtable *rt;
2158 
2159 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2160 	if (rt) {
2161 		struct dst_entry *new = &rt->dst;
2162 
2163 		new->__use = 1;
2164 		new->input = dst_discard;
2165 		new->output = dst_discard;
2166 
2167 		new->dev = ort->dst.dev;
2168 		if (new->dev)
2169 			dev_hold(new->dev);
2170 
2171 		rt->rt_is_input = ort->rt_is_input;
2172 		rt->rt_iif = ort->rt_iif;
2173 		rt->rt_pmtu = ort->rt_pmtu;
2174 
2175 		rt->rt_genid = rt_genid(net);
2176 		rt->rt_flags = ort->rt_flags;
2177 		rt->rt_type = ort->rt_type;
2178 		rt->rt_gateway = ort->rt_gateway;
2179 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2180 
2181 		INIT_LIST_HEAD(&rt->rt_uncached);
2182 
2183 		dst_free(new);
2184 	}
2185 
2186 	dst_release(dst_orig);
2187 
2188 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2189 }
2190 
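/* Like __ip_route_output_key(), but when the flow carries a transport
 * protocol the result is additionally passed through xfrm_lookup(), so
 * IPsec policy may transform or veto the route.  A sketch of a typical
 * caller (hypothetical variable names, for illustration only):
 *
 *	struct flowi4 fl4;
 *
 *	flowi4_init_output(&fl4, oif, sk->sk_mark, RT_TOS(tos),
 *			   RT_SCOPE_UNIVERSE, IPPROTO_UDP, 0,
 *			   daddr, saddr, dport, sport);
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */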
2191 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2192 				    struct sock *sk)
2193 {
2194 	struct rtable *rt = __ip_route_output_key(net, flp4);
2195 
2196 	if (IS_ERR(rt))
2197 		return rt;
2198 
2199 	if (flp4->flowi4_proto)
2200 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2201 						   flowi4_to_flowi(flp4),
2202 						   sk, 0);
2203 
2204 	return rt;
2205 }
2206 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2207 
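/* Render the route attached to @skb, together with the flow it was
 * resolved for, into a single RTM_NEWROUTE netlink message; used below to
 * answer RTM_GETROUTE requests.
 */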
2208 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2209 			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2210 			u32 seq, int event, int nowait, unsigned int flags)
2211 {
2212 	struct rtable *rt = skb_rtable(skb);
2213 	struct rtmsg *r;
2214 	struct nlmsghdr *nlh;
2215 	unsigned long expires = 0;
2216 	u32 error;
2217 	u32 metrics[RTAX_MAX];
2218 
2219 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2220 	if (nlh == NULL)
2221 		return -EMSGSIZE;
2222 
2223 	r = nlmsg_data(nlh);
2224 	r->rtm_family	 = AF_INET;
2225 	r->rtm_dst_len	= 32;
2226 	r->rtm_src_len	= 0;
2227 	r->rtm_tos	= fl4->flowi4_tos;
2228 	r->rtm_table	= RT_TABLE_MAIN;
2229 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2230 		goto nla_put_failure;
2231 	r->rtm_type	= rt->rt_type;
2232 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2233 	r->rtm_protocol = RTPROT_UNSPEC;
2234 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2235 	if (rt->rt_flags & RTCF_NOTIFY)
2236 		r->rtm_flags |= RTM_F_NOTIFY;
2237 
2238 	if (nla_put_be32(skb, RTA_DST, dst))
2239 		goto nla_put_failure;
2240 	if (src) {
2241 		r->rtm_src_len = 32;
2242 		if (nla_put_be32(skb, RTA_SRC, src))
2243 			goto nla_put_failure;
2244 	}
2245 	if (rt->dst.dev &&
2246 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2247 		goto nla_put_failure;
2248 #ifdef CONFIG_IP_ROUTE_CLASSID
2249 	if (rt->dst.tclassid &&
2250 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2251 		goto nla_put_failure;
2252 #endif
2253 	if (!rt_is_input_route(rt) &&
2254 	    fl4->saddr != src) {
2255 		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2256 			goto nla_put_failure;
2257 	}
2258 	if (rt->rt_uses_gateway &&
2259 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2260 		goto nla_put_failure;
2261 
2262 	expires = rt->dst.expires;
2263 	if (expires) {
2264 		unsigned long now = jiffies;
2265 
2266 		if (time_before(now, expires))
2267 			expires -= now;
2268 		else
2269 			expires = 0;
2270 	}
2271 
2272 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2273 	if (rt->rt_pmtu && expires)
2274 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2275 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2276 		goto nla_put_failure;
2277 
2278 	if (fl4->flowi4_mark &&
2279 	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2280 		goto nla_put_failure;
2281 
2282 	error = rt->dst.error;
2283 
2284 	if (rt_is_input_route(rt)) {
2285 #ifdef CONFIG_IP_MROUTE
2286 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2287 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2288 			int err = ipmr_get_route(net, skb,
2289 						 fl4->saddr, fl4->daddr,
2290 						 r, nowait);
2291 			if (err <= 0) {
2292 				if (!nowait) {
2293 					if (err == 0)
2294 						return 0;
2295 					goto nla_put_failure;
2296 				} else {
2297 					if (err == -EMSGSIZE)
2298 						goto nla_put_failure;
2299 					error = err;
2300 				}
2301 			}
2302 		} else
2303 #endif
2304 			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2305 				goto nla_put_failure;
2306 	}
2307 
2308 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2309 		goto nla_put_failure;
2310 
2311 	return nlmsg_end(skb, nlh);
2312 
2313 nla_put_failure:
2314 	nlmsg_cancel(skb, nlh);
2315 	return -EMSGSIZE;
2316 }
2317 
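/* Netlink RTM_GETROUTE handler; this is what "ip route get <addr>"
 * ultimately exercises.  With RTA_IIF the route is resolved as if the
 * packet had arrived on that device; otherwise an output lookup is done.
 * The result is sent back to the requester as an RTM_NEWROUTE message.
 */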
2318 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2319 {
2320 	struct net *net = sock_net(in_skb->sk);
2321 	struct rtmsg *rtm;
2322 	struct nlattr *tb[RTA_MAX+1];
2323 	struct rtable *rt = NULL;
2324 	struct flowi4 fl4;
2325 	__be32 dst = 0;
2326 	__be32 src = 0;
2327 	u32 iif;
2328 	int err;
2329 	int mark;
2330 	struct sk_buff *skb;
2331 
2332 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2333 	if (err < 0)
2334 		goto errout;
2335 
2336 	rtm = nlmsg_data(nlh);
2337 
2338 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2339 	if (skb == NULL) {
2340 		err = -ENOBUFS;
2341 		goto errout;
2342 	}
2343 
	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
2347 	skb_reset_mac_header(skb);
2348 	skb_reset_network_header(skb);
2349 
2350 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2351 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2352 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2353 
2354 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2355 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2356 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2357 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2358 
2359 	memset(&fl4, 0, sizeof(fl4));
2360 	fl4.daddr = dst;
2361 	fl4.saddr = src;
2362 	fl4.flowi4_tos = rtm->rtm_tos;
2363 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2364 	fl4.flowi4_mark = mark;
2365 
2366 	if (iif) {
2367 		struct net_device *dev;
2368 
2369 		dev = __dev_get_by_index(net, iif);
2370 		if (dev == NULL) {
2371 			err = -ENODEV;
2372 			goto errout_free;
2373 		}
2374 
2375 		skb->protocol	= htons(ETH_P_IP);
2376 		skb->dev	= dev;
2377 		skb->mark	= mark;
2378 		local_bh_disable();
2379 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2380 		local_bh_enable();
2381 
2382 		rt = skb_rtable(skb);
2383 		if (err == 0 && rt->dst.error)
2384 			err = -rt->dst.error;
2385 	} else {
2386 		rt = ip_route_output_key(net, &fl4);
2387 
2388 		err = 0;
2389 		if (IS_ERR(rt))
2390 			err = PTR_ERR(rt);
2391 	}
2392 
2393 	if (err)
2394 		goto errout_free;
2395 
2396 	skb_dst_set(skb, &rt->dst);
2397 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2398 		rt->rt_flags |= RTCF_NOTIFY;
2399 
2400 	err = rt_fill_info(net, dst, src, &fl4, skb,
2401 			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2402 			   RTM_NEWROUTE, 0, 0);
2403 	if (err <= 0)
2404 		goto errout_free;
2405 
2406 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2407 errout:
2408 	return err;
2409 
2410 errout_free:
2411 	kfree_skb(skb);
2412 	goto errout;
2413 }
2414 
2415 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2416 {
2417 	return skb->len;
2418 }
2419 
2420 void ip_rt_multicast_event(struct in_device *in_dev)
2421 {
2422 	rt_cache_flush(dev_net(in_dev->dev));
2423 }
2424 
2425 #ifdef CONFIG_SYSCTL
2426 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2427 					void __user *buffer,
2428 					size_t *lenp, loff_t *ppos)
2429 {
2430 	if (write) {
2431 		rt_cache_flush((struct net *)__ctl->extra1);
2432 		return 0;
2433 	}
2434 
2435 	return -EINVAL;
2436 }
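
/* The handler above backs the write-only "flush" file registered further
 * down.  From userspace (assuming the usual /proc mount), e.g.:
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * Any write triggers rt_cache_flush() for the owning netns.
 */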
2437 
2438 static ctl_table ipv4_route_table[] = {
2439 	{
2440 		.procname	= "gc_thresh",
2441 		.data		= &ipv4_dst_ops.gc_thresh,
2442 		.maxlen		= sizeof(int),
2443 		.mode		= 0644,
2444 		.proc_handler	= proc_dointvec,
2445 	},
2446 	{
2447 		.procname	= "max_size",
2448 		.data		= &ip_rt_max_size,
2449 		.maxlen		= sizeof(int),
2450 		.mode		= 0644,
2451 		.proc_handler	= proc_dointvec,
2452 	},
2453 	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
2457 		.data		= &ip_rt_gc_min_interval,
2458 		.maxlen		= sizeof(int),
2459 		.mode		= 0644,
2460 		.proc_handler	= proc_dointvec_jiffies,
2461 	},
2462 	{
2463 		.procname	= "gc_min_interval_ms",
2464 		.data		= &ip_rt_gc_min_interval,
2465 		.maxlen		= sizeof(int),
2466 		.mode		= 0644,
2467 		.proc_handler	= proc_dointvec_ms_jiffies,
2468 	},
2469 	{
2470 		.procname	= "gc_timeout",
2471 		.data		= &ip_rt_gc_timeout,
2472 		.maxlen		= sizeof(int),
2473 		.mode		= 0644,
2474 		.proc_handler	= proc_dointvec_jiffies,
2475 	},
2476 	{
2477 		.procname	= "gc_interval",
2478 		.data		= &ip_rt_gc_interval,
2479 		.maxlen		= sizeof(int),
2480 		.mode		= 0644,
2481 		.proc_handler	= proc_dointvec_jiffies,
2482 	},
2483 	{
2484 		.procname	= "redirect_load",
2485 		.data		= &ip_rt_redirect_load,
2486 		.maxlen		= sizeof(int),
2487 		.mode		= 0644,
2488 		.proc_handler	= proc_dointvec,
2489 	},
2490 	{
2491 		.procname	= "redirect_number",
2492 		.data		= &ip_rt_redirect_number,
2493 		.maxlen		= sizeof(int),
2494 		.mode		= 0644,
2495 		.proc_handler	= proc_dointvec,
2496 	},
2497 	{
2498 		.procname	= "redirect_silence",
2499 		.data		= &ip_rt_redirect_silence,
2500 		.maxlen		= sizeof(int),
2501 		.mode		= 0644,
2502 		.proc_handler	= proc_dointvec,
2503 	},
2504 	{
2505 		.procname	= "error_cost",
2506 		.data		= &ip_rt_error_cost,
2507 		.maxlen		= sizeof(int),
2508 		.mode		= 0644,
2509 		.proc_handler	= proc_dointvec,
2510 	},
2511 	{
2512 		.procname	= "error_burst",
2513 		.data		= &ip_rt_error_burst,
2514 		.maxlen		= sizeof(int),
2515 		.mode		= 0644,
2516 		.proc_handler	= proc_dointvec,
2517 	},
2518 	{
2519 		.procname	= "gc_elasticity",
2520 		.data		= &ip_rt_gc_elasticity,
2521 		.maxlen		= sizeof(int),
2522 		.mode		= 0644,
2523 		.proc_handler	= proc_dointvec,
2524 	},
2525 	{
2526 		.procname	= "mtu_expires",
2527 		.data		= &ip_rt_mtu_expires,
2528 		.maxlen		= sizeof(int),
2529 		.mode		= 0644,
2530 		.proc_handler	= proc_dointvec_jiffies,
2531 	},
2532 	{
2533 		.procname	= "min_pmtu",
2534 		.data		= &ip_rt_min_pmtu,
2535 		.maxlen		= sizeof(int),
2536 		.mode		= 0644,
2537 		.proc_handler	= proc_dointvec,
2538 	},
2539 	{
2540 		.procname	= "min_adv_mss",
2541 		.data		= &ip_rt_min_advmss,
2542 		.maxlen		= sizeof(int),
2543 		.mode		= 0644,
2544 		.proc_handler	= proc_dointvec,
2545 	},
2546 	{ }
2547 };
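
/* The table above is exposed under /proc/sys/net/ipv4/route/.  For
 * example (illustrative value only):
 *
 *	sysctl -w net.ipv4.route.min_pmtu=552
 */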
2548 
2549 static struct ctl_table ipv4_route_flush_table[] = {
2550 	{
2551 		.procname	= "flush",
2552 		.maxlen		= sizeof(int),
2553 		.mode		= 0200,
2554 		.proc_handler	= ipv4_sysctl_rtcache_flush,
2555 	},
2556 	{ },
2557 };
2558 
2559 static __net_init int sysctl_route_net_init(struct net *net)
2560 {
2561 	struct ctl_table *tbl;
2562 
2563 	tbl = ipv4_route_flush_table;
2564 	if (!net_eq(net, &init_net)) {
2565 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2566 		if (tbl == NULL)
2567 			goto err_dup;
2568 
2569 		/* Don't export sysctls to unprivileged users */
2570 		if (net->user_ns != &init_user_ns)
2571 			tbl[0].procname = NULL;
2572 	}
2573 	tbl[0].extra1 = net;
2574 
2575 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2576 	if (net->ipv4.route_hdr == NULL)
2577 		goto err_reg;
2578 	return 0;
2579 
2580 err_reg:
2581 	if (tbl != ipv4_route_flush_table)
2582 		kfree(tbl);
2583 err_dup:
2584 	return -ENOMEM;
2585 }
2586 
2587 static __net_exit void sysctl_route_net_exit(struct net *net)
2588 {
2589 	struct ctl_table *tbl;
2590 
2591 	tbl = net->ipv4.route_hdr->ctl_table_arg;
2592 	unregister_net_sysctl_table(net->ipv4.route_hdr);
2593 	BUG_ON(tbl == ipv4_route_flush_table);
2594 	kfree(tbl);
2595 }
2596 
2597 static __net_initdata struct pernet_operations sysctl_route_ops = {
2598 	.init = sysctl_route_net_init,
2599 	.exit = sysctl_route_net_exit,
2600 };
2601 #endif
2602 
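/* Every cached rtable records the generation id that was current when it
 * was created; rt_cache_flush() simply bumps net->rt_genid, so older
 * entries fail their next validity check and are rebuilt lazily.
 */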
2603 static __net_init int rt_genid_init(struct net *net)
2604 {
2605 	atomic_set(&net->rt_genid, 0);
2606 	get_random_bytes(&net->ipv4.dev_addr_genid,
2607 			 sizeof(net->ipv4.dev_addr_genid));
2608 	return 0;
2609 }
2610 
2611 static __net_initdata struct pernet_operations rt_genid_ops = {
2612 	.init = rt_genid_init,
2613 };
2614 
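/* Each netns gets its own inet_peer_base: long-lived state kept per
 * remote IP address (e.g. ICMP rate limiting and IP identification),
 * shared by everything that talks to the same peer.
 */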
2615 static int __net_init ipv4_inetpeer_init(struct net *net)
2616 {
2617 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2618 
2619 	if (!bp)
2620 		return -ENOMEM;
2621 	inet_peer_base_init(bp);
2622 	net->ipv4.peers = bp;
2623 	return 0;
2624 }
2625 
2626 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2627 {
2628 	struct inet_peer_base *bp = net->ipv4.peers;
2629 
2630 	net->ipv4.peers = NULL;
2631 	inetpeer_invalidate_tree(bp);
2632 	kfree(bp);
2633 }
2634 
2635 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2636 	.init	=	ipv4_inetpeer_init,
2637 	.exit	=	ipv4_inetpeer_exit,
2638 };
2639 
2640 #ifdef CONFIG_IP_ROUTE_CLASSID
2641 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2642 #endif /* CONFIG_IP_ROUTE_CLASSID */
2643 
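/* Boot-time initialization: allocate the dst slab cache and entry
 * counters, then bring up the device, FIB, proc and (optionally) xfrm
 * layers before registering the RTM_GETROUTE handler and the per-netns
 * operations defined above.
 */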
2644 int __init ip_rt_init(void)
2645 {
2646 	int rc = 0;
2647 
2648 #ifdef CONFIG_IP_ROUTE_CLASSID
2649 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2650 	if (!ip_rt_acct)
2651 		panic("IP: failed to allocate ip_rt_acct\n");
2652 #endif
2653 
2654 	ipv4_dst_ops.kmem_cachep =
2655 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2656 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2657 
2658 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2659 
2660 	if (dst_entries_init(&ipv4_dst_ops) < 0)
2661 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
2662 
2663 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2664 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2665 
2666 	ipv4_dst_ops.gc_thresh = ~0;
2667 	ip_rt_max_size = INT_MAX;
2668 
2669 	devinet_init();
2670 	ip_fib_init();
2671 
2672 	if (ip_rt_proc_init())
2673 		pr_err("Unable to create route proc files\n");
2674 #ifdef CONFIG_XFRM
2675 	xfrm_init();
2676 	xfrm4_init();
2677 #endif
2678 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2679 
2680 #ifdef CONFIG_SYSCTL
2681 	register_pernet_subsys(&sysctl_route_ops);
2682 #endif
2683 	register_pernet_subsys(&rt_genid_ops);
2684 	register_pernet_subsys(&ipv4_inetpeer_ops);
2685 	return rc;
2686 }
2687 
2688 #ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order; then all
 * this nonsense will go away.
 */
2693 void __init ip_static_sysctl_init(void)
2694 {
2695 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2696 }
2697 #endif
2698