xref: /linux/net/ipv4/route.c (revision 7056741fd9fc14a65608549a4657cf5178f05f63)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD,
35  *					though our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <net/dst.h>
93 #include <net/net_namespace.h>
94 #include <net/protocol.h>
95 #include <net/ip.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
98 #include <net/sock.h>
99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #include <linux/kmemleak.h>
109 #endif
110 #include <net/secure_seq.h>
111 
112 #define RT_FL_TOS(oldflp4) \
113 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114 
115 #define IP_MAX_MTU	0xFFF0
116 
117 #define RT_GC_TIMEOUT (300*HZ)
118 
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
123 static int ip_rt_redirect_number __read_mostly	= 9;
124 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly	= HZ;
127 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly	= 8;
129 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly	= 256;
132 
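/* A quick reading of the jiffies-based defaults above (added note, assuming
 * HZ=1000): redirects back off from 20ms (HZ/50) and fall silent for
 * ~20.5s ((HZ/50) << 10); learned PMTU values expire after 10 minutes;
 * ip_rt_min_pmtu = 552 is a 512-byte payload plus 20-byte IP and 20-byte
 * TCP headers, so a learned PMTU is never trusted below that.
 */
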
133 /*
134  *	Interface to generic destination cache.
135  */
136 
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void		 ipv4_link_failure(struct sk_buff *skb);
142 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 					   struct sk_buff *skb, u32 mtu);
144 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145 					struct sk_buff *skb);
146 static void		ipv4_dst_destroy(struct dst_entry *dst);
147 
148 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
149 			    int how)
150 {
151 }
152 
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155 	WARN_ON(1);
156 	return NULL;
157 }
158 
159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
160 					   struct sk_buff *skb,
161 					   const void *daddr);
162 
163 static struct dst_ops ipv4_dst_ops = {
164 	.family =		AF_INET,
165 	.protocol =		cpu_to_be16(ETH_P_IP),
166 	.check =		ipv4_dst_check,
167 	.default_advmss =	ipv4_default_advmss,
168 	.mtu =			ipv4_mtu,
169 	.cow_metrics =		ipv4_cow_metrics,
170 	.destroy =		ipv4_dst_destroy,
171 	.ifdown =		ipv4_dst_ifdown,
172 	.negative_advice =	ipv4_negative_advice,
173 	.link_failure =		ipv4_link_failure,
174 	.update_pmtu =		ip_rt_update_pmtu,
175 	.redirect =		ip_do_redirect,
176 	.local_out =		__ip_local_out,
177 	.neigh_lookup =		ipv4_neigh_lookup,
178 };
179 
180 #define ECN_OR_COST(class)	TC_PRIO_##class
181 
182 const __u8 ip_tos2prio[16] = {
183 	TC_PRIO_BESTEFFORT,
184 	ECN_OR_COST(BESTEFFORT),
185 	TC_PRIO_BESTEFFORT,
186 	ECN_OR_COST(BESTEFFORT),
187 	TC_PRIO_BULK,
188 	ECN_OR_COST(BULK),
189 	TC_PRIO_BULK,
190 	ECN_OR_COST(BULK),
191 	TC_PRIO_INTERACTIVE,
192 	ECN_OR_COST(INTERACTIVE),
193 	TC_PRIO_INTERACTIVE,
194 	ECN_OR_COST(INTERACTIVE),
195 	TC_PRIO_INTERACTIVE_BULK,
196 	ECN_OR_COST(INTERACTIVE_BULK),
197 	TC_PRIO_INTERACTIVE_BULK,
198 	ECN_OR_COST(INTERACTIVE_BULK)
199 };
200 EXPORT_SYMBOL(ip_tos2prio);
201 
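/* Added example: rt_tos2priority() in <net/route.h> indexes this table with
 * IPTOS_TOS(tos) >> 1.  A TOS of 0x10 (IPTOS_LOWDELAY) therefore maps to
 * ip_tos2prio[8] == TC_PRIO_INTERACTIVE, steering such packets into the
 * interactive band of the default pfifo_fast qdisc.
 */
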
202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
204 
205 #ifdef CONFIG_PROC_FS
206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207 {
208 	if (*pos)
209 		return NULL;
210 	return SEQ_START_TOKEN;
211 }
212 
213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214 {
215 	++*pos;
216 	return NULL;
217 }
218 
219 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
220 {
221 }
222 
223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
224 {
225 	if (v == SEQ_START_TOKEN)
226 		seq_printf(seq, "%-127s\n",
227 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229 			   "HHUptod\tSpecDst");
230 	return 0;
231 }
232 
233 static const struct seq_operations rt_cache_seq_ops = {
234 	.start  = rt_cache_seq_start,
235 	.next   = rt_cache_seq_next,
236 	.stop   = rt_cache_seq_stop,
237 	.show   = rt_cache_seq_show,
238 };
239 
240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
241 {
242 	return seq_open(file, &rt_cache_seq_ops);
243 }
244 
245 static const struct file_operations rt_cache_seq_fops = {
246 	.owner	 = THIS_MODULE,
247 	.open	 = rt_cache_seq_open,
248 	.read	 = seq_read,
249 	.llseek	 = seq_lseek,
250 	.release = seq_release,
251 };
252 
253 
254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255 {
256 	int cpu;
257 
258 	if (*pos == 0)
259 		return SEQ_START_TOKEN;
260 
261 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
262 		if (!cpu_possible(cpu))
263 			continue;
264 		*pos = cpu+1;
265 		return &per_cpu(rt_cache_stat, cpu);
266 	}
267 	return NULL;
268 }
269 
270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271 {
272 	int cpu;
273 
274 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
275 		if (!cpu_possible(cpu))
276 			continue;
277 		*pos = cpu+1;
278 		return &per_cpu(rt_cache_stat, cpu);
279 	}
280 	return NULL;
281 
282 }
283 
284 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
285 {
286 
287 }
288 
289 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
290 {
291 	struct rt_cache_stat *st = v;
292 
293 	if (v == SEQ_START_TOKEN) {
294 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
295 		return 0;
296 	}
297 
298 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
299 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
300 		   dst_entries_get_slow(&ipv4_dst_ops),
301 		   st->in_hit,
302 		   st->in_slow_tot,
303 		   st->in_slow_mc,
304 		   st->in_no_route,
305 		   st->in_brd,
306 		   st->in_martian_dst,
307 		   st->in_martian_src,
308 
309 		   st->out_hit,
310 		   st->out_slow_tot,
311 		   st->out_slow_mc,
312 
313 		   st->gc_total,
314 		   st->gc_ignored,
315 		   st->gc_goal_miss,
316 		   st->gc_dst_overflow,
317 		   st->in_hlist_search,
318 		   st->out_hlist_search
319 		);
320 	return 0;
321 }
322 
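/* Added note: each non-header row of /proc/net/stat/rt_cache is one possible
 * CPU, with every field printed as %08x hex; only the first column (entries)
 * is a global count, repeated on each row, rather than a per-CPU statistic.
 */
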
323 static const struct seq_operations rt_cpu_seq_ops = {
324 	.start  = rt_cpu_seq_start,
325 	.next   = rt_cpu_seq_next,
326 	.stop   = rt_cpu_seq_stop,
327 	.show   = rt_cpu_seq_show,
328 };
329 
330 
331 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
332 {
333 	return seq_open(file, &rt_cpu_seq_ops);
334 }
335 
336 static const struct file_operations rt_cpu_seq_fops = {
337 	.owner	 = THIS_MODULE,
338 	.open	 = rt_cpu_seq_open,
339 	.read	 = seq_read,
340 	.llseek	 = seq_lseek,
341 	.release = seq_release,
342 };
343 
344 #ifdef CONFIG_IP_ROUTE_CLASSID
345 static int rt_acct_proc_show(struct seq_file *m, void *v)
346 {
347 	struct ip_rt_acct *dst, *src;
348 	unsigned int i, j;
349 
350 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351 	if (!dst)
352 		return -ENOMEM;
353 
354 	for_each_possible_cpu(i) {
355 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356 		for (j = 0; j < 256; j++) {
357 			dst[j].o_bytes   += src[j].o_bytes;
358 			dst[j].o_packets += src[j].o_packets;
359 			dst[j].i_bytes   += src[j].i_bytes;
360 			dst[j].i_packets += src[j].i_packets;
361 		}
362 	}
363 
364 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365 	kfree(dst);
366 	return 0;
367 }
368 
369 static int rt_acct_proc_open(struct inode *inode, struct file *file)
370 {
371 	return single_open(file, rt_acct_proc_show, NULL);
372 }
373 
374 static const struct file_operations rt_acct_proc_fops = {
375 	.owner		= THIS_MODULE,
376 	.open		= rt_acct_proc_open,
377 	.read		= seq_read,
378 	.llseek		= seq_lseek,
379 	.release	= single_release,
380 };
381 #endif
382 
383 static int __net_init ip_rt_do_proc_init(struct net *net)
384 {
385 	struct proc_dir_entry *pde;
386 
387 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
388 			&rt_cache_seq_fops);
389 	if (!pde)
390 		goto err1;
391 
392 	pde = proc_create("rt_cache", S_IRUGO,
393 			  net->proc_net_stat, &rt_cpu_seq_fops);
394 	if (!pde)
395 		goto err2;
396 
397 #ifdef CONFIG_IP_ROUTE_CLASSID
398 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
399 	if (!pde)
400 		goto err3;
401 #endif
402 	return 0;
403 
404 #ifdef CONFIG_IP_ROUTE_CLASSID
405 err3:
406 	remove_proc_entry("rt_cache", net->proc_net_stat);
407 #endif
408 err2:
409 	remove_proc_entry("rt_cache", net->proc_net);
410 err1:
411 	return -ENOMEM;
412 }
413 
414 static void __net_exit ip_rt_do_proc_exit(struct net *net)
415 {
416 	remove_proc_entry("rt_cache", net->proc_net_stat);
417 	remove_proc_entry("rt_cache", net->proc_net);
418 #ifdef CONFIG_IP_ROUTE_CLASSID
419 	remove_proc_entry("rt_acct", net->proc_net);
420 #endif
421 }
422 
423 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
424 	.init = ip_rt_do_proc_init,
425 	.exit = ip_rt_do_proc_exit,
426 };
427 
428 static int __init ip_rt_proc_init(void)
429 {
430 	return register_pernet_subsys(&ip_rt_proc_ops);
431 }
432 
433 #else
434 static inline int ip_rt_proc_init(void)
435 {
436 	return 0;
437 }
438 #endif /* CONFIG_PROC_FS */
439 
440 static inline bool rt_is_expired(const struct rtable *rth)
441 {
442 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
443 }
444 
445 void rt_cache_flush(struct net *net)
446 {
447 	rt_genid_bump(net);
448 }
449 
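/* Added note: a flush never walks existing routes.  Bumping the namespace
 * generation id merely makes rt_is_expired() true for every rtable created
 * under the old id, so stale entries are dropped lazily by ipv4_dst_check()
 * and rt_cache_valid().
 */
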
450 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451 					   struct sk_buff *skb,
452 					   const void *daddr)
453 {
454 	struct net_device *dev = dst->dev;
455 	const __be32 *pkey = daddr;
456 	const struct rtable *rt;
457 	struct neighbour *n;
458 
459 	rt = (const struct rtable *) dst;
460 	if (rt->rt_gateway)
461 		pkey = (const __be32 *) &rt->rt_gateway;
462 	else if (skb)
463 		pkey = &ip_hdr(skb)->daddr;
464 
465 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
466 	if (n)
467 		return n;
468 	return neigh_create(&arp_tbl, pkey, dev);
469 }
470 
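/* Added note on the lookup above: ARP resolution targets the route's
 * gateway when one is set, otherwise the packet's (or flow's) final
 * destination; neigh_create() is the slow path taken only when no cached
 * neighbour entry exists for that key.
 */
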
471 /*
472  * Peer allocation may fail only in serious out-of-memory conditions.  However,
473  * we can still generate some output.
474  * Random ID selection looks a bit dangerous because we have no way to
475  * select an ID that is unique within a reasonable period of time.
476  * But a broken packet identifier may be better than no packet at all.
477  */
478 static void ip_select_fb_ident(struct iphdr *iph)
479 {
480 	static DEFINE_SPINLOCK(ip_fb_id_lock);
481 	static u32 ip_fallback_id;
482 	u32 salt;
483 
484 	spin_lock_bh(&ip_fb_id_lock);
485 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
486 	iph->id = htons(salt & 0xFFFF);
487 	ip_fallback_id = salt;
488 	spin_unlock_bh(&ip_fb_id_lock);
489 }
490 
491 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
492 {
493 	struct net *net = dev_net(dst->dev);
494 	struct inet_peer *peer;
495 
496 	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
497 	if (peer) {
498 		iph->id = htons(inet_getid(peer, more));
499 		inet_putpeer(peer);
500 		return;
501 	}
502 
503 	ip_select_fb_ident(iph);
504 }
505 EXPORT_SYMBOL(__ip_select_ident);
506 
507 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
508 			     const struct iphdr *iph,
509 			     int oif, u8 tos,
510 			     u8 prot, u32 mark, int flow_flags)
511 {
512 	if (sk) {
513 		const struct inet_sock *inet = inet_sk(sk);
514 
515 		oif = sk->sk_bound_dev_if;
516 		mark = sk->sk_mark;
517 		tos = RT_CONN_FLAGS(sk);
518 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
519 	}
520 	flowi4_init_output(fl4, oif, mark, tos,
521 			   RT_SCOPE_UNIVERSE, prot,
522 			   flow_flags,
523 			   iph->daddr, iph->saddr, 0, 0);
524 }
525 
526 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
527 			       const struct sock *sk)
528 {
529 	const struct iphdr *iph = ip_hdr(skb);
530 	int oif = skb->dev->ifindex;
531 	u8 tos = RT_TOS(iph->tos);
532 	u8 prot = iph->protocol;
533 	u32 mark = skb->mark;
534 
535 	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
536 }
537 
538 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
539 {
540 	const struct inet_sock *inet = inet_sk(sk);
541 	const struct ip_options_rcu *inet_opt;
542 	__be32 daddr = inet->inet_daddr;
543 
544 	rcu_read_lock();
545 	inet_opt = rcu_dereference(inet->inet_opt);
546 	if (inet_opt && inet_opt->opt.srr)
547 		daddr = inet_opt->opt.faddr;
548 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
549 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
550 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
551 			   inet_sk_flowi_flags(sk),
552 			   daddr, inet->inet_saddr, 0, 0);
553 	rcu_read_unlock();
554 }
555 
556 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
557 				 const struct sk_buff *skb)
558 {
559 	if (skb)
560 		build_skb_flow_key(fl4, skb, sk);
561 	else
562 		build_sk_flow_key(fl4, sk);
563 }
564 
565 static inline void rt_free(struct rtable *rt)
566 {
567 	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
568 }
569 
570 static DEFINE_SPINLOCK(fnhe_lock);
571 
572 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
573 {
574 	struct fib_nh_exception *fnhe, *oldest;
575 	struct rtable *orig;
576 
577 	oldest = rcu_dereference(hash->chain);
578 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
579 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
580 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
581 			oldest = fnhe;
582 	}
583 	orig = rcu_dereference(oldest->fnhe_rth);
584 	if (orig) {
585 		RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
586 		rt_free(orig);
587 	}
588 	return oldest;
589 }
590 
591 static inline u32 fnhe_hashfun(__be32 daddr)
592 {
593 	u32 hval;
594 
595 	hval = (__force u32) daddr;
596 	hval ^= (hval >> 11) ^ (hval >> 22);
597 
598 	return hval & (FNHE_HASH_SIZE - 1);
599 }
600 
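/* A minimal userspace model of fnhe_hashfun() (illustrative sketch only;
 * FNHE_HASH_SIZE is a power of two, 2048 at this point in time):
 *
 *	uint32_t fnhe_hash(uint32_t daddr)	// __be32 bits used as-is
 *	{
 *		uint32_t hval = daddr;
 *		hval ^= (hval >> 11) ^ (hval >> 22);
 *		return hval & (FNHE_HASH_SIZE - 1);
 *	}
 *
 * The xor-shift fold lets all 32 destination bits influence the low bits,
 * which are all the mask keeps.
 */
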
601 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
602 				  u32 pmtu, unsigned long expires)
603 {
604 	struct fnhe_hash_bucket *hash;
605 	struct fib_nh_exception *fnhe;
606 	int depth;
607 	u32 hval = fnhe_hashfun(daddr);
608 
609 	spin_lock_bh(&fnhe_lock);
610 
611 	hash = nh->nh_exceptions;
612 	if (!hash) {
613 		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
614 		if (!hash)
615 			goto out_unlock;
616 		nh->nh_exceptions = hash;
617 	}
618 
619 	hash += hval;
620 
621 	depth = 0;
622 	for (fnhe = rcu_dereference(hash->chain); fnhe;
623 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
624 		if (fnhe->fnhe_daddr == daddr)
625 			break;
626 		depth++;
627 	}
628 
629 	if (fnhe) {
630 		if (gw)
631 			fnhe->fnhe_gw = gw;
632 		if (pmtu) {
633 			fnhe->fnhe_pmtu = pmtu;
634 			fnhe->fnhe_expires = expires;
635 		}
636 	} else {
637 		if (depth > FNHE_RECLAIM_DEPTH)
638 			fnhe = fnhe_oldest(hash);
639 		else {
640 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
641 			if (!fnhe)
642 				goto out_unlock;
643 
644 			fnhe->fnhe_next = hash->chain;
645 			rcu_assign_pointer(hash->chain, fnhe);
646 		}
647 		fnhe->fnhe_daddr = daddr;
648 		fnhe->fnhe_gw = gw;
649 		fnhe->fnhe_pmtu = pmtu;
650 		fnhe->fnhe_expires = expires;
651 	}
652 
653 	fnhe->fnhe_stamp = jiffies;
654 
655 out_unlock:
656 	spin_unlock_bh(&fnhe_lock);
657 	return;
658 }
659 
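/* Usage sketch (both call sites appear later in this file): a redirect
 * records only the new gateway, while a PMTU event records only the new
 * MTU together with an expiry time:
 *
 *	update_or_create_fnhe(nh, fl4->daddr, new_gw, 0, 0);
 *	update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 *			      jiffies + ip_rt_mtu_expires);
 */
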
660 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
661 			     bool kill_route)
662 {
663 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
664 	__be32 old_gw = ip_hdr(skb)->saddr;
665 	struct net_device *dev = skb->dev;
666 	struct in_device *in_dev;
667 	struct fib_result res;
668 	struct neighbour *n;
669 	struct net *net;
670 
671 	switch (icmp_hdr(skb)->code & 7) {
672 	case ICMP_REDIR_NET:
673 	case ICMP_REDIR_NETTOS:
674 	case ICMP_REDIR_HOST:
675 	case ICMP_REDIR_HOSTTOS:
676 		break;
677 
678 	default:
679 		return;
680 	}
681 
682 	if (rt->rt_gateway != old_gw)
683 		return;
684 
685 	in_dev = __in_dev_get_rcu(dev);
686 	if (!in_dev)
687 		return;
688 
689 	net = dev_net(dev);
690 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
691 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
692 	    ipv4_is_zeronet(new_gw))
693 		goto reject_redirect;
694 
695 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
696 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
697 			goto reject_redirect;
698 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
699 			goto reject_redirect;
700 	} else {
701 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
702 			goto reject_redirect;
703 	}
704 
705 	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
706 	if (n) {
707 		if (!(n->nud_state & NUD_VALID)) {
708 			neigh_event_send(n, NULL);
709 		} else {
710 			if (fib_lookup(net, fl4, &res) == 0) {
711 				struct fib_nh *nh = &FIB_RES_NH(res);
712 
713 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
714 						      0, 0);
715 			}
716 			if (kill_route)
717 				rt->dst.obsolete = DST_OBSOLETE_KILL;
718 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
719 		}
720 		neigh_release(n);
721 	}
722 	return;
723 
724 reject_redirect:
725 #ifdef CONFIG_IP_ROUTE_VERBOSE
726 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
727 		const struct iphdr *iph = (const struct iphdr *) skb->data;
728 		__be32 daddr = iph->daddr;
729 		__be32 saddr = iph->saddr;
730 
731 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
732 				     "  Advised path = %pI4 -> %pI4\n",
733 				     &old_gw, dev->name, &new_gw,
734 				     &saddr, &daddr);
735 	}
736 #endif
737 	;
738 }
739 
740 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
741 {
742 	struct rtable *rt;
743 	struct flowi4 fl4;
744 
745 	rt = (struct rtable *) dst;
746 
747 	ip_rt_build_flow_key(&fl4, sk, skb);
748 	__ip_do_redirect(rt, skb, &fl4, true);
749 }
750 
751 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
752 {
753 	struct rtable *rt = (struct rtable *)dst;
754 	struct dst_entry *ret = dst;
755 
756 	if (rt) {
757 		if (dst->obsolete > 0) {
758 			ip_rt_put(rt);
759 			ret = NULL;
760 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
761 			   rt->dst.expires) {
762 			ip_rt_put(rt);
763 			ret = NULL;
764 		}
765 	}
766 	return ret;
767 }
768 
769 /*
770  * Algorithm:
771  *	1. The first ip_rt_redirect_number redirects are sent
772  *	   with exponential backoff, then we stop sending them altogether,
773  *	   assuming that the host ignores our redirects.
774  *	2. If we did not see packets requiring redirects
775  *	   during ip_rt_redirect_silence, we assume that the host
776  *	   has forgotten the redirected route and start sending redirects again.
777  *
778  * This algorithm is much cheaper and more intelligent than dumb load limiting
779  * in icmp.c.
780  *
781  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
782  * and "frag. need" (breaks PMTU discovery) in icmp.c.
783  */
784 
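/* Added worked example, assuming HZ=1000 and the defaults above:
 * ip_rt_redirect_load is 20 jiffies (20ms) and ip_rt_redirect_silence is
 * 20 << 10 = 20480 jiffies (~20.5s).  After the first one, a redirect is
 * emitted only when
 *
 *	time_after(jiffies, rate_last + (20 << rate_tokens))
 *
 * so successive redirects are spaced ~40ms, 80ms, ... up to ~5.1s apart;
 * after ip_rt_redirect_number (9) unanswered redirects we go quiet until
 * the peer triggers nothing redirect-worthy for ~20.5s.
 */
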
785 void ip_rt_send_redirect(struct sk_buff *skb)
786 {
787 	struct rtable *rt = skb_rtable(skb);
788 	struct in_device *in_dev;
789 	struct inet_peer *peer;
790 	struct net *net;
791 	int log_martians;
792 
793 	rcu_read_lock();
794 	in_dev = __in_dev_get_rcu(rt->dst.dev);
795 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
796 		rcu_read_unlock();
797 		return;
798 	}
799 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
800 	rcu_read_unlock();
801 
802 	net = dev_net(rt->dst.dev);
803 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
804 	if (!peer) {
805 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
806 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
807 		return;
808 	}
809 
810 	/* No redirected packets during ip_rt_redirect_silence;
811 	 * reset the algorithm.
812 	 */
813 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
814 		peer->rate_tokens = 0;
815 
816 	/* Too many ignored redirects; do not send anything.
817 	 * Set peer->rate_last to the time of the last seen redirected packet.
818 	 */
819 	if (peer->rate_tokens >= ip_rt_redirect_number) {
820 		peer->rate_last = jiffies;
821 		goto out_put_peer;
822 	}
823 
824 	/* Check for load limit; set rate_last to the latest sent
825 	 * redirect.
826 	 */
827 	if (peer->rate_tokens == 0 ||
828 	    time_after(jiffies,
829 		       (peer->rate_last +
830 			(ip_rt_redirect_load << peer->rate_tokens)))) {
831 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
832 
833 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
834 		peer->rate_last = jiffies;
835 		++peer->rate_tokens;
836 #ifdef CONFIG_IP_ROUTE_VERBOSE
837 		if (log_martians &&
838 		    peer->rate_tokens == ip_rt_redirect_number)
839 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
840 					     &ip_hdr(skb)->saddr, inet_iif(skb),
841 					     &ip_hdr(skb)->daddr, &gw);
842 #endif
843 	}
844 out_put_peer:
845 	inet_putpeer(peer);
846 }
847 
848 static int ip_error(struct sk_buff *skb)
849 {
850 	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
851 	struct rtable *rt = skb_rtable(skb);
852 	struct inet_peer *peer;
853 	unsigned long now;
854 	struct net *net;
855 	bool send;
856 	int code;
857 
858 	net = dev_net(rt->dst.dev);
859 	if (!in_dev || !IN_DEV_FORWARD(in_dev)) {
860 		switch (rt->dst.error) {
861 		case EHOSTUNREACH:
862 			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
863 			break;
864 
865 		case ENETUNREACH:
866 			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
867 			break;
868 		}
869 		goto out;
870 	}
871 
872 	switch (rt->dst.error) {
873 	case EINVAL:
874 	default:
875 		goto out;
876 	case EHOSTUNREACH:
877 		code = ICMP_HOST_UNREACH;
878 		break;
879 	case ENETUNREACH:
880 		code = ICMP_NET_UNREACH;
881 		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
882 		break;
883 	case EACCES:
884 		code = ICMP_PKT_FILTERED;
885 		break;
886 	}
887 
888 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
889 
890 	send = true;
891 	if (peer) {
892 		now = jiffies;
893 		peer->rate_tokens += now - peer->rate_last;
894 		if (peer->rate_tokens > ip_rt_error_burst)
895 			peer->rate_tokens = ip_rt_error_burst;
896 		peer->rate_last = now;
897 		if (peer->rate_tokens >= ip_rt_error_cost)
898 			peer->rate_tokens -= ip_rt_error_cost;
899 		else
900 			send = false;
901 		inet_putpeer(peer);
902 	}
903 	if (send)
904 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
905 
906 out:	kfree_skb(skb);
907 	return 0;
908 }
909 
910 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
911 {
912 	struct dst_entry *dst = &rt->dst;
913 	struct fib_result res;
914 
915 	if (dst->dev->mtu < mtu)
916 		return;
917 
918 	if (mtu < ip_rt_min_pmtu)
919 		mtu = ip_rt_min_pmtu;
920 
921 	if (!rt->rt_pmtu) {
922 		dst->obsolete = DST_OBSOLETE_KILL;
923 	} else {
924 		rt->rt_pmtu = mtu;
925 		dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
926 	}
927 
928 	rcu_read_lock();
929 	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
930 		struct fib_nh *nh = &FIB_RES_NH(res);
931 
932 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
933 				      jiffies + ip_rt_mtu_expires);
934 	}
935 	rcu_read_unlock();
936 }
937 
938 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
939 			      struct sk_buff *skb, u32 mtu)
940 {
941 	struct rtable *rt = (struct rtable *) dst;
942 	struct flowi4 fl4;
943 
944 	ip_rt_build_flow_key(&fl4, sk, skb);
945 	__ip_rt_update_pmtu(rt, &fl4, mtu);
946 }
947 
948 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
949 		      int oif, u32 mark, u8 protocol, int flow_flags)
950 {
951 	const struct iphdr *iph = (const struct iphdr *) skb->data;
952 	struct flowi4 fl4;
953 	struct rtable *rt;
954 
955 	__build_flow_key(&fl4, NULL, iph, oif,
956 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
957 	rt = __ip_route_output_key(net, &fl4);
958 	if (!IS_ERR(rt)) {
959 		__ip_rt_update_pmtu(rt, &fl4, mtu);
960 		ip_rt_put(rt);
961 	}
962 }
963 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
964 
965 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
966 {
967 	const struct iphdr *iph = (const struct iphdr *) skb->data;
968 	struct flowi4 fl4;
969 	struct rtable *rt;
970 
971 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
972 	rt = __ip_route_output_key(sock_net(sk), &fl4);
973 	if (!IS_ERR(rt)) {
974 		__ip_rt_update_pmtu(rt, &fl4, mtu);
975 		ip_rt_put(rt);
976 	}
977 }
978 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
979 
980 void ipv4_redirect(struct sk_buff *skb, struct net *net,
981 		   int oif, u32 mark, u8 protocol, int flow_flags)
982 {
983 	const struct iphdr *iph = (const struct iphdr *) skb->data;
984 	struct flowi4 fl4;
985 	struct rtable *rt;
986 
987 	__build_flow_key(&fl4, NULL, iph, oif,
988 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
989 	rt = __ip_route_output_key(net, &fl4);
990 	if (!IS_ERR(rt)) {
991 		__ip_do_redirect(rt, skb, &fl4, false);
992 		ip_rt_put(rt);
993 	}
994 }
995 EXPORT_SYMBOL_GPL(ipv4_redirect);
996 
997 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
998 {
999 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1000 	struct flowi4 fl4;
1001 	struct rtable *rt;
1002 
1003 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1004 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1005 	if (!IS_ERR(rt)) {
1006 		__ip_do_redirect(rt, skb, &fl4, false);
1007 		ip_rt_put(rt);
1008 	}
1009 }
1010 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1011 
1012 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1013 {
1014 	struct rtable *rt = (struct rtable *) dst;
1015 
1016 	/* All IPv4 dsts are created with ->obsolete set to
1017 	 * DST_OBSOLETE_FORCE_CHK, which forces validation calls down
1018 	 * into this function always.
1019 	 *
1020 	 * When a PMTU/redirect information update invalidates a
1021 	 * route, this is indicated by setting obsolete to
1022 	 * DST_OBSOLETE_KILL.
1023 	 */
1024 	if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1025 		return NULL;
1026 	return dst;
1027 }
1028 
1029 static void ipv4_link_failure(struct sk_buff *skb)
1030 {
1031 	struct rtable *rt;
1032 
1033 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1034 
1035 	rt = skb_rtable(skb);
1036 	if (rt)
1037 		dst_set_expires(&rt->dst, 0);
1038 }
1039 
1040 static int ip_rt_bug(struct sk_buff *skb)
1041 {
1042 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1043 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1044 		 skb->dev ? skb->dev->name : "?");
1045 	kfree_skb(skb);
1046 	WARN_ON(1);
1047 	return 0;
1048 }
1049 
1050 /*
1051    We do not cache the source address of the outgoing interface,
1052    because it is used only by the IP RR, TS and SRR options,
1053    so it is out of the fast path.
1054 
1055    BTW remember: "addr" may be unaligned
1056    in IP options!
1057  */
1058 
1059 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1060 {
1061 	__be32 src;
1062 
1063 	if (rt_is_output_route(rt))
1064 		src = ip_hdr(skb)->saddr;
1065 	else {
1066 		struct fib_result res;
1067 		struct flowi4 fl4;
1068 		struct iphdr *iph;
1069 
1070 		iph = ip_hdr(skb);
1071 
1072 		memset(&fl4, 0, sizeof(fl4));
1073 		fl4.daddr = iph->daddr;
1074 		fl4.saddr = iph->saddr;
1075 		fl4.flowi4_tos = RT_TOS(iph->tos);
1076 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1077 		fl4.flowi4_iif = skb->dev->ifindex;
1078 		fl4.flowi4_mark = skb->mark;
1079 
1080 		rcu_read_lock();
1081 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1082 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1083 		else
1084 			src = inet_select_addr(rt->dst.dev,
1085 					       rt_nexthop(rt, iph->daddr),
1086 					       RT_SCOPE_UNIVERSE);
1087 		rcu_read_unlock();
1088 	}
1089 	memcpy(addr, &src, 4);
1090 }
1091 
1092 #ifdef CONFIG_IP_ROUTE_CLASSID
1093 static void set_class_tag(struct rtable *rt, u32 tag)
1094 {
1095 	if (!(rt->dst.tclassid & 0xFFFF))
1096 		rt->dst.tclassid |= tag & 0xFFFF;
1097 	if (!(rt->dst.tclassid & 0xFFFF0000))
1098 		rt->dst.tclassid |= tag & 0xFFFF0000;
1099 }
1100 #endif
1101 
1102 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1103 {
1104 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1105 
1106 	if (advmss == 0) {
1107 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1108 			       ip_rt_min_advmss);
1109 		if (advmss > 65535 - 40)
1110 			advmss = 65535 - 40;
1111 	}
1112 	return advmss;
1113 }
1114 
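/* Added example: with no RTAX_ADVMSS metric set, a 1500-byte device MTU
 * yields an advertised MSS of 1500 - 40 = 1460; the result is floored at
 * ip_rt_min_advmss (256) and capped at 65535 - 40.
 */
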
1115 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1116 {
1117 	const struct rtable *rt = (const struct rtable *) dst;
1118 	unsigned int mtu = rt->rt_pmtu;
1119 
1120 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1121 		mtu = dst_metric_raw(dst, RTAX_MTU);
1122 
1123 	if (mtu && rt_is_output_route(rt))
1124 		return mtu;
1125 
1126 	mtu = dst->dev->mtu;
1127 
1128 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1129 		if (rt->rt_uses_gateway && mtu > 576)
1130 			mtu = 576;
1131 	}
1132 
1133 	if (mtu > IP_MAX_MTU)
1134 		mtu = IP_MAX_MTU;
1135 
1136 	return mtu;
1137 }
1138 
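/* Added summary of the selection order above: an unexpired learned PMTU,
 * else the RTAX_MTU metric, is returned as-is, but only for output routes.
 * Everything else falls back to the device MTU, clamped to 576 for
 * locked-metric routes via a gateway and to IP_MAX_MTU in all cases.
 */
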
1139 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1140 {
1141 	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1142 	struct fib_nh_exception *fnhe;
1143 	u32 hval;
1144 
1145 	if (!hash)
1146 		return NULL;
1147 
1148 	hval = fnhe_hashfun(daddr);
1149 
1150 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1151 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1152 		if (fnhe->fnhe_daddr == daddr)
1153 			return fnhe;
1154 	}
1155 	return NULL;
1156 }
1157 
1158 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1159 			      __be32 daddr)
1160 {
1161 	bool ret = false;
1162 
1163 	spin_lock_bh(&fnhe_lock);
1164 
1165 	if (daddr == fnhe->fnhe_daddr) {
1166 		struct rtable *orig;
1167 
1168 		if (fnhe->fnhe_pmtu) {
1169 			unsigned long expires = fnhe->fnhe_expires;
1170 			unsigned long diff = expires - jiffies;
1171 
1172 			if (time_before(jiffies, expires)) {
1173 				rt->rt_pmtu = fnhe->fnhe_pmtu;
1174 				dst_set_expires(&rt->dst, diff);
1175 			}
1176 		}
1177 		if (fnhe->fnhe_gw) {
1178 			rt->rt_flags |= RTCF_REDIRECTED;
1179 			rt->rt_gateway = fnhe->fnhe_gw;
1180 			rt->rt_uses_gateway = 1;
1181 		} else if (!rt->rt_gateway)
1182 			rt->rt_gateway = daddr;
1183 
1184 		orig = rcu_dereference(fnhe->fnhe_rth);
1185 		rcu_assign_pointer(fnhe->fnhe_rth, rt);
1186 		if (orig)
1187 			rt_free(orig);
1188 
1189 		fnhe->fnhe_stamp = jiffies;
1190 		ret = true;
1191 	}
1192 	spin_unlock_bh(&fnhe_lock);
1193 
1194 	return ret;
1195 }
1196 
1197 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1198 {
1199 	struct rtable *orig, *prev, **p;
1200 	bool ret = true;
1201 
1202 	if (rt_is_input_route(rt)) {
1203 		p = (struct rtable **)&nh->nh_rth_input;
1204 	} else {
1205 		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1206 	}
1207 	orig = *p;
1208 
1209 	prev = cmpxchg(p, orig, rt);
1210 	if (prev == orig) {
1211 		if (orig)
1212 			rt_free(orig);
1213 	} else
1214 		ret = false;
1215 
1216 	return ret;
1217 }
1218 
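/* Added note: the cmpxchg() in rt_cache_route() above is a lockless
 * publish.  The new route is installed only if *p still holds the value
 * sampled into orig; on success the displaced route is freed via RCU, and
 * on a lost race the caller marks the route DST_NOCACHE rather than retry.
 */
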
1219 static DEFINE_SPINLOCK(rt_uncached_lock);
1220 static LIST_HEAD(rt_uncached_list);
1221 
1222 static void rt_add_uncached_list(struct rtable *rt)
1223 {
1224 	spin_lock_bh(&rt_uncached_lock);
1225 	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1226 	spin_unlock_bh(&rt_uncached_lock);
1227 }
1228 
1229 static void ipv4_dst_destroy(struct dst_entry *dst)
1230 {
1231 	struct rtable *rt = (struct rtable *) dst;
1232 
1233 	if (!list_empty(&rt->rt_uncached)) {
1234 		spin_lock_bh(&rt_uncached_lock);
1235 		list_del(&rt->rt_uncached);
1236 		spin_unlock_bh(&rt_uncached_lock);
1237 	}
1238 }
1239 
1240 void rt_flush_dev(struct net_device *dev)
1241 {
1242 	if (!list_empty(&rt_uncached_list)) {
1243 		struct net *net = dev_net(dev);
1244 		struct rtable *rt;
1245 
1246 		spin_lock_bh(&rt_uncached_lock);
1247 		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1248 			if (rt->dst.dev != dev)
1249 				continue;
1250 			rt->dst.dev = net->loopback_dev;
1251 			dev_hold(rt->dst.dev);
1252 			dev_put(dev);
1253 		}
1254 		spin_unlock_bh(&rt_uncached_lock);
1255 	}
1256 }
1257 
1258 static bool rt_cache_valid(const struct rtable *rt)
1259 {
1260 	return	rt &&
1261 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1262 		!rt_is_expired(rt);
1263 }
1264 
1265 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1266 			   const struct fib_result *res,
1267 			   struct fib_nh_exception *fnhe,
1268 			   struct fib_info *fi, u16 type, u32 itag)
1269 {
1270 	bool cached = false;
1271 
1272 	if (fi) {
1273 		struct fib_nh *nh = &FIB_RES_NH(*res);
1274 
1275 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1276 			rt->rt_gateway = nh->nh_gw;
1277 			rt->rt_uses_gateway = 1;
1278 		}
1279 		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1280 #ifdef CONFIG_IP_ROUTE_CLASSID
1281 		rt->dst.tclassid = nh->nh_tclassid;
1282 #endif
1283 		if (unlikely(fnhe))
1284 			cached = rt_bind_exception(rt, fnhe, daddr);
1285 		else if (!(rt->dst.flags & DST_NOCACHE))
1286 			cached = rt_cache_route(nh, rt);
1287 		if (unlikely(!cached)) {
1288 			/* Routes we intend to cache in a nexthop exception or
1289 			 * FIB nexthop have the DST_NOCACHE bit clear.
1290 			 * However, if we are unsuccessful at storing this
1291 			 * route in the cache, we really need to set it.
1292 			 */
1293 			rt->dst.flags |= DST_NOCACHE;
1294 			if (!rt->rt_gateway)
1295 				rt->rt_gateway = daddr;
1296 			rt_add_uncached_list(rt);
1297 		}
1298 	} else
1299 		rt_add_uncached_list(rt);
1300 
1301 #ifdef CONFIG_IP_ROUTE_CLASSID
1302 #ifdef CONFIG_IP_MULTIPLE_TABLES
1303 	set_class_tag(rt, res->tclassid);
1304 #endif
1305 	set_class_tag(rt, itag);
1306 #endif
1307 }
1308 
1309 static struct rtable *rt_dst_alloc(struct net_device *dev,
1310 				   bool nopolicy, bool noxfrm, bool will_cache)
1311 {
1312 	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1313 			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1314 			 (nopolicy ? DST_NOPOLICY : 0) |
1315 			 (noxfrm ? DST_NOXFRM : 0));
1316 }
1317 
1318 /* called in rcu_read_lock() section */
1319 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1320 				u8 tos, struct net_device *dev, int our)
1321 {
1322 	struct rtable *rth;
1323 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1324 	u32 itag = 0;
1325 	int err;
1326 
1327 	/* Primary sanity checks. */
1328 
1329 	if (in_dev == NULL)
1330 		return -EINVAL;
1331 
1332 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1333 	    skb->protocol != htons(ETH_P_IP))
1334 		goto e_inval;
1335 
1336 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1337 		if (ipv4_is_loopback(saddr))
1338 			goto e_inval;
1339 
1340 	if (ipv4_is_zeronet(saddr)) {
1341 		if (!ipv4_is_local_multicast(daddr))
1342 			goto e_inval;
1343 	} else {
1344 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1345 					  in_dev, &itag);
1346 		if (err < 0)
1347 			goto e_err;
1348 	}
1349 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1350 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1351 	if (!rth)
1352 		goto e_nobufs;
1353 
1354 #ifdef CONFIG_IP_ROUTE_CLASSID
1355 	rth->dst.tclassid = itag;
1356 #endif
1357 	rth->dst.output = ip_rt_bug;
1358 
1359 	rth->rt_genid	= rt_genid(dev_net(dev));
1360 	rth->rt_flags	= RTCF_MULTICAST;
1361 	rth->rt_type	= RTN_MULTICAST;
1362 	rth->rt_is_input= 1;
1363 	rth->rt_iif	= 0;
1364 	rth->rt_pmtu	= 0;
1365 	rth->rt_gateway	= 0;
1366 	rth->rt_uses_gateway = 0;
1367 	INIT_LIST_HEAD(&rth->rt_uncached);
1368 	if (our) {
1369 		rth->dst.input= ip_local_deliver;
1370 		rth->rt_flags |= RTCF_LOCAL;
1371 	}
1372 
1373 #ifdef CONFIG_IP_MROUTE
1374 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1375 		rth->dst.input = ip_mr_input;
1376 #endif
1377 	RT_CACHE_STAT_INC(in_slow_mc);
1378 
1379 	skb_dst_set(skb, &rth->dst);
1380 	return 0;
1381 
1382 e_nobufs:
1383 	return -ENOBUFS;
1384 e_inval:
1385 	return -EINVAL;
1386 e_err:
1387 	return err;
1388 }
1389 
1390 
1391 static void ip_handle_martian_source(struct net_device *dev,
1392 				     struct in_device *in_dev,
1393 				     struct sk_buff *skb,
1394 				     __be32 daddr,
1395 				     __be32 saddr)
1396 {
1397 	RT_CACHE_STAT_INC(in_martian_src);
1398 #ifdef CONFIG_IP_ROUTE_VERBOSE
1399 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1400 		/*
1401 		 *	RFC1812 recommendation: if the source is martian,
1402 		 *	the only hint is the MAC header.
1403 		 */
1404 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1405 			&daddr, &saddr, dev->name);
1406 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1407 			print_hex_dump(KERN_WARNING, "ll header: ",
1408 				       DUMP_PREFIX_OFFSET, 16, 1,
1409 				       skb_mac_header(skb),
1410 				       dev->hard_header_len, true);
1411 		}
1412 	}
1413 #endif
1414 }
1415 
1416 /* called in rcu_read_lock() section */
1417 static int __mkroute_input(struct sk_buff *skb,
1418 			   const struct fib_result *res,
1419 			   struct in_device *in_dev,
1420 			   __be32 daddr, __be32 saddr, u32 tos)
1421 {
1422 	struct rtable *rth;
1423 	int err;
1424 	struct in_device *out_dev;
1425 	unsigned int flags = 0;
1426 	bool do_cache;
1427 	u32 itag;
1428 
1429 	/* get a working reference to the output device */
1430 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1431 	if (out_dev == NULL) {
1432 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1433 		return -EINVAL;
1434 	}
1435 
1436 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1437 				  in_dev->dev, in_dev, &itag);
1438 	if (err < 0) {
1439 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1440 					 saddr);
1441 
1442 		goto cleanup;
1443 	}
1444 
1445 	do_cache = res->fi && !itag;
1446 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1447 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1448 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1449 		flags |= RTCF_DOREDIRECT;
1450 		do_cache = false;
1451 	}
1452 
1453 	if (skb->protocol != htons(ETH_P_IP)) {
1454 		/* Not IP (i.e. ARP). Do not create a route if it is
1455 		 * invalid for proxy arp. DNAT routes are always valid.
1456 		 *
1457 		 * The proxy arp feature has been extended to allow ARP
1458 		 * replies back on the same interface, to support
1459 		 * Private VLAN switch technologies. See arp.c.
1460 		 */
1461 		if (out_dev == in_dev &&
1462 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1463 			err = -EINVAL;
1464 			goto cleanup;
1465 		}
1466 	}
1467 
1468 	if (do_cache) {
1469 		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1470 		if (rt_cache_valid(rth)) {
1471 			skb_dst_set_noref(skb, &rth->dst);
1472 			goto out;
1473 		}
1474 	}
1475 
1476 	rth = rt_dst_alloc(out_dev->dev,
1477 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1478 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1479 	if (!rth) {
1480 		err = -ENOBUFS;
1481 		goto cleanup;
1482 	}
1483 
1484 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1485 	rth->rt_flags = flags;
1486 	rth->rt_type = res->type;
1487 	rth->rt_is_input = 1;
1488 	rth->rt_iif 	= 0;
1489 	rth->rt_pmtu	= 0;
1490 	rth->rt_gateway	= 0;
1491 	rth->rt_uses_gateway = 0;
1492 	INIT_LIST_HEAD(&rth->rt_uncached);
1493 
1494 	rth->dst.input = ip_forward;
1495 	rth->dst.output = ip_output;
1496 
1497 	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1498 	skb_dst_set(skb, &rth->dst);
1499 out:
1500 	err = 0;
1501  cleanup:
1502 	return err;
1503 }
1504 
1505 static int ip_mkroute_input(struct sk_buff *skb,
1506 			    struct fib_result *res,
1507 			    const struct flowi4 *fl4,
1508 			    struct in_device *in_dev,
1509 			    __be32 daddr, __be32 saddr, u32 tos)
1510 {
1511 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1512 	if (res->fi && res->fi->fib_nhs > 1)
1513 		fib_select_multipath(res);
1514 #endif
1515 
1516 	/* create a routing cache entry */
1517 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1518 }
1519 
1520 /*
1521  *	NOTE. We drop all packets that have a local source
1522  *	address, because every properly looped-back packet
1523  *	must already have the correct destination attached by the output routine.
1524  *
1525  *	This approach solves two big problems:
1526  *	1. Non-simplex devices are handled properly.
1527  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1528  *	Called with rcu_read_lock().
1529  */
1530 
1531 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1532 			       u8 tos, struct net_device *dev)
1533 {
1534 	struct fib_result res;
1535 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1536 	struct flowi4	fl4;
1537 	unsigned int	flags = 0;
1538 	u32		itag = 0;
1539 	struct rtable	*rth;
1540 	int		err = -EINVAL;
1541 	struct net    *net = dev_net(dev);
1542 	bool do_cache;
1543 
1544 	/* IP on this device is disabled. */
1545 
1546 	if (!in_dev)
1547 		goto out;
1548 
1549 	/* Check for the most weird martians, which cannot be detected
1550 	   by fib_lookup.
1551 	 */
1552 
1553 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1554 		goto martian_source;
1555 
1556 	res.fi = NULL;
1557 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1558 		goto brd_input;
1559 
1560 	/* Accept zero addresses only for limited broadcast;
1561 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1562 	 */
1563 	if (ipv4_is_zeronet(saddr))
1564 		goto martian_source;
1565 
1566 	if (ipv4_is_zeronet(daddr))
1567 		goto martian_destination;
1568 
1569 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1570 	 * and calls it at most once, if daddr and/or saddr are loopback addresses.
1571 	 */
1572 	if (ipv4_is_loopback(daddr)) {
1573 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1574 			goto martian_destination;
1575 	} else if (ipv4_is_loopback(saddr)) {
1576 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1577 			goto martian_source;
1578 	}
1579 
1580 	/*
1581 	 *	Now we are ready to route packet.
1582 	 */
1583 	fl4.flowi4_oif = 0;
1584 	fl4.flowi4_iif = dev->ifindex;
1585 	fl4.flowi4_mark = skb->mark;
1586 	fl4.flowi4_tos = tos;
1587 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1588 	fl4.daddr = daddr;
1589 	fl4.saddr = saddr;
1590 	err = fib_lookup(net, &fl4, &res);
1591 	if (err != 0)
1592 		goto no_route;
1593 
1594 	RT_CACHE_STAT_INC(in_slow_tot);
1595 
1596 	if (res.type == RTN_BROADCAST)
1597 		goto brd_input;
1598 
1599 	if (res.type == RTN_LOCAL) {
1600 		err = fib_validate_source(skb, saddr, daddr, tos,
1601 					  LOOPBACK_IFINDEX,
1602 					  dev, in_dev, &itag);
1603 		if (err < 0)
1604 			goto martian_source_keep_err;
1605 		goto local_input;
1606 	}
1607 
1608 	if (!IN_DEV_FORWARD(in_dev))
1609 		goto no_route;
1610 	if (res.type != RTN_UNICAST)
1611 		goto martian_destination;
1612 
1613 	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1614 out:	return err;
1615 
1616 brd_input:
1617 	if (skb->protocol != htons(ETH_P_IP))
1618 		goto e_inval;
1619 
1620 	if (!ipv4_is_zeronet(saddr)) {
1621 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1622 					  in_dev, &itag);
1623 		if (err < 0)
1624 			goto martian_source_keep_err;
1625 	}
1626 	flags |= RTCF_BROADCAST;
1627 	res.type = RTN_BROADCAST;
1628 	RT_CACHE_STAT_INC(in_brd);
1629 
1630 local_input:
1631 	do_cache = false;
1632 	if (res.fi) {
1633 		if (!itag) {
1634 			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1635 			if (rt_cache_valid(rth)) {
1636 				skb_dst_set_noref(skb, &rth->dst);
1637 				err = 0;
1638 				goto out;
1639 			}
1640 			do_cache = true;
1641 		}
1642 	}
1643 
1644 	rth = rt_dst_alloc(net->loopback_dev,
1645 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1646 	if (!rth)
1647 		goto e_nobufs;
1648 
1649 	rth->dst.input= ip_local_deliver;
1650 	rth->dst.output= ip_rt_bug;
1651 #ifdef CONFIG_IP_ROUTE_CLASSID
1652 	rth->dst.tclassid = itag;
1653 #endif
1654 
1655 	rth->rt_genid = rt_genid(net);
1656 	rth->rt_flags 	= flags|RTCF_LOCAL;
1657 	rth->rt_type	= res.type;
1658 	rth->rt_is_input = 1;
1659 	rth->rt_iif	= 0;
1660 	rth->rt_pmtu	= 0;
1661 	rth->rt_gateway	= 0;
1662 	rth->rt_uses_gateway = 0;
1663 	INIT_LIST_HEAD(&rth->rt_uncached);
1664 	if (res.type == RTN_UNREACHABLE) {
1665 		rth->dst.input= ip_error;
1666 		rth->dst.error= -err;
1667 		rth->rt_flags 	&= ~RTCF_LOCAL;
1668 	}
1669 	if (do_cache)
1670 		rt_cache_route(&FIB_RES_NH(res), rth);
1671 	skb_dst_set(skb, &rth->dst);
1672 	err = 0;
1673 	goto out;
1674 
1675 no_route:
1676 	RT_CACHE_STAT_INC(in_no_route);
1677 	res.type = RTN_UNREACHABLE;
1678 	if (err == -ESRCH)
1679 		err = -ENETUNREACH;
1680 	goto local_input;
1681 
1682 	/*
1683 	 *	Do not cache martian addresses: they should be logged (RFC1812)
1684 	 */
1685 martian_destination:
1686 	RT_CACHE_STAT_INC(in_martian_dst);
1687 #ifdef CONFIG_IP_ROUTE_VERBOSE
1688 	if (IN_DEV_LOG_MARTIANS(in_dev))
1689 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1690 				     &daddr, &saddr, dev->name);
1691 #endif
1692 
1693 e_inval:
1694 	err = -EINVAL;
1695 	goto out;
1696 
1697 e_nobufs:
1698 	err = -ENOBUFS;
1699 	goto out;
1700 
1701 martian_source:
1702 	err = -EINVAL;
1703 martian_source_keep_err:
1704 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1705 	goto out;
1706 }
1707 
1708 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1709 			 u8 tos, struct net_device *dev)
1710 {
1711 	int res;
1712 
1713 	rcu_read_lock();
1714 
1715 	/* Multicast recognition logic was moved from the route cache to here.
1716 	   The problem was that too many Ethernet cards have broken/missing
1717 	   hardware multicast filters :-( As a result, a host on a multicast
1718 	   network acquires a lot of useless route cache entries, e.g. from
1719 	   SDR messages from all over the world. Now we try to get rid of them.
1720 	   Really, provided the software IP multicast filter is organized
1721 	   reasonably (at least, hashed), it does not result in a slowdown
1722 	   compared with route cache reject entries.
1723 	   Note that multicast routers are not affected, because
1724 	   a route cache entry is created eventually.
1725 	 */
1726 	if (ipv4_is_multicast(daddr)) {
1727 		struct in_device *in_dev = __in_dev_get_rcu(dev);
1728 
1729 		if (in_dev) {
1730 			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1731 						  ip_hdr(skb)->protocol);
1732 			if (our
1733 #ifdef CONFIG_IP_MROUTE
1734 				||
1735 			    (!ipv4_is_local_multicast(daddr) &&
1736 			     IN_DEV_MFORWARD(in_dev))
1737 #endif
1738 			   ) {
1739 				int res = ip_route_input_mc(skb, daddr, saddr,
1740 							    tos, dev, our);
1741 				rcu_read_unlock();
1742 				return res;
1743 			}
1744 		}
1745 		rcu_read_unlock();
1746 		return -EINVAL;
1747 	}
1748 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1749 	rcu_read_unlock();
1750 	return res;
1751 }
1752 EXPORT_SYMBOL(ip_route_input_noref);
1753 
1754 /* called with rcu_read_lock() */
1755 static struct rtable *__mkroute_output(const struct fib_result *res,
1756 				       const struct flowi4 *fl4, int orig_oif,
1757 				       struct net_device *dev_out,
1758 				       unsigned int flags)
1759 {
1760 	struct fib_info *fi = res->fi;
1761 	struct fib_nh_exception *fnhe;
1762 	struct in_device *in_dev;
1763 	u16 type = res->type;
1764 	struct rtable *rth;
1765 	bool do_cache;
1766 
1767 	in_dev = __in_dev_get_rcu(dev_out);
1768 	if (!in_dev)
1769 		return ERR_PTR(-EINVAL);
1770 
1771 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1772 		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1773 			return ERR_PTR(-EINVAL);
1774 
1775 	if (ipv4_is_lbcast(fl4->daddr))
1776 		type = RTN_BROADCAST;
1777 	else if (ipv4_is_multicast(fl4->daddr))
1778 		type = RTN_MULTICAST;
1779 	else if (ipv4_is_zeronet(fl4->daddr))
1780 		return ERR_PTR(-EINVAL);
1781 
1782 	if (dev_out->flags & IFF_LOOPBACK)
1783 		flags |= RTCF_LOCAL;
1784 
1785 	if (type == RTN_BROADCAST) {
1786 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
1787 		fi = NULL;
1788 	} else if (type == RTN_MULTICAST) {
1789 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
1790 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1791 				     fl4->flowi4_proto))
1792 			flags &= ~RTCF_LOCAL;
1793 		/* If a multicast route does not exist, use
1794 		 * the default one, but do not gateway in this case.
1795 		 * Yes, it is a hack.
1796 		 */
1797 		if (fi && res->prefixlen < 4)
1798 			fi = NULL;
1799 	}
1800 
1801 	fnhe = NULL;
1802 	do_cache = fi != NULL;
1803 	if (fi) {
1804 		struct rtable __rcu **prth;
1805 		struct fib_nh *nh = &FIB_RES_NH(*res);
1806 
1807 		fnhe = find_exception(nh, fl4->daddr);
1808 		if (fnhe)
1809 			prth = &fnhe->fnhe_rth;
1810 		else {
1811 			if (unlikely(fl4->flowi4_flags &
1812 				     FLOWI_FLAG_KNOWN_NH &&
1813 				     !(nh->nh_gw &&
1814 				       nh->nh_scope == RT_SCOPE_LINK))) {
1815 				do_cache = false;
1816 				goto add;
1817 			}
1818 			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1819 		}
1820 		rth = rcu_dereference(*prth);
1821 		if (rt_cache_valid(rth)) {
1822 			dst_hold(&rth->dst);
1823 			return rth;
1824 		}
1825 	}
1826 
1827 add:
1828 	rth = rt_dst_alloc(dev_out,
1829 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1830 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
1831 			   do_cache);
1832 	if (!rth)
1833 		return ERR_PTR(-ENOBUFS);
1834 
1835 	rth->dst.output = ip_output;
1836 
1837 	rth->rt_genid = rt_genid(dev_net(dev_out));
1838 	rth->rt_flags	= flags;
1839 	rth->rt_type	= type;
1840 	rth->rt_is_input = 0;
1841 	rth->rt_iif	= orig_oif ? : 0;
1842 	rth->rt_pmtu	= 0;
1843 	rth->rt_gateway = 0;
1844 	rth->rt_uses_gateway = 0;
1845 	INIT_LIST_HEAD(&rth->rt_uncached);
1846 
1847 	RT_CACHE_STAT_INC(out_slow_tot);
1848 
1849 	if (flags & RTCF_LOCAL)
1850 		rth->dst.input = ip_local_deliver;
1851 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1852 		if (flags & RTCF_LOCAL &&
1853 		    !(dev_out->flags & IFF_LOOPBACK)) {
1854 			rth->dst.output = ip_mc_output;
1855 			RT_CACHE_STAT_INC(out_slow_mc);
1856 		}
1857 #ifdef CONFIG_IP_MROUTE
1858 		if (type == RTN_MULTICAST) {
1859 			if (IN_DEV_MFORWARD(in_dev) &&
1860 			    !ipv4_is_local_multicast(fl4->daddr)) {
1861 				rth->dst.input = ip_mr_input;
1862 				rth->dst.output = ip_mc_output;
1863 			}
1864 		}
1865 #endif
1866 	}
1867 
1868 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1869 
1870 	return rth;
1871 }
1872 
1873 /*
1874  * Major route resolver routine.
1875  */
1876 
1877 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1878 {
1879 	struct net_device *dev_out = NULL;
1880 	__u8 tos = RT_FL_TOS(fl4);
1881 	unsigned int flags = 0;
1882 	struct fib_result res;
1883 	struct rtable *rth;
1884 	int orig_oif;
1885 
1886 	res.tclassid	= 0;
1887 	res.fi		= NULL;
1888 	res.table	= NULL;
1889 
1890 	orig_oif = fl4->flowi4_oif;
1891 
1892 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
1893 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1894 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1895 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1896 
1897 	rcu_read_lock();
1898 	if (fl4->saddr) {
1899 		rth = ERR_PTR(-EINVAL);
1900 		if (ipv4_is_multicast(fl4->saddr) ||
1901 		    ipv4_is_lbcast(fl4->saddr) ||
1902 		    ipv4_is_zeronet(fl4->saddr))
1903 			goto out;
1904 
1905 		/* I removed the check for oif == dev_out->oif here.
1906 		   It was wrong for two reasons:
1907 		   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
1908 		      is assigned to multiple interfaces.
1909 		   2. Moreover, we are allowed to send packets with the saddr
1910 		      of another iface. --ANK
1911 		 */
1912 
1913 		if (fl4->flowi4_oif == 0 &&
1914 		    (ipv4_is_multicast(fl4->daddr) ||
1915 		     ipv4_is_lbcast(fl4->daddr))) {
1916 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1917 			dev_out = __ip_dev_find(net, fl4->saddr, false);
1918 			if (dev_out == NULL)
1919 				goto out;
1920 
1921 			/* Special hack: the user can direct multicasts
1922 			   and limited broadcast via the necessary interface
1923 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1924 			   This hack is not just for fun, it allows
1925 			   vic, vat and friends to work.
1926 			   They bind the socket to loopback, set ttl to zero
1927 			   and expect that it will work.
1928 			   From the viewpoint of the routing cache they are broken,
1929 			   because we are not allowed to build a multicast path
1930 			   with a loopback source addr (look, the routing cache
1931 			   cannot know that ttl is zero, so the packet
1932 			   will not leave this host and the route is valid).
1933 			   Luckily, this hack is a good workaround.
1934 			 */
1935 
1936 			fl4->flowi4_oif = dev_out->ifindex;
1937 			goto make_route;
1938 		}
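		/* For illustration only: a minimal userspace sketch of the
		 * hack described above (headers and error handling omitted,
		 * addresses hypothetical):
		 *
		 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
		 *	struct sockaddr_in src = {
		 *		.sin_family = AF_INET,
		 *		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
		 *	};
		 *	unsigned char ttl = 0;
		 *
		 *	bind(fd, (struct sockaddr *)&src, sizeof(src));
		 *	setsockopt(fd, IPPROTO_IP, IP_MULTICAST_TTL,
		 *		   &ttl, sizeof(ttl));
		 *
		 * A subsequent sendto() to a multicast group, with no
		 * IP_MULTICAST_IF configured, resolves its oif from the
		 * bound saddr via the branch above.
		 */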
1939 
1940 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1941 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1942 			if (!__ip_dev_find(net, fl4->saddr, false))
1943 				goto out;
1944 		}
1945 	}
1946 
1947 
1948 	if (fl4->flowi4_oif) {
1949 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1950 		rth = ERR_PTR(-ENODEV);
1951 		if (dev_out == NULL)
1952 			goto out;
1953 
1954 		/* RACE: Check return value of inet_select_addr instead. */
1955 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1956 			rth = ERR_PTR(-ENETUNREACH);
1957 			goto out;
1958 		}
1959 		if (ipv4_is_local_multicast(fl4->daddr) ||
1960 		    ipv4_is_lbcast(fl4->daddr)) {
1961 			if (!fl4->saddr)
1962 				fl4->saddr = inet_select_addr(dev_out, 0,
1963 							      RT_SCOPE_LINK);
1964 			goto make_route;
1965 		}
1966 		if (!fl4->saddr) {
1967 			if (ipv4_is_multicast(fl4->daddr))
1968 				fl4->saddr = inet_select_addr(dev_out, 0,
1969 							      fl4->flowi4_scope);
1970 			else if (!fl4->daddr)
1971 				fl4->saddr = inet_select_addr(dev_out, 0,
1972 							      RT_SCOPE_HOST);
1973 		}
1974 	}
1975 
1976 	if (!fl4->daddr) {
1977 		fl4->daddr = fl4->saddr;
1978 		if (!fl4->daddr)
1979 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1980 		dev_out = net->loopback_dev;
1981 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
1982 		res.type = RTN_LOCAL;
1983 		flags |= RTCF_LOCAL;
1984 		goto make_route;
1985 	}
1986 
1987 	if (fib_lookup(net, fl4, &res)) {
1988 		res.fi = NULL;
1989 		res.table = NULL;
1990 		if (fl4->flowi4_oif) {
1991 			/* Apparently, the routing tables are wrong. Assume
1992 			   that the destination is on-link.
1993 
1994 			   WHY? DW.
1995 			   Because we are allowed to send to an iface
1996 			   even if it has NO routes and NO assigned
1997 			   addresses. When an oif is specified, the routing
1998 			   tables are looked up with only one purpose:
1999 			   to catch whether the destination is gatewayed,
2000 			   rather than direct. Moreover, if MSG_DONTROUTE is
2001 			   set, we send the packet ignoring both the routing
2002 			   tables and the ifaddr state. --ANK
2003 
2004 
2005 			   We could do the same even if the oif is unknown,
2006 			   as IPv6 likely does, but we do not (example below).
2007 			 */
2008 
2009 			if (fl4->saddr == 0)
2010 				fl4->saddr = inet_select_addr(dev_out, 0,
2011 							      RT_SCOPE_LINK);
2012 			res.type = RTN_UNICAST;
2013 			goto make_route;
2014 		}
2015 		rth = ERR_PTR(-ENETUNREACH);
2016 		goto out;
2017 	}
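	/* E.g. `ping -I eth0 192.0.2.1` (device and address hypothetical)
	 * still transmits on eth0 even with an empty routing table,
	 * courtesy of the on-link assumption above.
	 */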
2018 
2019 	if (res.type == RTN_LOCAL) {
2020 		if (!fl4->saddr) {
2021 			if (res.fi->fib_prefsrc)
2022 				fl4->saddr = res.fi->fib_prefsrc;
2023 			else
2024 				fl4->saddr = fl4->daddr;
2025 		}
2026 		dev_out = net->loopback_dev;
2027 		fl4->flowi4_oif = dev_out->ifindex;
2028 		flags |= RTCF_LOCAL;
2029 		goto make_route;
2030 	}
2031 
2032 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2033 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2034 		fib_select_multipath(&res);
2035 	else
2036 #endif
2037 	if (!res.prefixlen &&
2038 	    res.table->tb_num_default > 1 &&
2039 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2040 		fib_select_default(&res);
2041 
2042 	if (!fl4->saddr)
2043 		fl4->saddr = FIB_RES_PREFSRC(net, res);
2044 
2045 	dev_out = FIB_RES_DEV(res);
2046 	fl4->flowi4_oif = dev_out->ifindex;
2047 
2048 
2049 make_route:
2050 	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2051 
2052 out:
2053 	rcu_read_unlock();
2054 	return rth;
2055 }
2056 EXPORT_SYMBOL_GPL(__ip_route_output_key);
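/* Usage sketch (illustrative only; variables hypothetical and error
 * handling trimmed): a typical in-kernel caller resolves an output
 * route via the ip_route_output_key() wrapper around this function:
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	memset(&fl4, 0, sizeof(fl4));
 *	fl4.daddr = daddr;		(__be32 destination)
 *	fl4.saddr = saddr;		(0 lets the resolver choose)
 *	fl4.flowi4_oif = 0;		(0 lets the FIB pick the device)
 *	fl4.flowi4_tos = RT_TOS(tos);
 *
 *	rt = ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...use rt->dst...
 *	ip_rt_put(rt);
 */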
2057 
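/* "Blackhole" dst_ops: a blackhole route looks like a normal dst but
 * silently discards everything sent through it, never revalidates and
 * never caches metrics.  ipv4_blackhole_route() below clones a real
 * route into such a dst; the xfrm layer, for one, uses this to hold
 * back traffic while IPsec state resolution is still in progress.
 */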
2058 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2059 {
2060 	return NULL;
2061 }
2062 
2063 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2064 {
2065 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2066 
2067 	return mtu ? : dst->dev->mtu;
2068 }
2069 
2070 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2071 					  struct sk_buff *skb, u32 mtu)
2072 {
2073 }
2074 
2075 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2076 				       struct sk_buff *skb)
2077 {
2078 }
2079 
2080 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2081 					  unsigned long old)
2082 {
2083 	return NULL;
2084 }
2085 
2086 static struct dst_ops ipv4_dst_blackhole_ops = {
2087 	.family			=	AF_INET,
2088 	.protocol		=	cpu_to_be16(ETH_P_IP),
2089 	.check			=	ipv4_blackhole_dst_check,
2090 	.mtu			=	ipv4_blackhole_mtu,
2091 	.default_advmss		=	ipv4_default_advmss,
2092 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2093 	.redirect		=	ipv4_rt_blackhole_redirect,
2094 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2095 	.neigh_lookup		=	ipv4_neigh_lookup,
2096 };
2097 
2098 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2099 {
2100 	struct rtable *ort = (struct rtable *) dst_orig;
2101 	struct rtable *rt;
2102 
2103 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2104 	if (rt) {
2105 		struct dst_entry *new = &rt->dst;
2106 
2107 		new->__use = 1;
2108 		new->input = dst_discard;
2109 		new->output = dst_discard;
2110 
2111 		new->dev = ort->dst.dev;
2112 		if (new->dev)
2113 			dev_hold(new->dev);
2114 
2115 		rt->rt_is_input = ort->rt_is_input;
2116 		rt->rt_iif = ort->rt_iif;
2117 		rt->rt_pmtu = ort->rt_pmtu;
2118 
2119 		rt->rt_genid = rt_genid(net);
2120 		rt->rt_flags = ort->rt_flags;
2121 		rt->rt_type = ort->rt_type;
2122 		rt->rt_gateway = ort->rt_gateway;
2123 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2124 
2125 		INIT_LIST_HEAD(&rt->rt_uncached);
2126 
2127 		dst_free(new);
2128 	}
2129 
2130 	dst_release(dst_orig);
2131 
2132 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2133 }
2134 
2135 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2136 				    struct sock *sk)
2137 {
2138 	struct rtable *rt = __ip_route_output_key(net, flp4);
2139 
2140 	if (IS_ERR(rt))
2141 		return rt;
2142 
2143 	if (flp4->flowi4_proto)
2144 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2145 						   flowi4_to_flowi(flp4),
2146 						   sk, 0);
2147 
2148 	return rt;
2149 }
2150 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2151 
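/* rt_fill_info() renders a resolved route as an RTM_NEWROUTE netlink
 * message; inet_rtm_getroute() below is the RTM_GETROUTE handler that
 * `ip route get` ultimately talks to.
 */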
2152 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2153 			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2154 			u32 seq, int event, int nowait, unsigned int flags)
2155 {
2156 	struct rtable *rt = skb_rtable(skb);
2157 	struct rtmsg *r;
2158 	struct nlmsghdr *nlh;
2159 	unsigned long expires = 0;
2160 	u32 error;
2161 	u32 metrics[RTAX_MAX];
2162 
2163 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2164 	if (nlh == NULL)
2165 		return -EMSGSIZE;
2166 
2167 	r = nlmsg_data(nlh);
2168 	r->rtm_family	 = AF_INET;
2169 	r->rtm_dst_len	= 32;
2170 	r->rtm_src_len	= 0;
2171 	r->rtm_tos	= fl4->flowi4_tos;
2172 	r->rtm_table	= RT_TABLE_MAIN;
2173 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2174 		goto nla_put_failure;
2175 	r->rtm_type	= rt->rt_type;
2176 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2177 	r->rtm_protocol = RTPROT_UNSPEC;
2178 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2179 	if (rt->rt_flags & RTCF_NOTIFY)
2180 		r->rtm_flags |= RTM_F_NOTIFY;
2181 
2182 	if (nla_put_be32(skb, RTA_DST, dst))
2183 		goto nla_put_failure;
2184 	if (src) {
2185 		r->rtm_src_len = 32;
2186 		if (nla_put_be32(skb, RTA_SRC, src))
2187 			goto nla_put_failure;
2188 	}
2189 	if (rt->dst.dev &&
2190 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2191 		goto nla_put_failure;
2192 #ifdef CONFIG_IP_ROUTE_CLASSID
2193 	if (rt->dst.tclassid &&
2194 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2195 		goto nla_put_failure;
2196 #endif
2197 	if (!rt_is_input_route(rt) &&
2198 	    fl4->saddr != src) {
2199 		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2200 			goto nla_put_failure;
2201 	}
2202 	if (rt->rt_uses_gateway &&
2203 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2204 		goto nla_put_failure;
2205 
2206 	expires = rt->dst.expires;
2207 	if (expires) {
2208 		unsigned long now = jiffies;
2209 
2210 		if (time_before(now, expires))
2211 			expires -= now;
2212 		else
2213 			expires = 0;
2214 	}
2215 
2216 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2217 	if (rt->rt_pmtu && expires)
2218 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2219 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2220 		goto nla_put_failure;
2221 
2222 	if (fl4->flowi4_mark &&
2223 	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2224 		goto nla_put_failure;
2225 
2226 	error = rt->dst.error;
2227 
2228 	if (rt_is_input_route(rt)) {
2229 		if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2230 			goto nla_put_failure;
2231 	}
2232 
2233 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2234 		goto nla_put_failure;
2235 
2236 	return nlmsg_end(skb, nlh);
2237 
2238 nla_put_failure:
2239 	nlmsg_cancel(skb, nlh);
2240 	return -EMSGSIZE;
2241 }
2242 
2243 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2244 {
2245 	struct net *net = sock_net(in_skb->sk);
2246 	struct rtmsg *rtm;
2247 	struct nlattr *tb[RTA_MAX+1];
2248 	struct rtable *rt = NULL;
2249 	struct flowi4 fl4;
2250 	__be32 dst = 0;
2251 	__be32 src = 0;
2252 	u32 iif;
2253 	int err;
2254 	int mark;
2255 	struct sk_buff *skb;
2256 
2257 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2258 	if (err < 0)
2259 		goto errout;
2260 
2261 	rtm = nlmsg_data(nlh);
2262 
2263 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2264 	if (skb == NULL) {
2265 		err = -ENOBUFS;
2266 		goto errout;
2267 	}
2268 
2269 	/* Reserve room for dummy headers; this skb can pass
2270 	   through a good chunk of the routing engine.
2271 	 */
2272 	skb_reset_mac_header(skb);
2273 	skb_reset_network_header(skb);
2274 
2275 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2276 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2277 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2278 
2279 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2280 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2281 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2282 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2283 
2284 	memset(&fl4, 0, sizeof(fl4));
2285 	fl4.daddr = dst;
2286 	fl4.saddr = src;
2287 	fl4.flowi4_tos = rtm->rtm_tos;
2288 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2289 	fl4.flowi4_mark = mark;
2290 
2291 	if (iif) {
2292 		struct net_device *dev;
2293 
2294 		dev = __dev_get_by_index(net, iif);
2295 		if (dev == NULL) {
2296 			err = -ENODEV;
2297 			goto errout_free;
2298 		}
2299 
2300 		skb->protocol	= htons(ETH_P_IP);
2301 		skb->dev	= dev;
2302 		skb->mark	= mark;
2303 		local_bh_disable();
2304 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2305 		local_bh_enable();
2306 
2307 		rt = skb_rtable(skb);
2308 		if (err == 0 && rt->dst.error)
2309 			err = -rt->dst.error;
2310 	} else {
2311 		rt = ip_route_output_key(net, &fl4);
2312 
2313 		err = 0;
2314 		if (IS_ERR(rt))
2315 			err = PTR_ERR(rt);
2316 	}
2317 
2318 	if (err)
2319 		goto errout_free;
2320 
2321 	skb_dst_set(skb, &rt->dst);
2322 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2323 		rt->rt_flags |= RTCF_NOTIFY;
2324 
2325 	err = rt_fill_info(net, dst, src, &fl4, skb,
2326 			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2327 			   RTM_NEWROUTE, 0, 0);
2328 	if (err <= 0)
2329 		goto errout_free;
2330 
2331 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2332 errout:
2333 	return err;
2334 
2335 errout_free:
2336 	kfree_skb(skb);
2337 	goto errout;
2338 }
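/* For reference, a userspace sketch of the request this handler
 * serves (roughly what `ip route get <addr>` sends; illustrative
 * only, attribute packing and error handling omitted):
 *
 *	struct {
 *		struct nlmsghdr	nlh;
 *		struct rtmsg	rtm;
 *		char		attrs[64];
 *	} req = {
 *		.nlh.nlmsg_len	 = NLMSG_LENGTH(sizeof(struct rtmsg)),
 *		.nlh.nlmsg_type	 = RTM_GETROUTE,
 *		.nlh.nlmsg_flags = NLM_F_REQUEST,
 *		.rtm.rtm_family	 = AF_INET,
 *	};
 *
 * An RTA_DST attribute carrying the destination is appended, the
 * message is sent on a NETLINK_ROUTE socket, and the RTM_NEWROUTE
 * reply built by rt_fill_info() above is parsed for RTA_OIF,
 * RTA_GATEWAY, RTA_PREFSRC and friends.
 */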
2339 
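/* The old routing cache is gone, so there are no cached routes to
 * dump; report success with an empty payload.
 */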
2340 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2341 {
2342 	return skb->len;
2343 }
2344 
2345 void ip_rt_multicast_event(struct in_device *in_dev)
2346 {
2347 	rt_cache_flush(dev_net(in_dev->dev));
2348 }
2349 
2350 #ifdef CONFIG_SYSCTL
2351 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2352 					void __user *buffer,
2353 					size_t *lenp, loff_t *ppos)
2354 {
2355 	if (write) {
2356 		rt_cache_flush((struct net *)__ctl->extra1);
2357 		return 0;
2358 	}
2359 
2360 	return -EINVAL;
2361 }
2362 
2363 static ctl_table ipv4_route_table[] = {
2364 	{
2365 		.procname	= "gc_thresh",
2366 		.data		= &ipv4_dst_ops.gc_thresh,
2367 		.maxlen		= sizeof(int),
2368 		.mode		= 0644,
2369 		.proc_handler	= proc_dointvec,
2370 	},
2371 	{
2372 		.procname	= "max_size",
2373 		.data		= &ip_rt_max_size,
2374 		.maxlen		= sizeof(int),
2375 		.mode		= 0644,
2376 		.proc_handler	= proc_dointvec,
2377 	},
2378 	{
2379 		/* Deprecated. Use gc_min_interval_ms */
2380 
2381 		.procname	= "gc_min_interval",
2382 		.data		= &ip_rt_gc_min_interval,
2383 		.maxlen		= sizeof(int),
2384 		.mode		= 0644,
2385 		.proc_handler	= proc_dointvec_jiffies,
2386 	},
2387 	{
2388 		.procname	= "gc_min_interval_ms",
2389 		.data		= &ip_rt_gc_min_interval,
2390 		.maxlen		= sizeof(int),
2391 		.mode		= 0644,
2392 		.proc_handler	= proc_dointvec_ms_jiffies,
2393 	},
2394 	{
2395 		.procname	= "gc_timeout",
2396 		.data		= &ip_rt_gc_timeout,
2397 		.maxlen		= sizeof(int),
2398 		.mode		= 0644,
2399 		.proc_handler	= proc_dointvec_jiffies,
2400 	},
2401 	{
2402 		.procname	= "gc_interval",
2403 		.data		= &ip_rt_gc_interval,
2404 		.maxlen		= sizeof(int),
2405 		.mode		= 0644,
2406 		.proc_handler	= proc_dointvec_jiffies,
2407 	},
2408 	{
2409 		.procname	= "redirect_load",
2410 		.data		= &ip_rt_redirect_load,
2411 		.maxlen		= sizeof(int),
2412 		.mode		= 0644,
2413 		.proc_handler	= proc_dointvec,
2414 	},
2415 	{
2416 		.procname	= "redirect_number",
2417 		.data		= &ip_rt_redirect_number,
2418 		.maxlen		= sizeof(int),
2419 		.mode		= 0644,
2420 		.proc_handler	= proc_dointvec,
2421 	},
2422 	{
2423 		.procname	= "redirect_silence",
2424 		.data		= &ip_rt_redirect_silence,
2425 		.maxlen		= sizeof(int),
2426 		.mode		= 0644,
2427 		.proc_handler	= proc_dointvec,
2428 	},
2429 	{
2430 		.procname	= "error_cost",
2431 		.data		= &ip_rt_error_cost,
2432 		.maxlen		= sizeof(int),
2433 		.mode		= 0644,
2434 		.proc_handler	= proc_dointvec,
2435 	},
2436 	{
2437 		.procname	= "error_burst",
2438 		.data		= &ip_rt_error_burst,
2439 		.maxlen		= sizeof(int),
2440 		.mode		= 0644,
2441 		.proc_handler	= proc_dointvec,
2442 	},
2443 	{
2444 		.procname	= "gc_elasticity",
2445 		.data		= &ip_rt_gc_elasticity,
2446 		.maxlen		= sizeof(int),
2447 		.mode		= 0644,
2448 		.proc_handler	= proc_dointvec,
2449 	},
2450 	{
2451 		.procname	= "mtu_expires",
2452 		.data		= &ip_rt_mtu_expires,
2453 		.maxlen		= sizeof(int),
2454 		.mode		= 0644,
2455 		.proc_handler	= proc_dointvec_jiffies,
2456 	},
2457 	{
2458 		.procname	= "min_pmtu",
2459 		.data		= &ip_rt_min_pmtu,
2460 		.maxlen		= sizeof(int),
2461 		.mode		= 0644,
2462 		.proc_handler	= proc_dointvec,
2463 	},
2464 	{
2465 		.procname	= "min_adv_mss",
2466 		.data		= &ip_rt_min_advmss,
2467 		.maxlen		= sizeof(int),
2468 		.mode		= 0644,
2469 		.proc_handler	= proc_dointvec,
2470 	},
2471 	{ }
2472 };
2473 
2474 static struct ctl_table ipv4_route_flush_table[] = {
2475 	{
2476 		.procname	= "flush",
2477 		.maxlen		= sizeof(int),
2478 		.mode		= 0200,
2479 		.proc_handler	= ipv4_sysctl_rtcache_flush,
2480 	},
2481 	{ },
2482 };
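/* The tables above appear under /proc/sys/net/ipv4/route/.  The only
 * per-netns entry is "flush"; writing to it bumps the namespace's
 * route generation counter, e.g.
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * invalidates every cached route in the writer's namespace at once.
 */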
2483 
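/* Each non-init netns gets its own copy of the flush table so that
 * extra1 can point back at that namespace; the initial netns uses
 * the static table directly.
 */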
2484 static __net_init int sysctl_route_net_init(struct net *net)
2485 {
2486 	struct ctl_table *tbl;
2487 
2488 	tbl = ipv4_route_flush_table;
2489 	if (!net_eq(net, &init_net)) {
2490 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2491 		if (tbl == NULL)
2492 			goto err_dup;
2493 	}
2494 	tbl[0].extra1 = net;
2495 
2496 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2497 	if (net->ipv4.route_hdr == NULL)
2498 		goto err_reg;
2499 	return 0;
2500 
2501 err_reg:
2502 	if (tbl != ipv4_route_flush_table)
2503 		kfree(tbl);
2504 err_dup:
2505 	return -ENOMEM;
2506 }
2507 
2508 static __net_exit void sysctl_route_net_exit(struct net *net)
2509 {
2510 	struct ctl_table *tbl;
2511 
2512 	tbl = net->ipv4.route_hdr->ctl_table_arg;
2513 	unregister_net_sysctl_table(net->ipv4.route_hdr);
2514 	BUG_ON(tbl == ipv4_route_flush_table);
2515 	kfree(tbl);
2516 }
2517 
2518 static __net_initdata struct pernet_operations sysctl_route_ops = {
2519 	.init = sysctl_route_net_init,
2520 	.exit = sysctl_route_net_exit,
2521 };
2522 #endif
2523 
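/* The per-netns generation counter is the whole cache-invalidation
 * scheme: rt_cache_flush() merely bumps rt_genid, and stale rtables
 * are then rejected by rt_is_expired()/rt_cache_valid().
 */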
2524 static __net_init int rt_genid_init(struct net *net)
2525 {
2526 	atomic_set(&net->rt_genid, 0);
2527 	get_random_bytes(&net->ipv4.dev_addr_genid,
2528 			 sizeof(net->ipv4.dev_addr_genid));
2529 	return 0;
2530 }
2531 
2532 static __net_initdata struct pernet_operations rt_genid_ops = {
2533 	.init = rt_genid_init,
2534 };
2535 
2536 static int __net_init ipv4_inetpeer_init(struct net *net)
2537 {
2538 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2539 
2540 	if (!bp)
2541 		return -ENOMEM;
2542 	inet_peer_base_init(bp);
2543 	net->ipv4.peers = bp;
2544 	return 0;
2545 }
2546 
2547 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2548 {
2549 	struct inet_peer_base *bp = net->ipv4.peers;
2550 
2551 	net->ipv4.peers = NULL;
2552 	inetpeer_invalidate_tree(bp);
2553 	kfree(bp);
2554 }
2555 
2556 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2557 	.init	=	ipv4_inetpeer_init,
2558 	.exit	=	ipv4_inetpeer_exit,
2559 };
2560 
2561 #ifdef CONFIG_IP_ROUTE_CLASSID
2562 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2563 #endif /* CONFIG_IP_ROUTE_CLASSID */
2564 
2565 int __init ip_rt_init(void)
2566 {
2567 	int rc = 0;
2568 
2569 #ifdef CONFIG_IP_ROUTE_CLASSID
2570 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2571 	if (!ip_rt_acct)
2572 		panic("IP: failed to allocate ip_rt_acct\n");
2573 #endif
2574 
2575 	ipv4_dst_ops.kmem_cachep =
2576 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2577 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2578 
2579 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2580 
2581 	if (dst_entries_init(&ipv4_dst_ops) < 0)
2582 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
2583 
2584 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2585 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2586 
2587 	ipv4_dst_ops.gc_thresh = ~0;
2588 	ip_rt_max_size = INT_MAX;
2589 
2590 	devinet_init();
2591 	ip_fib_init();
2592 
2593 	if (ip_rt_proc_init())
2594 		pr_err("Unable to create route proc files\n");
2595 #ifdef CONFIG_XFRM
2596 	xfrm_init();
2597 	xfrm4_init(ip_rt_max_size);
2598 #endif
2599 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2600 
2601 #ifdef CONFIG_SYSCTL
2602 	register_pernet_subsys(&sysctl_route_ops);
2603 #endif
2604 	register_pernet_subsys(&rt_genid_ops);
2605 	register_pernet_subsys(&ipv4_inetpeer_ops);
2606 	return rc;
2607 }
2608 
2609 #ifdef CONFIG_SYSCTL
2610 /*
2611  * We really need to sanitize the damn ipv4 init order; then all
2612  * this nonsense will go away.
2613  */
2614 void __init ip_static_sysctl_init(void)
2615 {
2616 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2617 }
2618 #endif
2619