1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD;
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
78 #include <linux/in.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/workqueue.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <linux/slab.h>
95 #include <linux/prefetch.h>
96 #include <net/dst.h>
97 #include <net/net_namespace.h>
98 #include <net/protocol.h>
99 #include <net/ip.h>
100 #include <net/route.h>
101 #include <net/inetpeer.h>
102 #include <net/sock.h>
103 #include <net/ip_fib.h>
104 #include <net/arp.h>
105 #include <net/tcp.h>
106 #include <net/icmp.h>
107 #include <net/xfrm.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
110 #ifdef CONFIG_SYSCTL
111 #include <linux/sysctl.h>
112 #include <linux/kmemleak.h>
113 #endif
114 #include <net/secure_seq.h>
115 
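/* RT_FL_TOS() keeps only the routing-relevant TOS bits of a flow, plus the
 * legacy RTO_ONLINK hint that callers may encode into flowi4_tos.
 */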
116 #define RT_FL_TOS(oldflp4) \
117 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118 
119 #define IP_MAX_MTU	0xFFF0
120 
121 #define RT_GC_TIMEOUT (300*HZ)
122 
123 static int ip_rt_max_size;
124 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
126 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
127 static int ip_rt_redirect_number __read_mostly	= 9;
128 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
129 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
130 static int ip_rt_error_cost __read_mostly	= HZ;
131 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
132 static int ip_rt_gc_elasticity __read_mostly	= 8;
133 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
134 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
135 static int ip_rt_min_advmss __read_mostly	= 256;
136 
137 /*
138  *	Interface to generic destination cache.
139  */
140 
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
143 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145 static void		 ipv4_link_failure(struct sk_buff *skb);
146 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
147 					   struct sk_buff *skb, u32 mtu);
148 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
149 					struct sk_buff *skb);
150 
151 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
152 			    int how)
153 {
154 }
155 
156 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
157 {
158 	WARN_ON(1);
159 	return NULL;
160 }
161 
162 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
163 					   struct sk_buff *skb,
164 					   const void *daddr);
165 
166 static struct dst_ops ipv4_dst_ops = {
167 	.family =		AF_INET,
168 	.protocol =		cpu_to_be16(ETH_P_IP),
169 	.check =		ipv4_dst_check,
170 	.default_advmss =	ipv4_default_advmss,
171 	.mtu =			ipv4_mtu,
172 	.cow_metrics =		ipv4_cow_metrics,
173 	.ifdown =		ipv4_dst_ifdown,
174 	.negative_advice =	ipv4_negative_advice,
175 	.link_failure =		ipv4_link_failure,
176 	.update_pmtu =		ip_rt_update_pmtu,
177 	.redirect =		ip_do_redirect,
178 	.local_out =		__ip_local_out,
179 	.neigh_lookup =		ipv4_neigh_lookup,
180 };
181 
182 #define ECN_OR_COST(class)	TC_PRIO_##class
183 
184 const __u8 ip_tos2prio[16] = {
185 	TC_PRIO_BESTEFFORT,
186 	ECN_OR_COST(BESTEFFORT),
187 	TC_PRIO_BESTEFFORT,
188 	ECN_OR_COST(BESTEFFORT),
189 	TC_PRIO_BULK,
190 	ECN_OR_COST(BULK),
191 	TC_PRIO_BULK,
192 	ECN_OR_COST(BULK),
193 	TC_PRIO_INTERACTIVE,
194 	ECN_OR_COST(INTERACTIVE),
195 	TC_PRIO_INTERACTIVE,
196 	ECN_OR_COST(INTERACTIVE),
197 	TC_PRIO_INTERACTIVE_BULK,
198 	ECN_OR_COST(INTERACTIVE_BULK),
199 	TC_PRIO_INTERACTIVE_BULK,
200 	ECN_OR_COST(INTERACTIVE_BULK)
201 };
202 EXPORT_SYMBOL(ip_tos2prio);
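
/* Lookup sketch (illustrative; mirrors the rt_tos2priority() helper in
 * <net/route.h>): the table is indexed by the RFC 1349 TOS bits shifted
 * right by one, so e.g. IPTOS_LOWDELAY (0x10) maps to index 8,
 * TC_PRIO_INTERACTIVE.
 */
static inline char example_tos2priority(u8 tos)
{
	return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}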
203 
204 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
205 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
206 
207 static inline int rt_genid(struct net *net)
208 {
209 	return atomic_read(&net->ipv4.rt_genid);
210 }
211 
212 #ifdef CONFIG_PROC_FS
213 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
214 {
215 	if (*pos)
216 		return NULL;
217 	return SEQ_START_TOKEN;
218 }
219 
220 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
221 {
222 	++*pos;
223 	return NULL;
224 }
225 
226 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
227 {
228 }
229 
230 static int rt_cache_seq_show(struct seq_file *seq, void *v)
231 {
232 	if (v == SEQ_START_TOKEN)
233 		seq_printf(seq, "%-127s\n",
234 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
235 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
236 			   "HHUptod\tSpecDst");
237 	return 0;
238 }
239 
240 static const struct seq_operations rt_cache_seq_ops = {
241 	.start  = rt_cache_seq_start,
242 	.next   = rt_cache_seq_next,
243 	.stop   = rt_cache_seq_stop,
244 	.show   = rt_cache_seq_show,
245 };
246 
247 static int rt_cache_seq_open(struct inode *inode, struct file *file)
248 {
249 	return seq_open(file, &rt_cache_seq_ops);
250 }
251 
252 static const struct file_operations rt_cache_seq_fops = {
253 	.owner	 = THIS_MODULE,
254 	.open	 = rt_cache_seq_open,
255 	.read	 = seq_read,
256 	.llseek	 = seq_lseek,
257 	.release = seq_release,
258 };
259 
260 
261 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
262 {
263 	int cpu;
264 
265 	if (*pos == 0)
266 		return SEQ_START_TOKEN;
267 
268 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
269 		if (!cpu_possible(cpu))
270 			continue;
271 		*pos = cpu+1;
272 		return &per_cpu(rt_cache_stat, cpu);
273 	}
274 	return NULL;
275 }
276 
277 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
278 {
279 	int cpu;
280 
281 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
282 		if (!cpu_possible(cpu))
283 			continue;
284 		*pos = cpu+1;
285 		return &per_cpu(rt_cache_stat, cpu);
286 	}
287 	return NULL;
288 
289 }
290 
291 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
292 {
293 
294 }
295 
296 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
297 {
298 	struct rt_cache_stat *st = v;
299 
300 	if (v == SEQ_START_TOKEN) {
301 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
302 		return 0;
303 	}
304 
305 	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
306 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
307 		   dst_entries_get_slow(&ipv4_dst_ops),
308 		   st->in_hit,
309 		   st->in_slow_tot,
310 		   st->in_slow_mc,
311 		   st->in_no_route,
312 		   st->in_brd,
313 		   st->in_martian_dst,
314 		   st->in_martian_src,
315 
316 		   st->out_hit,
317 		   st->out_slow_tot,
318 		   st->out_slow_mc,
319 
320 		   st->gc_total,
321 		   st->gc_ignored,
322 		   st->gc_goal_miss,
323 		   st->gc_dst_overflow,
324 		   st->in_hlist_search,
325 		   st->out_hlist_search
326 		);
327 	return 0;
328 }
329 
330 static const struct seq_operations rt_cpu_seq_ops = {
331 	.start  = rt_cpu_seq_start,
332 	.next   = rt_cpu_seq_next,
333 	.stop   = rt_cpu_seq_stop,
334 	.show   = rt_cpu_seq_show,
335 };
336 
337 
338 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
339 {
340 	return seq_open(file, &rt_cpu_seq_ops);
341 }
342 
343 static const struct file_operations rt_cpu_seq_fops = {
344 	.owner	 = THIS_MODULE,
345 	.open	 = rt_cpu_seq_open,
346 	.read	 = seq_read,
347 	.llseek	 = seq_lseek,
348 	.release = seq_release,
349 };
350 
351 #ifdef CONFIG_IP_ROUTE_CLASSID
352 static int rt_acct_proc_show(struct seq_file *m, void *v)
353 {
354 	struct ip_rt_acct *dst, *src;
355 	unsigned int i, j;
356 
357 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
358 	if (!dst)
359 		return -ENOMEM;
360 
361 	for_each_possible_cpu(i) {
362 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
363 		for (j = 0; j < 256; j++) {
364 			dst[j].o_bytes   += src[j].o_bytes;
365 			dst[j].o_packets += src[j].o_packets;
366 			dst[j].i_bytes   += src[j].i_bytes;
367 			dst[j].i_packets += src[j].i_packets;
368 		}
369 	}
370 
371 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
372 	kfree(dst);
373 	return 0;
374 }
375 
376 static int rt_acct_proc_open(struct inode *inode, struct file *file)
377 {
378 	return single_open(file, rt_acct_proc_show, NULL);
379 }
380 
381 static const struct file_operations rt_acct_proc_fops = {
382 	.owner		= THIS_MODULE,
383 	.open		= rt_acct_proc_open,
384 	.read		= seq_read,
385 	.llseek		= seq_lseek,
386 	.release	= single_release,
387 };
388 #endif
389 
390 static int __net_init ip_rt_do_proc_init(struct net *net)
391 {
392 	struct proc_dir_entry *pde;
393 
394 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
395 			&rt_cache_seq_fops);
396 	if (!pde)
397 		goto err1;
398 
399 	pde = proc_create("rt_cache", S_IRUGO,
400 			  net->proc_net_stat, &rt_cpu_seq_fops);
401 	if (!pde)
402 		goto err2;
403 
404 #ifdef CONFIG_IP_ROUTE_CLASSID
405 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
406 	if (!pde)
407 		goto err3;
408 #endif
409 	return 0;
410 
411 #ifdef CONFIG_IP_ROUTE_CLASSID
412 err3:
413 	remove_proc_entry("rt_cache", net->proc_net_stat);
414 #endif
415 err2:
416 	remove_proc_entry("rt_cache", net->proc_net);
417 err1:
418 	return -ENOMEM;
419 }
420 
421 static void __net_exit ip_rt_do_proc_exit(struct net *net)
422 {
423 	remove_proc_entry("rt_cache", net->proc_net_stat);
424 	remove_proc_entry("rt_cache", net->proc_net);
425 #ifdef CONFIG_IP_ROUTE_CLASSID
426 	remove_proc_entry("rt_acct", net->proc_net);
427 #endif
428 }
429 
430 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
431 	.init = ip_rt_do_proc_init,
432 	.exit = ip_rt_do_proc_exit,
433 };
434 
435 static int __init ip_rt_proc_init(void)
436 {
437 	return register_pernet_subsys(&ip_rt_proc_ops);
438 }
439 
440 #else
441 static inline int ip_rt_proc_init(void)
442 {
443 	return 0;
444 }
445 #endif /* CONFIG_PROC_FS */
446 
447 static inline int rt_is_expired(struct rtable *rth)
448 {
449 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
450 }
451 
452 /*
453  * Perturbation of rt_genid by a small quantity [1..256]
454  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
455  * many times (2^24) without reusing a recent rt_genid value.
456  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
457  */
458 static void rt_cache_invalidate(struct net *net)
459 {
460 	unsigned char shuffle;
461 
462 	get_random_bytes(&shuffle, sizeof(shuffle));
463 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
464 }
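
/* Hypothetical illustration (not a kernel helper): once rt_cache_invalidate()
 * bumps the generation count, any rtable that recorded the old rt_genid fails
 * rt_is_expired() and is treated as stale by the lookup paths.
 */
static inline bool example_flush_makes_stale(struct net *net,
					     struct rtable *rth)
{
	rt_cache_invalidate(net);
	return rt_is_expired(rth);	/* true, assuming rth->dst.dev is in net */
}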
465 
466 /*
467  * delay < 0  : invalidate cache (fast : entries will be deleted later)
468  * delay >= 0 : invalidate & flush cache (can be long)
469  */
470 void rt_cache_flush(struct net *net, int delay)
471 {
472 	rt_cache_invalidate(net);
473 }
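
/* Usage note (assumed callers): the FIB layer flushes whenever routes or
 * addresses change, e.g. rt_cache_flush(dev_net(dev), 0).  With the rt_genid
 * scheme above, both delay variants reduce to the same generation bump.
 */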
474 
475 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
476 					   struct sk_buff *skb,
477 					   const void *daddr)
478 {
479 	struct net_device *dev = dst->dev;
480 	const __be32 *pkey = daddr;
481 	const struct rtable *rt;
482 	struct neighbour *n;
483 
484 	rt = (const struct rtable *) dst;
485 	if (rt->rt_gateway)
486 		pkey = (const __be32 *) &rt->rt_gateway;
487 	else if (skb)
488 		pkey = &ip_hdr(skb)->daddr;
489 
490 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
491 	if (n)
492 		return n;
493 	return neigh_create(&arp_tbl, pkey, dev);
494 }
495 
496 /*
497  * Peer allocation may fail only in serious out-of-memory conditions.  However
498  * we can still generate some output.
499  * Random ID selection looks a bit dangerous because we have no chance of
500  * selecting an ID that is unique within a reasonable period of time.
501  * But a broken packet identifier may be better than no packet at all.
502  */
503 static void ip_select_fb_ident(struct iphdr *iph)
504 {
505 	static DEFINE_SPINLOCK(ip_fb_id_lock);
506 	static u32 ip_fallback_id;
507 	u32 salt;
508 
509 	spin_lock_bh(&ip_fb_id_lock);
510 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
511 	iph->id = htons(salt & 0xFFFF);
512 	ip_fallback_id = salt;
513 	spin_unlock_bh(&ip_fb_id_lock);
514 }
515 
516 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
517 {
518 	struct net *net = dev_net(dst->dev);
519 	struct inet_peer *peer;
520 
521 	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
522 	if (peer) {
523 		iph->id = htons(inet_getid(peer, more));
524 		inet_putpeer(peer);
525 		return;
526 	}
527 
528 	ip_select_fb_ident(iph);
529 }
530 EXPORT_SYMBOL(__ip_select_ident);
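
/* Caller sketch (assumed context, simplified): the IP output path fills in
 * iph->id via this helper before transmission; the last argument is the
 * number of extra IDs to reserve (0 for a single packet, gso_segs - 1 when
 * segmentation offload will emit several).
 */
static inline void example_fill_ip_id(struct iphdr *iph, struct dst_entry *dst)
{
	__ip_select_ident(iph, dst, 0);	/* reserve exactly one identifier */
}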
531 
532 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
533 			     const struct iphdr *iph,
534 			     int oif, u8 tos,
535 			     u8 prot, u32 mark, int flow_flags)
536 {
537 	if (sk) {
538 		const struct inet_sock *inet = inet_sk(sk);
539 
540 		oif = sk->sk_bound_dev_if;
541 		mark = sk->sk_mark;
542 		tos = RT_CONN_FLAGS(sk);
543 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
544 	}
545 	flowi4_init_output(fl4, oif, mark, tos,
546 			   RT_SCOPE_UNIVERSE, prot,
547 			   flow_flags,
548 			   iph->daddr, iph->saddr, 0, 0);
549 }
550 
551 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
552 			       const struct sock *sk)
553 {
554 	const struct iphdr *iph = ip_hdr(skb);
555 	int oif = skb->dev->ifindex;
556 	u8 tos = RT_TOS(iph->tos);
557 	u8 prot = iph->protocol;
558 	u32 mark = skb->mark;
559 
560 	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
561 }
562 
563 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
564 {
565 	const struct inet_sock *inet = inet_sk(sk);
566 	const struct ip_options_rcu *inet_opt;
567 	__be32 daddr = inet->inet_daddr;
568 
569 	rcu_read_lock();
570 	inet_opt = rcu_dereference(inet->inet_opt);
571 	if (inet_opt && inet_opt->opt.srr)
572 		daddr = inet_opt->opt.faddr;
573 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
574 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
575 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
576 			   inet_sk_flowi_flags(sk),
577 			   daddr, inet->inet_saddr, 0, 0);
578 	rcu_read_unlock();
579 }
580 
581 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
582 				 const struct sk_buff *skb)
583 {
584 	if (skb)
585 		build_skb_flow_key(fl4, skb, sk);
586 	else
587 		build_sk_flow_key(fl4, sk);
588 }
589 
590 static DEFINE_SEQLOCK(fnhe_seqlock);
591 
592 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
593 {
594 	struct fib_nh_exception *fnhe, *oldest;
595 
596 	oldest = rcu_dereference(hash->chain);
597 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
598 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
599 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
600 			oldest = fnhe;
601 	}
602 	return oldest;
603 }
604 
605 static inline u32 fnhe_hashfun(__be32 daddr)
606 {
607 	u32 hval;
608 
609 	hval = (__force u32) daddr;
610 	hval ^= (hval >> 11) ^ (hval >> 22);
611 
612 	return hval & (FNHE_HASH_SIZE - 1);
613 }
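
/* Illustrative helper (hypothetical): the xor-fold above spreads destinations
 * across FNHE_HASH_SIZE power-of-two buckets with no secret input;
 * find_exception() later in this file performs the same indexing.
 */
static inline struct fib_nh_exception *
example_fnhe_bucket_head(struct fnhe_hash_bucket *hash, __be32 daddr)
{
	return rcu_dereference(hash[fnhe_hashfun(daddr)].chain);
}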
614 
615 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
616 				  u32 pmtu, unsigned long expires)
617 {
618 	struct fnhe_hash_bucket *hash;
619 	struct fib_nh_exception *fnhe;
620 	int depth;
621 	u32 hval = fnhe_hashfun(daddr);
622 
623 	write_seqlock_bh(&fnhe_seqlock);
624 
625 	hash = nh->nh_exceptions;
626 	if (!hash) {
627 		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
628 		if (!hash)
629 			goto out_unlock;
630 		nh->nh_exceptions = hash;
631 	}
632 
633 	hash += hval;
634 
635 	depth = 0;
636 	for (fnhe = rcu_dereference(hash->chain); fnhe;
637 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
638 		if (fnhe->fnhe_daddr == daddr)
639 			break;
640 		depth++;
641 	}
642 
643 	if (fnhe) {
644 		if (gw)
645 			fnhe->fnhe_gw = gw;
646 		if (pmtu) {
647 			fnhe->fnhe_pmtu = pmtu;
648 			fnhe->fnhe_expires = expires;
649 		}
650 	} else {
651 		if (depth > FNHE_RECLAIM_DEPTH)
652 			fnhe = fnhe_oldest(hash);
653 		else {
654 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
655 			if (!fnhe)
656 				goto out_unlock;
657 
658 			fnhe->fnhe_next = hash->chain;
659 			rcu_assign_pointer(hash->chain, fnhe);
660 		}
661 		fnhe->fnhe_daddr = daddr;
662 		fnhe->fnhe_gw = gw;
663 		fnhe->fnhe_pmtu = pmtu;
664 		fnhe->fnhe_expires = expires;
665 	}
666 
667 	fnhe->fnhe_stamp = jiffies;
668 
669 out_unlock:
670 	write_sequnlock_bh(&fnhe_seqlock);
671 	return;
672 }
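
/* The writer above holds fnhe_seqlock; the matching reader is
 * rt_bind_exception(), which snapshots (daddr, gw, pmtu, expires) and retries
 * whenever a write races in between read_seqbegin()/read_seqretry().
 */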
673 
674 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
675 			     bool kill_route)
676 {
677 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
678 	__be32 old_gw = ip_hdr(skb)->saddr;
679 	struct net_device *dev = skb->dev;
680 	struct in_device *in_dev;
681 	struct fib_result res;
682 	struct neighbour *n;
683 	struct net *net;
684 
685 	switch (icmp_hdr(skb)->code & 7) {
686 	case ICMP_REDIR_NET:
687 	case ICMP_REDIR_NETTOS:
688 	case ICMP_REDIR_HOST:
689 	case ICMP_REDIR_HOSTTOS:
690 		break;
691 
692 	default:
693 		return;
694 	}
695 
696 	if (rt->rt_gateway != old_gw)
697 		return;
698 
699 	in_dev = __in_dev_get_rcu(dev);
700 	if (!in_dev)
701 		return;
702 
703 	net = dev_net(dev);
704 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
705 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
706 	    ipv4_is_zeronet(new_gw))
707 		goto reject_redirect;
708 
709 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
710 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
711 			goto reject_redirect;
712 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
713 			goto reject_redirect;
714 	} else {
715 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
716 			goto reject_redirect;
717 	}
718 
719 	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
720 	if (n) {
721 		if (!(n->nud_state & NUD_VALID)) {
722 			neigh_event_send(n, NULL);
723 		} else {
724 			if (fib_lookup(net, fl4, &res) == 0) {
725 				struct fib_nh *nh = &FIB_RES_NH(res);
726 
727 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
728 						      0, 0);
729 			}
730 			if (kill_route)
731 				rt->dst.obsolete = DST_OBSOLETE_KILL;
732 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
733 		}
734 		neigh_release(n);
735 	}
736 	return;
737 
738 reject_redirect:
739 #ifdef CONFIG_IP_ROUTE_VERBOSE
740 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
741 		const struct iphdr *iph = (const struct iphdr *) skb->data;
742 		__be32 daddr = iph->daddr;
743 		__be32 saddr = iph->saddr;
744 
745 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
746 				     "  Advised path = %pI4 -> %pI4\n",
747 				     &old_gw, dev->name, &new_gw,
748 				     &saddr, &daddr);
749 	}
750 #endif
751 	;
752 }
753 
754 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
755 {
756 	struct rtable *rt;
757 	struct flowi4 fl4;
758 
759 	rt = (struct rtable *) dst;
760 
761 	ip_rt_build_flow_key(&fl4, sk, skb);
762 	__ip_do_redirect(rt, skb, &fl4, true);
763 }
764 
765 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
766 {
767 	struct rtable *rt = (struct rtable *)dst;
768 	struct dst_entry *ret = dst;
769 
770 	if (rt) {
771 		if (dst->obsolete > 0) {
772 			ip_rt_put(rt);
773 			ret = NULL;
774 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
775 			   rt->dst.expires) {
776 			ip_rt_put(rt);
777 			ret = NULL;
778 		}
779 	}
780 	return ret;
781 }
782 
783 /*
784  * Algorithm:
785  *	1. The first ip_rt_redirect_number redirects are sent
786  *	   with exponential backoff, then we stop sending them at all,
787  *	   assuming that the host ignores our redirects.
788  *	2. If we did not see packets requiring redirects
789  *	   during ip_rt_redirect_silence, we assume that the host
790  *	   has forgotten the redirected route, and we start sending redirects again.
791  *
792  * This algorithm is much cheaper and more intelligent than dumb load limiting
793  * in icmp.c.
794  *
795  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
796  * and "frag. need" (breaks PMTU discovery) in icmp.c.
797  */
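/*
 * Worked example with the defaults above: ip_rt_redirect_load = HZ/50 (20 ms)
 * and ip_rt_redirect_number = 9, so redirect k (0-based) becomes eligible
 * once 20 ms << k has elapsed since the previous one: 20 ms, 40 ms, 80 ms,
 * ... ~5.1 s before the 9th.  After nine unanswered redirects we go silent
 * until ip_rt_redirect_silence = (HZ/50) << 10 (~20.5 s) passes without
 * redirect-worthy traffic, which resets rate_tokens to zero.
 */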
798 
799 void ip_rt_send_redirect(struct sk_buff *skb)
800 {
801 	struct rtable *rt = skb_rtable(skb);
802 	struct in_device *in_dev;
803 	struct inet_peer *peer;
804 	struct net *net;
805 	int log_martians;
806 
807 	rcu_read_lock();
808 	in_dev = __in_dev_get_rcu(rt->dst.dev);
809 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
810 		rcu_read_unlock();
811 		return;
812 	}
813 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
814 	rcu_read_unlock();
815 
816 	net = dev_net(rt->dst.dev);
817 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
818 	if (!peer) {
819 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
820 		return;
821 	}
822 
823 	/* No redirected packets during ip_rt_redirect_silence;
824 	 * reset the algorithm.
825 	 */
826 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
827 		peer->rate_tokens = 0;
828 
829 	/* Too many ignored redirects; do not send anything and
830 	 * set peer->rate_last to the last seen redirected packet.
831 	 */
832 	if (peer->rate_tokens >= ip_rt_redirect_number) {
833 		peer->rate_last = jiffies;
834 		goto out_put_peer;
835 	}
836 
837 	/* Check for load limit; set rate_last to the latest sent
838 	 * redirect.
839 	 */
840 	if (peer->rate_tokens == 0 ||
841 	    time_after(jiffies,
842 		       (peer->rate_last +
843 			(ip_rt_redirect_load << peer->rate_tokens)))) {
844 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
845 		peer->rate_last = jiffies;
846 		++peer->rate_tokens;
847 #ifdef CONFIG_IP_ROUTE_VERBOSE
848 		if (log_martians &&
849 		    peer->rate_tokens == ip_rt_redirect_number)
850 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
851 					     &ip_hdr(skb)->saddr, inet_iif(skb),
852 					     &ip_hdr(skb)->daddr, &rt->rt_gateway);
853 #endif
854 	}
855 out_put_peer:
856 	inet_putpeer(peer);
857 }
858 
859 static int ip_error(struct sk_buff *skb)
860 {
861 	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
862 	struct rtable *rt = skb_rtable(skb);
863 	struct inet_peer *peer;
864 	unsigned long now;
865 	struct net *net;
866 	bool send;
867 	int code;
868 
869 	net = dev_net(rt->dst.dev);
870 	if (!IN_DEV_FORWARD(in_dev)) {
871 		switch (rt->dst.error) {
872 		case EHOSTUNREACH:
873 			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
874 			break;
875 
876 		case ENETUNREACH:
877 			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
878 			break;
879 		}
880 		goto out;
881 	}
882 
883 	switch (rt->dst.error) {
884 	case EINVAL:
885 	default:
886 		goto out;
887 	case EHOSTUNREACH:
888 		code = ICMP_HOST_UNREACH;
889 		break;
890 	case ENETUNREACH:
891 		code = ICMP_NET_UNREACH;
892 		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
893 		break;
894 	case EACCES:
895 		code = ICMP_PKT_FILTERED;
896 		break;
897 	}
898 
899 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
900 
901 	send = true;
902 	if (peer) {
903 		now = jiffies;
904 		peer->rate_tokens += now - peer->rate_last;
905 		if (peer->rate_tokens > ip_rt_error_burst)
906 			peer->rate_tokens = ip_rt_error_burst;
907 		peer->rate_last = now;
908 		if (peer->rate_tokens >= ip_rt_error_cost)
909 			peer->rate_tokens -= ip_rt_error_cost;
910 		else
911 			send = false;
912 		inet_putpeer(peer);
913 	}
914 	if (send)
915 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
916 
917 out:	kfree_skb(skb);
918 	return 0;
919 }
920 
921 static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
922 {
923 	struct fib_result res;
924 
925 	if (mtu < ip_rt_min_pmtu)
926 		mtu = ip_rt_min_pmtu;
927 
928 	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
929 		struct fib_nh *nh = &FIB_RES_NH(res);
930 
931 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
932 				      jiffies + ip_rt_mtu_expires);
933 	}
934 	return mtu;
935 }
936 
937 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
938 			      struct sk_buff *skb, u32 mtu)
939 {
940 	struct rtable *rt = (struct rtable *) dst;
941 	struct flowi4 fl4;
942 
943 	ip_rt_build_flow_key(&fl4, sk, skb);
944 	mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
945 
946 	if (!rt->rt_pmtu) {
947 		dst->obsolete = DST_OBSOLETE_KILL;
948 	} else {
949 		rt->rt_pmtu = mtu;
950 		dst_set_expires(&rt->dst, ip_rt_mtu_expires);
951 	}
952 }
953 
954 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
955 		      int oif, u32 mark, u8 protocol, int flow_flags)
956 {
957 	const struct iphdr *iph = (const struct iphdr *) skb->data;
958 	struct flowi4 fl4;
959 	struct rtable *rt;
960 
961 	__build_flow_key(&fl4, NULL, iph, oif,
962 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
963 	rt = __ip_route_output_key(net, &fl4);
964 	if (!IS_ERR(rt)) {
965 		__ip_rt_update_pmtu(rt, &fl4, mtu);
966 		ip_rt_put(rt);
967 	}
968 }
969 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
970 
971 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
972 {
973 	const struct iphdr *iph = (const struct iphdr *) skb->data;
974 	struct flowi4 fl4;
975 	struct rtable *rt;
976 
977 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
978 	rt = __ip_route_output_key(sock_net(sk), &fl4);
979 	if (!IS_ERR(rt)) {
980 		__ip_rt_update_pmtu(rt, &fl4, mtu);
981 		ip_rt_put(rt);
982 	}
983 }
984 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
985 
986 void ipv4_redirect(struct sk_buff *skb, struct net *net,
987 		   int oif, u32 mark, u8 protocol, int flow_flags)
988 {
989 	const struct iphdr *iph = (const struct iphdr *) skb->data;
990 	struct flowi4 fl4;
991 	struct rtable *rt;
992 
993 	__build_flow_key(&fl4, NULL, iph, oif,
994 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
995 	rt = __ip_route_output_key(net, &fl4);
996 	if (!IS_ERR(rt)) {
997 		__ip_do_redirect(rt, skb, &fl4, false);
998 		ip_rt_put(rt);
999 	}
1000 }
1001 EXPORT_SYMBOL_GPL(ipv4_redirect);
1002 
1003 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1004 {
1005 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1006 	struct flowi4 fl4;
1007 	struct rtable *rt;
1008 
1009 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1010 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1011 	if (!IS_ERR(rt)) {
1012 		__ip_do_redirect(rt, skb, &fl4, false);
1013 		ip_rt_put(rt);
1014 	}
1015 }
1016 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1017 
1018 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1019 {
1020 	struct rtable *rt = (struct rtable *) dst;
1021 
1022 	/* All IPV4 dsts are created with ->obsolete set to the value
1023 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1024 	 * into this function always.
1025 	 *
1026 	 * When a PMTU/redirect information update invalidates a
1027 	 * route, this is indicated by setting obsolete to
1028 	 * DST_OBSOLETE_KILL.
1029 	 */
1030 	if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1031 		return NULL;
1032 	return dst;
1033 }
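
/* Caller-side sketch, mirroring the generic dst_check() in <net/dst.h>: a
 * nonzero ->obsolete (always the case for IPv4, per the comment above) sends
 * revalidation through ->check(), so DST_OBSOLETE_KILL entries come back as
 * NULL and are replaced by a fresh lookup.
 */
static inline struct dst_entry *example_dst_check(struct dst_entry *dst,
						  u32 cookie)
{
	if (dst->obsolete)
		dst = dst->ops->check(dst, cookie);
	return dst;
}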
1034 
1035 static void ipv4_link_failure(struct sk_buff *skb)
1036 {
1037 	struct rtable *rt;
1038 
1039 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1040 
1041 	rt = skb_rtable(skb);
1042 	if (rt)
1043 		dst_set_expires(&rt->dst, 0);
1044 }
1045 
1046 static int ip_rt_bug(struct sk_buff *skb)
1047 {
1048 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1049 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1050 		 skb->dev ? skb->dev->name : "?");
1051 	kfree_skb(skb);
1052 	WARN_ON(1);
1053 	return 0;
1054 }
1055 
1056 /*
1057    We do not cache the source address of the outgoing interface,
1058    because it is used only by the IP RR, TS and SRR options,
1059    so it is out of the fast path.
1060 
1061    BTW remember: "addr" is allowed to be unaligned
1062    in IP options!
1063  */
1064 
1065 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1066 {
1067 	__be32 src;
1068 
1069 	if (rt_is_output_route(rt))
1070 		src = ip_hdr(skb)->saddr;
1071 	else {
1072 		struct fib_result res;
1073 		struct flowi4 fl4;
1074 		struct iphdr *iph;
1075 
1076 		iph = ip_hdr(skb);
1077 
1078 		memset(&fl4, 0, sizeof(fl4));
1079 		fl4.daddr = iph->daddr;
1080 		fl4.saddr = iph->saddr;
1081 		fl4.flowi4_tos = RT_TOS(iph->tos);
1082 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1083 		fl4.flowi4_iif = skb->dev->ifindex;
1084 		fl4.flowi4_mark = skb->mark;
1085 
1086 		rcu_read_lock();
1087 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1088 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1089 		else
1090 			src = inet_select_addr(rt->dst.dev,
1091 					       rt_nexthop(rt, iph->daddr),
1092 					       RT_SCOPE_UNIVERSE);
1093 		rcu_read_unlock();
1094 	}
1095 	memcpy(addr, &src, 4);
1096 }
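
/* Usage note: the IP options code is the only consumer; e.g.
 * ip_options_build() copies the address selected here into the next
 * RR/SRR slot of the header it is constructing.
 */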
1097 
1098 #ifdef CONFIG_IP_ROUTE_CLASSID
1099 static void set_class_tag(struct rtable *rt, u32 tag)
1100 {
1101 	if (!(rt->dst.tclassid & 0xFFFF))
1102 		rt->dst.tclassid |= tag & 0xFFFF;
1103 	if (!(rt->dst.tclassid & 0xFFFF0000))
1104 		rt->dst.tclassid |= tag & 0xFFFF0000;
1105 }
1106 #endif
1107 
1108 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1109 {
1110 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1111 
1112 	if (advmss == 0) {
1113 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1114 			       ip_rt_min_advmss);
1115 		if (advmss > 65535 - 40)
1116 			advmss = 65535 - 40;
1117 	}
1118 	return advmss;
1119 }
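
/* Worked example: a standard 1500-byte Ethernet MTU yields advmss = 1460
 * (MTU minus 40 bytes of IPv4 + TCP headers), clamped between
 * ip_rt_min_advmss (256 by default) and 65535 - 40.
 */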
1120 
1121 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1122 {
1123 	const struct rtable *rt = (const struct rtable *) dst;
1124 	unsigned int mtu = rt->rt_pmtu;
1125 
1126 	if (mtu && time_after_eq(jiffies, rt->dst.expires))
1127 		mtu = 0;
1128 
1129 	if (!mtu)
1130 		mtu = dst_metric_raw(dst, RTAX_MTU);
1131 
1132 	if (mtu && rt_is_output_route(rt))
1133 		return mtu;
1134 
1135 	mtu = dst->dev->mtu;
1136 
1137 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1138 		if (rt->rt_gateway && mtu > 576)
1139 			mtu = 576;
1140 	}
1141 
1142 	if (mtu > IP_MAX_MTU)
1143 		mtu = IP_MAX_MTU;
1144 
1145 	return mtu;
1146 }
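
/* Precedence illustrated: an unexpired learned PMTU wins; failing that, an
 * RTAX_MTU metric is honoured for output routes; otherwise the device MTU is
 * used, capped at the classic 576 bytes for locked-metric routes via a
 * gateway and at IP_MAX_MTU (0xFFF0) overall.
 */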
1147 
1148 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1149 {
1150 	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1151 	struct fib_nh_exception *fnhe;
1152 	u32 hval;
1153 
1154 	if (!hash)
1155 		return NULL;
1156 
1157 	hval = fnhe_hashfun(daddr);
1158 
1159 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1160 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1161 		if (fnhe->fnhe_daddr == daddr)
1162 			return fnhe;
1163 	}
1164 	return NULL;
1165 }
1166 
1167 static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1168 			      __be32 daddr)
1169 {
1170 	__be32 fnhe_daddr, gw;
1171 	unsigned long expires;
1172 	unsigned int seq;
1173 	u32 pmtu;
1174 
1175 restart:
1176 	seq = read_seqbegin(&fnhe_seqlock);
1177 	fnhe_daddr = fnhe->fnhe_daddr;
1178 	gw = fnhe->fnhe_gw;
1179 	pmtu = fnhe->fnhe_pmtu;
1180 	expires = fnhe->fnhe_expires;
1181 	if (read_seqretry(&fnhe_seqlock, seq))
1182 		goto restart;
1183 
1184 	if (daddr != fnhe_daddr)
1185 		return;
1186 
1187 	if (pmtu) {
1188 		unsigned long diff = expires - jiffies;
1189 
1190 		if (time_before(jiffies, expires)) {
1191 			rt->rt_pmtu = pmtu;
1192 			dst_set_expires(&rt->dst, diff);
1193 		}
1194 	}
1195 	if (gw) {
1196 		rt->rt_flags |= RTCF_REDIRECTED;
1197 		rt->rt_gateway = gw;
1198 	}
1199 	fnhe->fnhe_stamp = jiffies;
1200 }
1201 
1202 static inline void rt_release_rcu(struct rcu_head *head)
1203 {
1204 	struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
1205 	dst_release(dst);
1206 }
1207 
1208 static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1209 {
1210 	struct rtable *orig, *prev, **p = &nh->nh_rth_output;
1211 
1212 	if (rt_is_input_route(rt))
1213 		p = &nh->nh_rth_input;
1214 
1215 	orig = *p;
1216 
1217 	prev = cmpxchg(p, orig, rt);
1218 	if (prev == orig) {
1219 		dst_clone(&rt->dst);
1220 		if (orig)
1221 			call_rcu_bh(&orig->dst.rcu_head, rt_release_rcu);
1222 	}
1223 }
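
/* The cmpxchg() above publishes rt only if no one raced us: on success the
 * new route gains a reference and the old one is freed only after an RCU-bh
 * grace period, so lockless readers never dereference a released dst.
 */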
1224 
1225 static bool rt_cache_valid(struct rtable *rt)
1226 {
1227 	return (rt && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK);
1228 }
1229 
1230 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1231 			   const struct fib_result *res,
1232 			   struct fib_nh_exception *fnhe,
1233 			   struct fib_info *fi, u16 type, u32 itag)
1234 {
1235 	if (fi) {
1236 		struct fib_nh *nh = &FIB_RES_NH(*res);
1237 
1238 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1239 			rt->rt_gateway = nh->nh_gw;
1240 		if (unlikely(fnhe))
1241 			rt_bind_exception(rt, fnhe, daddr);
1242 		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1243 #ifdef CONFIG_IP_ROUTE_CLASSID
1244 		rt->dst.tclassid = nh->nh_tclassid;
1245 #endif
1246 		if (!(rt->dst.flags & DST_HOST))
1247 			rt_cache_route(nh, rt);
1248 	}
1249 
1250 #ifdef CONFIG_IP_ROUTE_CLASSID
1251 #ifdef CONFIG_IP_MULTIPLE_TABLES
1252 	set_class_tag(rt, res->tclassid);
1253 #endif
1254 	set_class_tag(rt, itag);
1255 #endif
1256 }
1257 
1258 static struct rtable *rt_dst_alloc(struct net_device *dev,
1259 				   bool nopolicy, bool noxfrm, bool will_cache)
1260 {
1261 	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1262 			 (will_cache ? 0 : DST_HOST) | DST_NOCACHE |
1263 			 (nopolicy ? DST_NOPOLICY : 0) |
1264 			 (noxfrm ? DST_NOXFRM : 0));
1265 }
1266 
1267 /* called in rcu_read_lock() section */
1268 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1269 				u8 tos, struct net_device *dev, int our)
1270 {
1271 	struct rtable *rth;
1272 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1273 	u32 itag = 0;
1274 	int err;
1275 
1276 	/* Primary sanity checks. */
1277 
1278 	if (in_dev == NULL)
1279 		return -EINVAL;
1280 
1281 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1282 	    skb->protocol != htons(ETH_P_IP))
1283 		goto e_inval;
1284 
1285 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1286 		if (ipv4_is_loopback(saddr))
1287 			goto e_inval;
1288 
1289 	if (ipv4_is_zeronet(saddr)) {
1290 		if (!ipv4_is_local_multicast(daddr))
1291 			goto e_inval;
1292 	} else {
1293 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1294 					  in_dev, &itag);
1295 		if (err < 0)
1296 			goto e_err;
1297 	}
1298 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1299 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1300 	if (!rth)
1301 		goto e_nobufs;
1302 
1303 #ifdef CONFIG_IP_ROUTE_CLASSID
1304 	rth->dst.tclassid = itag;
1305 #endif
1306 	rth->dst.output = ip_rt_bug;
1307 
1308 	rth->rt_genid	= rt_genid(dev_net(dev));
1309 	rth->rt_flags	= RTCF_MULTICAST;
1310 	rth->rt_type	= RTN_MULTICAST;
1311 	rth->rt_is_input= 1;
1312 	rth->rt_iif	= 0;
1313 	rth->rt_pmtu	= 0;
1314 	rth->rt_gateway	= 0;
1315 	if (our) {
1316 		rth->dst.input= ip_local_deliver;
1317 		rth->rt_flags |= RTCF_LOCAL;
1318 	}
1319 
1320 #ifdef CONFIG_IP_MROUTE
1321 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1322 		rth->dst.input = ip_mr_input;
1323 #endif
1324 	RT_CACHE_STAT_INC(in_slow_mc);
1325 
1326 	skb_dst_set(skb, &rth->dst);
1327 	return 0;
1328 
1329 e_nobufs:
1330 	return -ENOBUFS;
1331 e_inval:
1332 	return -EINVAL;
1333 e_err:
1334 	return err;
1335 }
1336 
1337 
1338 static void ip_handle_martian_source(struct net_device *dev,
1339 				     struct in_device *in_dev,
1340 				     struct sk_buff *skb,
1341 				     __be32 daddr,
1342 				     __be32 saddr)
1343 {
1344 	RT_CACHE_STAT_INC(in_martian_src);
1345 #ifdef CONFIG_IP_ROUTE_VERBOSE
1346 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1347 		/*
1348 		 *	RFC 1812 recommendation: if the source is martian,
1349 		 *	the only hint is the MAC header.
1350 		 */
1351 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1352 			&daddr, &saddr, dev->name);
1353 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1354 			print_hex_dump(KERN_WARNING, "ll header: ",
1355 				       DUMP_PREFIX_OFFSET, 16, 1,
1356 				       skb_mac_header(skb),
1357 				       dev->hard_header_len, true);
1358 		}
1359 	}
1360 #endif
1361 }
1362 
1363 /* called in rcu_read_lock() section */
1364 static int __mkroute_input(struct sk_buff *skb,
1365 			   const struct fib_result *res,
1366 			   struct in_device *in_dev,
1367 			   __be32 daddr, __be32 saddr, u32 tos,
1368 			   struct rtable **result)
1369 {
1370 	struct rtable *rth;
1371 	int err;
1372 	struct in_device *out_dev;
1373 	unsigned int flags = 0;
1374 	bool do_cache;
1375 	u32 itag;
1376 
1377 	/* get a working reference to the output device */
1378 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1379 	if (out_dev == NULL) {
1380 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1381 		return -EINVAL;
1382 	}
1383 
1384 
1385 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1386 				  in_dev->dev, in_dev, &itag);
1387 	if (err < 0) {
1388 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1389 					 saddr);
1390 
1391 		goto cleanup;
1392 	}
1393 
1394 	if (out_dev == in_dev && err &&
1395 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1396 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1397 		flags |= RTCF_DOREDIRECT;
1398 
1399 	if (skb->protocol != htons(ETH_P_IP)) {
1400 		/* Not IP (i.e. ARP). Do not create a route if it is
1401 		 * invalid for proxy ARP. DNAT routes are always valid.
1402 		 *
1403 		 * The proxy ARP feature has been extended to allow ARP
1404 		 * replies back to the same interface, to support
1405 		 * Private VLAN switch technologies. See arp.c.
1406 		 */
1407 		if (out_dev == in_dev &&
1408 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1409 			err = -EINVAL;
1410 			goto cleanup;
1411 		}
1412 	}
1413 
1414 	do_cache = false;
1415 	if (res->fi) {
1416 		if (!itag) {
1417 			rth = FIB_RES_NH(*res).nh_rth_input;
1418 			if (rt_cache_valid(rth)) {
1419 				dst_hold(&rth->dst);
1420 				goto out;
1421 			}
1422 			do_cache = true;
1423 		}
1424 	}
1425 
1426 	rth = rt_dst_alloc(out_dev->dev,
1427 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1428 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1429 	if (!rth) {
1430 		err = -ENOBUFS;
1431 		goto cleanup;
1432 	}
1433 
1434 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1435 	rth->rt_flags = flags;
1436 	rth->rt_type = res->type;
1437 	rth->rt_is_input = 1;
1438 	rth->rt_iif 	= 0;
1439 	rth->rt_pmtu	= 0;
1440 	rth->rt_gateway	= 0;
1441 
1442 	rth->dst.input = ip_forward;
1443 	rth->dst.output = ip_output;
1444 
1445 	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1446 out:
1447 	*result = rth;
1448 	err = 0;
1449  cleanup:
1450 	return err;
1451 }
1452 
1453 static int ip_mkroute_input(struct sk_buff *skb,
1454 			    struct fib_result *res,
1455 			    const struct flowi4 *fl4,
1456 			    struct in_device *in_dev,
1457 			    __be32 daddr, __be32 saddr, u32 tos)
1458 {
1459 	struct rtable *rth = NULL;
1460 	int err;
1461 
1462 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1463 	if (res->fi && res->fi->fib_nhs > 1)
1464 		fib_select_multipath(res);
1465 #endif
1466 
1467 	/* create a routing cache entry */
1468 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1469 	if (err)
1470 		return err;
1471 
1472 	skb_dst_set(skb, &rth->dst);
1473 	return 0;
1474 }
1475 
1476 /*
1477  *	NOTE. We drop all packets that have local source
1478  *	addresses, because every properly looped-back packet
1479  *	must have the correct destination already attached by the output routine.
1480  *
1481  *	Such an approach solves two big problems:
1482  *	1. Non-simplex devices are handled properly.
1483  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1484  *	called with rcu_read_lock()
1485  */
1486 
1487 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1488 			       u8 tos, struct net_device *dev)
1489 {
1490 	struct fib_result res;
1491 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1492 	struct flowi4	fl4;
1493 	unsigned int	flags = 0;
1494 	u32		itag = 0;
1495 	struct rtable	*rth;
1496 	int		err = -EINVAL;
1497 	struct net    *net = dev_net(dev);
1498 	bool do_cache;
1499 
1500 	/* IP on this device is disabled. */
1501 
1502 	if (!in_dev)
1503 		goto out;
1504 
1505 	/* Check for the weirdest martians, which cannot be detected
1506 	   by fib_lookup.
1507 	 */
1508 
1509 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1510 		goto martian_source;
1511 
1512 	res.fi = NULL;
1513 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1514 		goto brd_input;
1515 
1516 	/* Accept zero addresses only for limited broadcast;
1517 	 * it is unclear whether this should be fixed. Waiting for complaints :-)
1518 	 */
1519 	if (ipv4_is_zeronet(saddr))
1520 		goto martian_source;
1521 
1522 	if (ipv4_is_zeronet(daddr))
1523 		goto martian_destination;
1524 
1525 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
1526 		if (ipv4_is_loopback(daddr))
1527 			goto martian_destination;
1528 
1529 		if (ipv4_is_loopback(saddr))
1530 			goto martian_source;
1531 	}
1532 
1533 	/*
1534 	 *	Now we are ready to route the packet.
1535 	 */
1536 	fl4.flowi4_oif = 0;
1537 	fl4.flowi4_iif = dev->ifindex;
1538 	fl4.flowi4_mark = skb->mark;
1539 	fl4.flowi4_tos = tos;
1540 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1541 	fl4.daddr = daddr;
1542 	fl4.saddr = saddr;
1543 	err = fib_lookup(net, &fl4, &res);
1544 	if (err != 0)
1545 		goto no_route;
1546 
1547 	RT_CACHE_STAT_INC(in_slow_tot);
1548 
1549 	if (res.type == RTN_BROADCAST)
1550 		goto brd_input;
1551 
1552 	if (res.type == RTN_LOCAL) {
1553 		err = fib_validate_source(skb, saddr, daddr, tos,
1554 					  net->loopback_dev->ifindex,
1555 					  dev, in_dev, &itag);
1556 		if (err < 0)
1557 			goto martian_source_keep_err;
1558 		goto local_input;
1559 	}
1560 
1561 	if (!IN_DEV_FORWARD(in_dev))
1562 		goto no_route;
1563 	if (res.type != RTN_UNICAST)
1564 		goto martian_destination;
1565 
1566 	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1567 out:	return err;
1568 
1569 brd_input:
1570 	if (skb->protocol != htons(ETH_P_IP))
1571 		goto e_inval;
1572 
1573 	if (!ipv4_is_zeronet(saddr)) {
1574 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1575 					  in_dev, &itag);
1576 		if (err < 0)
1577 			goto martian_source_keep_err;
1578 	}
1579 	flags |= RTCF_BROADCAST;
1580 	res.type = RTN_BROADCAST;
1581 	RT_CACHE_STAT_INC(in_brd);
1582 
1583 local_input:
1584 	do_cache = false;
1585 	if (res.fi) {
1586 		if (!itag) {
1587 			rth = FIB_RES_NH(res).nh_rth_input;
1588 			if (rt_cache_valid(rth)) {
1589 				dst_hold(&rth->dst);
1590 				goto set_and_out;
1591 			}
1592 			do_cache = true;
1593 		}
1594 	}
1595 
1596 	rth = rt_dst_alloc(net->loopback_dev,
1597 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1598 	if (!rth)
1599 		goto e_nobufs;
1600 
1601 	rth->dst.input= ip_local_deliver;
1602 	rth->dst.output= ip_rt_bug;
1603 #ifdef CONFIG_IP_ROUTE_CLASSID
1604 	rth->dst.tclassid = itag;
1605 #endif
1606 
1607 	rth->rt_genid = rt_genid(net);
1608 	rth->rt_flags 	= flags|RTCF_LOCAL;
1609 	rth->rt_type	= res.type;
1610 	rth->rt_is_input = 1;
1611 	rth->rt_iif	= 0;
1612 	rth->rt_pmtu	= 0;
1613 	rth->rt_gateway	= 0;
1614 	if (res.type == RTN_UNREACHABLE) {
1615 		rth->dst.input= ip_error;
1616 		rth->dst.error= -err;
1617 		rth->rt_flags 	&= ~RTCF_LOCAL;
1618 	}
1619 	if (do_cache)
1620 		rt_cache_route(&FIB_RES_NH(res), rth);
1621 set_and_out:
1622 	skb_dst_set(skb, &rth->dst);
1623 	err = 0;
1624 	goto out;
1625 
1626 no_route:
1627 	RT_CACHE_STAT_INC(in_no_route);
1628 	res.type = RTN_UNREACHABLE;
1629 	if (err == -ESRCH)
1630 		err = -ENETUNREACH;
1631 	goto local_input;
1632 
1633 	/*
1634 	 *	Do not cache martian addresses: they should be logged (RFC1812)
1635 	 */
1636 martian_destination:
1637 	RT_CACHE_STAT_INC(in_martian_dst);
1638 #ifdef CONFIG_IP_ROUTE_VERBOSE
1639 	if (IN_DEV_LOG_MARTIANS(in_dev))
1640 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1641 				     &daddr, &saddr, dev->name);
1642 #endif
1643 
1644 e_inval:
1645 	err = -EINVAL;
1646 	goto out;
1647 
1648 e_nobufs:
1649 	err = -ENOBUFS;
1650 	goto out;
1651 
1652 martian_source:
1653 	err = -EINVAL;
1654 martian_source_keep_err:
1655 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1656 	goto out;
1657 }
1658 
1659 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1660 		   u8 tos, struct net_device *dev)
1661 {
1662 	int res;
1663 
1664 	rcu_read_lock();
1665 
1666 	/* Multicast recognition logic is moved from the route cache to here.
1667 	   The problem was that too many Ethernet cards have broken/missing
1668 	   hardware multicast filters :-( As a result, a host on a multicast
1669 	   network acquires a lot of useless route cache entries, e.g. from
1670 	   SDR messages from all over the world. Now we try to get rid of them.
1671 	   Really, provided the software IP multicast filter is organized
1672 	   reasonably (at least, hashed), it does not result in a slowdown
1673 	   compared with route cache reject entries.
1674 	   Note that multicast routers are not affected, because a
1675 	   route cache entry is created eventually.
1676 	 */
1677 	if (ipv4_is_multicast(daddr)) {
1678 		struct in_device *in_dev = __in_dev_get_rcu(dev);
1679 
1680 		if (in_dev) {
1681 			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1682 						  ip_hdr(skb)->protocol);
1683 			if (our
1684 #ifdef CONFIG_IP_MROUTE
1685 				||
1686 			    (!ipv4_is_local_multicast(daddr) &&
1687 			     IN_DEV_MFORWARD(in_dev))
1688 #endif
1689 			   ) {
1690 				int res = ip_route_input_mc(skb, daddr, saddr,
1691 							    tos, dev, our);
1692 				rcu_read_unlock();
1693 				return res;
1694 			}
1695 		}
1696 		rcu_read_unlock();
1697 		return -EINVAL;
1698 	}
1699 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1700 	rcu_read_unlock();
1701 	return res;
1702 }
1703 EXPORT_SYMBOL(ip_route_input);
1704 
1705 /* called with rcu_read_lock() */
1706 static struct rtable *__mkroute_output(const struct fib_result *res,
1707 				       const struct flowi4 *fl4, int orig_oif,
1708 				       struct net_device *dev_out,
1709 				       unsigned int flags)
1710 {
1711 	struct fib_info *fi = res->fi;
1712 	struct fib_nh_exception *fnhe;
1713 	struct in_device *in_dev;
1714 	u16 type = res->type;
1715 	struct rtable *rth;
1716 
1717 	in_dev = __in_dev_get_rcu(dev_out);
1718 	if (!in_dev)
1719 		return ERR_PTR(-EINVAL);
1720 
1721 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1722 		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1723 			return ERR_PTR(-EINVAL);
1724 
1725 	if (ipv4_is_lbcast(fl4->daddr))
1726 		type = RTN_BROADCAST;
1727 	else if (ipv4_is_multicast(fl4->daddr))
1728 		type = RTN_MULTICAST;
1729 	else if (ipv4_is_zeronet(fl4->daddr))
1730 		return ERR_PTR(-EINVAL);
1731 
1732 	if (dev_out->flags & IFF_LOOPBACK)
1733 		flags |= RTCF_LOCAL;
1734 
1735 	if (type == RTN_BROADCAST) {
1736 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
1737 		fi = NULL;
1738 	} else if (type == RTN_MULTICAST) {
1739 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
1740 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1741 				     fl4->flowi4_proto))
1742 			flags &= ~RTCF_LOCAL;
1743 		/* If a multicast route does not exist, use the
1744 		 * default one, but do not use a gateway in this case.
1745 		 * Yes, it is a hack.
1746 		 */
1747 		if (fi && res->prefixlen < 4)
1748 			fi = NULL;
1749 	}
1750 
1751 	fnhe = NULL;
1752 	if (fi) {
1753 		fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1754 		if (!fnhe) {
1755 			rth = FIB_RES_NH(*res).nh_rth_output;
1756 			if (rt_cache_valid(rth)) {
1757 				dst_hold(&rth->dst);
1758 				return rth;
1759 			}
1760 		}
1761 	}
1762 	rth = rt_dst_alloc(dev_out,
1763 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1764 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
1765 			   fi && !fnhe);
1766 	if (!rth)
1767 		return ERR_PTR(-ENOBUFS);
1768 
1769 	rth->dst.output = ip_output;
1770 
1771 	rth->rt_genid = rt_genid(dev_net(dev_out));
1772 	rth->rt_flags	= flags;
1773 	rth->rt_type	= type;
1774 	rth->rt_is_input = 0;
1775 	rth->rt_iif	= orig_oif ? : 0;
1776 	rth->rt_pmtu	= 0;
1777 	rth->rt_gateway = 0;
1778 
1779 	RT_CACHE_STAT_INC(out_slow_tot);
1780 
1781 	if (flags & RTCF_LOCAL)
1782 		rth->dst.input = ip_local_deliver;
1783 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1784 		if (flags & RTCF_LOCAL &&
1785 		    !(dev_out->flags & IFF_LOOPBACK)) {
1786 			rth->dst.output = ip_mc_output;
1787 			RT_CACHE_STAT_INC(out_slow_mc);
1788 		}
1789 #ifdef CONFIG_IP_MROUTE
1790 		if (type == RTN_MULTICAST) {
1791 			if (IN_DEV_MFORWARD(in_dev) &&
1792 			    !ipv4_is_local_multicast(fl4->daddr)) {
1793 				rth->dst.input = ip_mr_input;
1794 				rth->dst.output = ip_mc_output;
1795 			}
1796 		}
1797 #endif
1798 	}
1799 
1800 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1801 
1802 	return rth;
1803 }
1804 
1805 /*
1806  * Major route resolver routine.
1807  */
1808 
1809 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1810 {
1811 	struct net_device *dev_out = NULL;
1812 	__u8 tos = RT_FL_TOS(fl4);
1813 	unsigned int flags = 0;
1814 	struct fib_result res;
1815 	struct rtable *rth;
1816 	int orig_oif;
1817 
1818 	res.tclassid	= 0;
1819 	res.fi		= NULL;
1820 	res.table	= NULL;
1821 
1822 	orig_oif = fl4->flowi4_oif;
1823 
1824 	fl4->flowi4_iif = net->loopback_dev->ifindex;
1825 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1826 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1827 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1828 
1829 	rcu_read_lock();
1830 	if (fl4->saddr) {
1831 		rth = ERR_PTR(-EINVAL);
1832 		if (ipv4_is_multicast(fl4->saddr) ||
1833 		    ipv4_is_lbcast(fl4->saddr) ||
1834 		    ipv4_is_zeronet(fl4->saddr))
1835 			goto out;
1836 
1837 		/* I removed check for oif == dev_out->oif here.
1838 		   It was wrong for two reasons:
1839 		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1840 		      is assigned to multiple interfaces.
1841 		   2. Moreover, we are allowed to send packets with saddr
1842 		      of another iface. --ANK
1843 		 */
1844 
1845 		if (fl4->flowi4_oif == 0 &&
1846 		    (ipv4_is_multicast(fl4->daddr) ||
1847 		     ipv4_is_lbcast(fl4->daddr))) {
1848 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1849 			dev_out = __ip_dev_find(net, fl4->saddr, false);
1850 			if (dev_out == NULL)
1851 				goto out;
1852 
1853 			/* Special hack: the user can direct multicasts
1854 			   and limited broadcast via the necessary interface
1855 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1856 			   This hack is not just for fun, it allows
1857 			   vic, vat and friends to work.
1858 			   They bind a socket to loopback, set the ttl to zero
1859 			   and expect that it will work.
1860 			   From the viewpoint of the routing cache they are broken,
1861 			   because we are not allowed to build a multicast path
1862 			   with a loopback source addr (the routing cache
1863 			   cannot know that the ttl is zero, so the packet
1864 			   will not leave this host and the route looks valid).
1865 			   Luckily, this hack is a good workaround.
1866 			 */
1867 
1868 			fl4->flowi4_oif = dev_out->ifindex;
1869 			goto make_route;
1870 		}
1871 
1872 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1873 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1874 			if (!__ip_dev_find(net, fl4->saddr, false))
1875 				goto out;
1876 		}
1877 	}
1878 
1879 
1880 	if (fl4->flowi4_oif) {
1881 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1882 		rth = ERR_PTR(-ENODEV);
1883 		if (dev_out == NULL)
1884 			goto out;
1885 
1886 		/* RACE: Check return value of inet_select_addr instead. */
1887 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1888 			rth = ERR_PTR(-ENETUNREACH);
1889 			goto out;
1890 		}
1891 		if (ipv4_is_local_multicast(fl4->daddr) ||
1892 		    ipv4_is_lbcast(fl4->daddr)) {
1893 			if (!fl4->saddr)
1894 				fl4->saddr = inet_select_addr(dev_out, 0,
1895 							      RT_SCOPE_LINK);
1896 			goto make_route;
1897 		}
1898 		if (fl4->saddr) {
1899 			if (ipv4_is_multicast(fl4->daddr))
1900 				fl4->saddr = inet_select_addr(dev_out, 0,
1901 							      fl4->flowi4_scope);
1902 			else if (!fl4->daddr)
1903 				fl4->saddr = inet_select_addr(dev_out, 0,
1904 							      RT_SCOPE_HOST);
1905 		}
1906 	}
1907 
1908 	if (!fl4->daddr) {
1909 		fl4->daddr = fl4->saddr;
1910 		if (!fl4->daddr)
1911 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1912 		dev_out = net->loopback_dev;
1913 		fl4->flowi4_oif = net->loopback_dev->ifindex;
1914 		res.type = RTN_LOCAL;
1915 		flags |= RTCF_LOCAL;
1916 		goto make_route;
1917 	}
1918 
1919 	if (fib_lookup(net, fl4, &res)) {
1920 		res.fi = NULL;
1921 		res.table = NULL;
1922 		if (fl4->flowi4_oif) {
1923 			/* Apparently, the routing tables are wrong. Assume
1924 			   that the destination is on-link.
1925 
1926 			   WHY? DW.
1927 			   Because we are allowed to send to an iface
1928 			   even if it has NO routes and NO assigned
1929 			   addresses. When oif is specified, the routing
1930 			   tables are looked up with only one purpose:
1931 			   to catch whether the destination is gatewayed, rather than
1932 			   direct. Moreover, if MSG_DONTROUTE is set,
1933 			   we send the packet, ignoring both routing tables
1934 			   and ifaddr state. --ANK
1935 
1936 
1937 			   We could do this even if oif is unknown,
1938 			   as IPv6 likely does, but we do not.
1939 			 */
1940 
			if (!fl4->saddr)
1942 				fl4->saddr = inet_select_addr(dev_out, 0,
1943 							      RT_SCOPE_LINK);
1944 			res.type = RTN_UNICAST;
1945 			goto make_route;
1946 		}
1947 		rth = ERR_PTR(-ENETUNREACH);
1948 		goto out;
1949 	}
1950 
1951 	if (res.type == RTN_LOCAL) {
1952 		if (!fl4->saddr) {
1953 			if (res.fi->fib_prefsrc)
1954 				fl4->saddr = res.fi->fib_prefsrc;
1955 			else
1956 				fl4->saddr = fl4->daddr;
1957 		}
1958 		dev_out = net->loopback_dev;
1959 		fl4->flowi4_oif = dev_out->ifindex;
1960 		res.fi = NULL;
1961 		flags |= RTCF_LOCAL;
1962 		goto make_route;
1963 	}
1964 
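	/* Several next hops or several default routes to choose from:
	 * let the FIB pick one, unless the caller pinned the interface.
	 */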
1965 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1966 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
1967 		fib_select_multipath(&res);
1968 	else
1969 #endif
1970 	if (!res.prefixlen &&
1971 	    res.table->tb_num_default > 1 &&
1972 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
1973 		fib_select_default(&res);
1974 
1975 	if (!fl4->saddr)
1976 		fl4->saddr = FIB_RES_PREFSRC(net, res);
1977 
1978 	dev_out = FIB_RES_DEV(res);
1979 	fl4->flowi4_oif = dev_out->ifindex;
1982 make_route:
1983 	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
1984 
1985 out:
1986 	rcu_read_unlock();
1987 	return rth;
1988 }
1989 EXPORT_SYMBOL_GPL(__ip_route_output_key);
1990 
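/*
 * Blackhole dst_ops: a dst that accepts every packet handed to it and
 * silently discards it. All callbacks below are deliberate no-ops, and
 * the check operation always reports the entry as invalid.
 */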
1991 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
1992 {
1993 	return NULL;
1994 }
1995 
1996 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
1997 {
1998 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1999 
2000 	return mtu ? : dst->dev->mtu;
2001 }
2002 
2003 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2004 					  struct sk_buff *skb, u32 mtu)
2005 {
2006 }
2007 
2008 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2009 				       struct sk_buff *skb)
2010 {
2011 }
2012 
2013 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2014 					  unsigned long old)
2015 {
2016 	return NULL;
2017 }
2018 
2019 static struct dst_ops ipv4_dst_blackhole_ops = {
2020 	.family			=	AF_INET,
2021 	.protocol		=	cpu_to_be16(ETH_P_IP),
2022 	.check			=	ipv4_blackhole_dst_check,
2023 	.mtu			=	ipv4_blackhole_mtu,
2024 	.default_advmss		=	ipv4_default_advmss,
2025 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2026 	.redirect		=	ipv4_rt_blackhole_redirect,
2027 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2028 	.neigh_lookup		=	ipv4_neigh_lookup,
2029 };
2030 
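/*
 * Clone @dst_orig into a blackhole route: the copy keeps the original
 * routing metadata but discards every packet sent through it. Used,
 * for instance, by xfrm for non-blocking sockets while an IPsec SA is
 * still being negotiated.
 */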
2031 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2032 {
2033 	struct rtable *ort = (struct rtable *) dst_orig;
2034 	struct rtable *rt;
2035 
2036 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2037 	if (rt) {
2038 		struct dst_entry *new = &rt->dst;
2039 
2040 		new->__use = 1;
2041 		new->input = dst_discard;
2042 		new->output = dst_discard;
2043 
2044 		new->dev = ort->dst.dev;
2045 		if (new->dev)
2046 			dev_hold(new->dev);
2047 
2048 		rt->rt_is_input = ort->rt_is_input;
2049 		rt->rt_iif = ort->rt_iif;
2050 		rt->rt_pmtu = ort->rt_pmtu;
2051 
2052 		rt->rt_genid = rt_genid(net);
2053 		rt->rt_flags = ort->rt_flags;
2054 		rt->rt_type = ort->rt_type;
2055 		rt->rt_gateway = ort->rt_gateway;
2056 
2057 		dst_free(new);
2058 	}
2059 
2060 	dst_release(dst_orig);
2061 
2062 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2063 }
2064 
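/*
 * Resolve an output route for @flp4; if a transport protocol is set,
 * the result is additionally passed through xfrm_lookup() so that any
 * matching IPsec policy can rewrite it. A minimal usage sketch,
 * assuming process context, a valid struct net pointer and an
 * arbitrary example destination:
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= htonl(0xc0a80001),
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, NULL);
 *
 *	if (!IS_ERR(rt))
 *		ip_rt_put(rt);
 */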
2065 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2066 				    struct sock *sk)
2067 {
2068 	struct rtable *rt = __ip_route_output_key(net, flp4);
2069 
2070 	if (IS_ERR(rt))
2071 		return rt;
2072 
2073 	if (flp4->flowi4_proto)
2074 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2075 						   flowi4_to_flowi(flp4),
2076 						   sk, 0);
2077 
2078 	return rt;
2079 }
2080 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2081 
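/*
 * Build an RTM_NEWROUTE netlink message describing the route attached
 * to @skb. Returns the resulting message length, or -EMSGSIZE when the
 * attributes do not fit.
 */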
2082 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2083 			struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
2084 			u32 seq, int event, int nowait, unsigned int flags)
2085 {
2086 	struct rtable *rt = skb_rtable(skb);
2087 	struct rtmsg *r;
2088 	struct nlmsghdr *nlh;
2089 	unsigned long expires = 0;
2090 	u32 error;
2091 	u32 metrics[RTAX_MAX];
2092 
2093 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2094 	if (nlh == NULL)
2095 		return -EMSGSIZE;
2096 
2097 	r = nlmsg_data(nlh);
2098 	r->rtm_family	 = AF_INET;
2099 	r->rtm_dst_len	= 32;
2100 	r->rtm_src_len	= 0;
2101 	r->rtm_tos	= fl4->flowi4_tos;
2102 	r->rtm_table	= RT_TABLE_MAIN;
2103 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2104 		goto nla_put_failure;
2105 	r->rtm_type	= rt->rt_type;
2106 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2107 	r->rtm_protocol = RTPROT_UNSPEC;
2108 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2109 	if (rt->rt_flags & RTCF_NOTIFY)
2110 		r->rtm_flags |= RTM_F_NOTIFY;
2111 
2112 	if (nla_put_be32(skb, RTA_DST, dst))
2113 		goto nla_put_failure;
2114 	if (src) {
2115 		r->rtm_src_len = 32;
2116 		if (nla_put_be32(skb, RTA_SRC, src))
2117 			goto nla_put_failure;
2118 	}
2119 	if (rt->dst.dev &&
2120 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2121 		goto nla_put_failure;
2122 #ifdef CONFIG_IP_ROUTE_CLASSID
2123 	if (rt->dst.tclassid &&
2124 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2125 		goto nla_put_failure;
2126 #endif
2127 	if (!rt_is_input_route(rt) &&
2128 	    fl4->saddr != src) {
2129 		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2130 			goto nla_put_failure;
2131 	}
2132 	if (rt->rt_gateway &&
2133 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2134 		goto nla_put_failure;
2135 
2136 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2137 	if (rt->rt_pmtu)
2138 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2139 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2140 		goto nla_put_failure;
2141 
2142 	if (fl4->flowi4_mark &&
2143 	    nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
2144 		goto nla_put_failure;
2145 
2146 	error = rt->dst.error;
2147 	expires = rt->dst.expires;
2148 	if (expires) {
2149 		if (time_before(jiffies, expires))
2150 			expires -= jiffies;
2151 		else
2152 			expires = 0;
2153 	}
2154 
2155 	if (rt_is_input_route(rt)) {
2156 		if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2157 			goto nla_put_failure;
2158 	}
2159 
2160 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2161 		goto nla_put_failure;
2162 
2163 	return nlmsg_end(skb, nlh);
2164 
2165 nla_put_failure:
2166 	nlmsg_cancel(skb, nlh);
2167 	return -EMSGSIZE;
2168 }
2169 
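/*
 * RTM_GETROUTE handler: perform a single route lookup on behalf of
 * userspace (this is what e.g. "ip route get" from iproute2 issues)
 * and unicast the answer back. If RTA_IIF is given, the lookup goes
 * through the input path, otherwise through the output path.
 */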
2170 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2171 {
2172 	struct net *net = sock_net(in_skb->sk);
2173 	struct rtmsg *rtm;
2174 	struct nlattr *tb[RTA_MAX+1];
2175 	struct rtable *rt = NULL;
2176 	struct flowi4 fl4;
2177 	__be32 dst = 0;
2178 	__be32 src = 0;
2179 	u32 iif;
2180 	int err;
2181 	int mark;
2182 	struct sk_buff *skb;
2183 
2184 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2185 	if (err < 0)
2186 		goto errout;
2187 
2188 	rtm = nlmsg_data(nlh);
2189 
2190 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2191 	if (skb == NULL) {
2192 		err = -ENOBUFS;
2193 		goto errout;
2194 	}
2195 
	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
2199 	skb_reset_mac_header(skb);
2200 	skb_reset_network_header(skb);
2201 
2202 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2203 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2204 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2205 
2206 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2207 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2208 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2209 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2210 
2211 	memset(&fl4, 0, sizeof(fl4));
2212 	fl4.daddr = dst;
2213 	fl4.saddr = src;
2214 	fl4.flowi4_tos = rtm->rtm_tos;
2215 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2216 	fl4.flowi4_mark = mark;
2217 
2218 	if (iif) {
2219 		struct net_device *dev;
2220 
2221 		dev = __dev_get_by_index(net, iif);
2222 		if (dev == NULL) {
2223 			err = -ENODEV;
2224 			goto errout_free;
2225 		}
2226 
2227 		skb->protocol	= htons(ETH_P_IP);
2228 		skb->dev	= dev;
2229 		skb->mark	= mark;
2230 		local_bh_disable();
2231 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2232 		local_bh_enable();
2233 
2234 		rt = skb_rtable(skb);
2235 		if (err == 0 && rt->dst.error)
2236 			err = -rt->dst.error;
2237 	} else {
2238 		rt = ip_route_output_key(net, &fl4);
2239 
2240 		err = 0;
2241 		if (IS_ERR(rt))
2242 			err = PTR_ERR(rt);
2243 	}
2244 
2245 	if (err)
2246 		goto errout_free;
2247 
2248 	skb_dst_set(skb, &rt->dst);
2249 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2250 		rt->rt_flags |= RTCF_NOTIFY;
2251 
2252 	err = rt_fill_info(net, dst, src, &fl4, skb,
2253 			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2254 			   RTM_NEWROUTE, 0, 0);
2255 	if (err <= 0)
2256 		goto errout_free;
2257 
2258 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2259 errout:
2260 	return err;
2261 
2262 errout_free:
2263 	kfree_skb(skb);
2264 	goto errout;
2265 }
2266 
2267 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2268 {
2269 	return skb->len;
2270 }
2271 
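/* Multicast configuration of a device changed: flush cached routes. */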
2272 void ip_rt_multicast_event(struct in_device *in_dev)
2273 {
2274 	rt_cache_flush(dev_net(in_dev->dev), 0);
2275 }
2276 
2277 #ifdef CONFIG_SYSCTL
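/*
 * Handler behind the write-only /proc/sys/net/ipv4/route/flush file.
 * Writing to it, e.g.
 *
 *	echo -1 > /proc/sys/net/ipv4/route/flush
 *
 * flushes the routing cache of the writer's namespace; the written
 * value is passed on as the flush delay.
 */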
2278 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2279 					void __user *buffer,
2280 					size_t *lenp, loff_t *ppos)
2281 {
2282 	if (write) {
2283 		int flush_delay;
2284 		ctl_table ctl;
2285 		struct net *net;
2286 
2287 		memcpy(&ctl, __ctl, sizeof(ctl));
2288 		ctl.data = &flush_delay;
2289 		proc_dointvec(&ctl, write, buffer, lenp, ppos);
2290 
2291 		net = (struct net *)__ctl->extra1;
2292 		rt_cache_flush(net, flush_delay);
2293 		return 0;
2294 	}
2295 
2296 	return -EINVAL;
2297 }
2298 
2299 static ctl_table ipv4_route_table[] = {
2300 	{
2301 		.procname	= "gc_thresh",
2302 		.data		= &ipv4_dst_ops.gc_thresh,
2303 		.maxlen		= sizeof(int),
2304 		.mode		= 0644,
2305 		.proc_handler	= proc_dointvec,
2306 	},
2307 	{
2308 		.procname	= "max_size",
2309 		.data		= &ip_rt_max_size,
2310 		.maxlen		= sizeof(int),
2311 		.mode		= 0644,
2312 		.proc_handler	= proc_dointvec,
2313 	},
2314 	{
		/* Deprecated. Use gc_min_interval_ms. */
2317 		.procname	= "gc_min_interval",
2318 		.data		= &ip_rt_gc_min_interval,
2319 		.maxlen		= sizeof(int),
2320 		.mode		= 0644,
2321 		.proc_handler	= proc_dointvec_jiffies,
2322 	},
2323 	{
2324 		.procname	= "gc_min_interval_ms",
2325 		.data		= &ip_rt_gc_min_interval,
2326 		.maxlen		= sizeof(int),
2327 		.mode		= 0644,
2328 		.proc_handler	= proc_dointvec_ms_jiffies,
2329 	},
2330 	{
2331 		.procname	= "gc_timeout",
2332 		.data		= &ip_rt_gc_timeout,
2333 		.maxlen		= sizeof(int),
2334 		.mode		= 0644,
2335 		.proc_handler	= proc_dointvec_jiffies,
2336 	},
2337 	{
2338 		.procname	= "gc_interval",
2339 		.data		= &ip_rt_gc_interval,
2340 		.maxlen		= sizeof(int),
2341 		.mode		= 0644,
2342 		.proc_handler	= proc_dointvec_jiffies,
2343 	},
2344 	{
2345 		.procname	= "redirect_load",
2346 		.data		= &ip_rt_redirect_load,
2347 		.maxlen		= sizeof(int),
2348 		.mode		= 0644,
2349 		.proc_handler	= proc_dointvec,
2350 	},
2351 	{
2352 		.procname	= "redirect_number",
2353 		.data		= &ip_rt_redirect_number,
2354 		.maxlen		= sizeof(int),
2355 		.mode		= 0644,
2356 		.proc_handler	= proc_dointvec,
2357 	},
2358 	{
2359 		.procname	= "redirect_silence",
2360 		.data		= &ip_rt_redirect_silence,
2361 		.maxlen		= sizeof(int),
2362 		.mode		= 0644,
2363 		.proc_handler	= proc_dointvec,
2364 	},
2365 	{
2366 		.procname	= "error_cost",
2367 		.data		= &ip_rt_error_cost,
2368 		.maxlen		= sizeof(int),
2369 		.mode		= 0644,
2370 		.proc_handler	= proc_dointvec,
2371 	},
2372 	{
2373 		.procname	= "error_burst",
2374 		.data		= &ip_rt_error_burst,
2375 		.maxlen		= sizeof(int),
2376 		.mode		= 0644,
2377 		.proc_handler	= proc_dointvec,
2378 	},
2379 	{
2380 		.procname	= "gc_elasticity",
2381 		.data		= &ip_rt_gc_elasticity,
2382 		.maxlen		= sizeof(int),
2383 		.mode		= 0644,
2384 		.proc_handler	= proc_dointvec,
2385 	},
2386 	{
2387 		.procname	= "mtu_expires",
2388 		.data		= &ip_rt_mtu_expires,
2389 		.maxlen		= sizeof(int),
2390 		.mode		= 0644,
2391 		.proc_handler	= proc_dointvec_jiffies,
2392 	},
2393 	{
2394 		.procname	= "min_pmtu",
2395 		.data		= &ip_rt_min_pmtu,
2396 		.maxlen		= sizeof(int),
2397 		.mode		= 0644,
2398 		.proc_handler	= proc_dointvec,
2399 	},
2400 	{
2401 		.procname	= "min_adv_mss",
2402 		.data		= &ip_rt_min_advmss,
2403 		.maxlen		= sizeof(int),
2404 		.mode		= 0644,
2405 		.proc_handler	= proc_dointvec,
2406 	},
2407 	{ }
2408 };
2409 
2410 static struct ctl_table ipv4_route_flush_table[] = {
2411 	{
2412 		.procname	= "flush",
2413 		.maxlen		= sizeof(int),
2414 		.mode		= 0200,
2415 		.proc_handler	= ipv4_sysctl_rtcache_flush,
2416 	},
2417 	{ },
2418 };
2419 
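/*
 * Register the per-namespace "flush" sysctl. Namespaces other than
 * init_net get a private copy of the table so that its extra1 field
 * can point at the owning struct net.
 */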
2420 static __net_init int sysctl_route_net_init(struct net *net)
2421 {
2422 	struct ctl_table *tbl;
2423 
2424 	tbl = ipv4_route_flush_table;
2425 	if (!net_eq(net, &init_net)) {
2426 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2427 		if (tbl == NULL)
2428 			goto err_dup;
2429 	}
2430 	tbl[0].extra1 = net;
2431 
2432 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2433 	if (net->ipv4.route_hdr == NULL)
2434 		goto err_reg;
2435 	return 0;
2436 
2437 err_reg:
2438 	if (tbl != ipv4_route_flush_table)
2439 		kfree(tbl);
2440 err_dup:
2441 	return -ENOMEM;
2442 }
2443 
2444 static __net_exit void sysctl_route_net_exit(struct net *net)
2445 {
2446 	struct ctl_table *tbl;
2447 
2448 	tbl = net->ipv4.route_hdr->ctl_table_arg;
2449 	unregister_net_sysctl_table(net->ipv4.route_hdr);
2450 	BUG_ON(tbl == ipv4_route_flush_table);
2451 	kfree(tbl);
2452 }
2453 
2454 static __net_initdata struct pernet_operations sysctl_route_ops = {
2455 	.init = sysctl_route_net_init,
2456 	.exit = sysctl_route_net_exit,
2457 };
2458 #endif
2459 
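/*
 * Seed the per-namespace route and device-address generation counters.
 * Bumping rt_genid later invalidates every cached route in the
 * namespace at once.
 */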
2460 static __net_init int rt_genid_init(struct net *net)
2461 {
2462 	get_random_bytes(&net->ipv4.rt_genid,
2463 			 sizeof(net->ipv4.rt_genid));
2464 	get_random_bytes(&net->ipv4.dev_addr_genid,
2465 			 sizeof(net->ipv4.dev_addr_genid));
2466 	return 0;
2467 }
2468 
2469 static __net_initdata struct pernet_operations rt_genid_ops = {
2470 	.init = rt_genid_init,
2471 };
2472 
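/*
 * Set up the per-namespace inetpeer base, which caches long-lived
 * per-destination state (ICMP rate limiting, for example).
 */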
2473 static int __net_init ipv4_inetpeer_init(struct net *net)
2474 {
2475 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2476 
2477 	if (!bp)
2478 		return -ENOMEM;
2479 	inet_peer_base_init(bp);
2480 	net->ipv4.peers = bp;
2481 	return 0;
2482 }
2483 
2484 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2485 {
2486 	struct inet_peer_base *bp = net->ipv4.peers;
2487 
2488 	net->ipv4.peers = NULL;
2489 	inetpeer_invalidate_tree(bp);
2490 	kfree(bp);
2491 }
2492 
2493 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2494 	.init	=	ipv4_inetpeer_init,
2495 	.exit	=	ipv4_inetpeer_exit,
2496 };
2497 
2498 #ifdef CONFIG_IP_ROUTE_CLASSID
2499 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2500 #endif /* CONFIG_IP_ROUTE_CLASSID */
2501 
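/*
 * Boot-time initialization of the IPv4 routing layer: the dst slab
 * cache and its counters, devinet and FIB setup, the /proc files, the
 * RTM_GETROUTE handler and the per-namespace operations above.
 */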
2502 int __init ip_rt_init(void)
2503 {
2504 	int rc = 0;
2505 
2506 #ifdef CONFIG_IP_ROUTE_CLASSID
2507 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2508 	if (!ip_rt_acct)
2509 		panic("IP: failed to allocate ip_rt_acct\n");
2510 #endif
2511 
2512 	ipv4_dst_ops.kmem_cachep =
2513 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2514 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2515 
2516 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2517 
2518 	if (dst_entries_init(&ipv4_dst_ops) < 0)
2519 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
2520 
2521 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2522 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2523 
2524 	ipv4_dst_ops.gc_thresh = ~0;
2525 	ip_rt_max_size = INT_MAX;
2526 
2527 	devinet_init();
2528 	ip_fib_init();
2529 
2530 	if (ip_rt_proc_init())
2531 		pr_err("Unable to create route proc files\n");
2532 #ifdef CONFIG_XFRM
2533 	xfrm_init();
2534 	xfrm4_init(ip_rt_max_size);
2535 #endif
2536 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2537 
2538 #ifdef CONFIG_SYSCTL
2539 	register_pernet_subsys(&sysctl_route_ops);
2540 #endif
2541 	register_pernet_subsys(&rt_genid_ops);
2542 	register_pernet_subsys(&ipv4_inetpeer_ops);
2543 	return rc;
2544 }
2545 
2546 #ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order; then all this
 * nonsense will go away.
 */
2551 void __init ip_static_sysctl_init(void)
2552 {
2553 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2554 }
2555 #endif
2556