1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD,
35  *					though our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
78 #include <linux/in.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/workqueue.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <linux/slab.h>
95 #include <linux/prefetch.h>
96 #include <net/dst.h>
97 #include <net/net_namespace.h>
98 #include <net/protocol.h>
99 #include <net/ip.h>
100 #include <net/route.h>
101 #include <net/inetpeer.h>
102 #include <net/sock.h>
103 #include <net/ip_fib.h>
104 #include <net/arp.h>
105 #include <net/tcp.h>
106 #include <net/icmp.h>
107 #include <net/xfrm.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
110 #ifdef CONFIG_SYSCTL
111 #include <linux/sysctl.h>
112 #include <linux/kmemleak.h>
113 #endif
114 #include <net/secure_seq.h>
115 
116 #define RT_FL_TOS(oldflp4) \
117 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118 
119 #define IP_MAX_MTU	0xFFF0
120 
121 #define RT_GC_TIMEOUT (300*HZ)
122 
123 static int ip_rt_max_size;
124 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
126 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
127 static int ip_rt_redirect_number __read_mostly	= 9;
128 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
129 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
130 static int ip_rt_error_cost __read_mostly	= HZ;
131 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
132 static int ip_rt_gc_elasticity __read_mostly	= 8;
133 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
134 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
135 static int ip_rt_min_advmss __read_mostly	= 256;
136 
137 /*
138  *	Interface to generic destination cache.
139  */
140 
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
143 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145 static void		 ipv4_link_failure(struct sk_buff *skb);
146 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
147 					   struct sk_buff *skb, u32 mtu);
148 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
149 					struct sk_buff *skb);
150 
151 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
152 			    int how)
153 {
154 }
155 
156 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
157 {
158 	WARN_ON(1);
159 	return NULL;
160 }
161 
162 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
163 					   struct sk_buff *skb,
164 					   const void *daddr);
165 
166 static struct dst_ops ipv4_dst_ops = {
167 	.family =		AF_INET,
168 	.protocol =		cpu_to_be16(ETH_P_IP),
169 	.check =		ipv4_dst_check,
170 	.default_advmss =	ipv4_default_advmss,
171 	.mtu =			ipv4_mtu,
172 	.cow_metrics =		ipv4_cow_metrics,
173 	.ifdown =		ipv4_dst_ifdown,
174 	.negative_advice =	ipv4_negative_advice,
175 	.link_failure =		ipv4_link_failure,
176 	.update_pmtu =		ip_rt_update_pmtu,
177 	.redirect =		ip_do_redirect,
178 	.local_out =		__ip_local_out,
179 	.neigh_lookup =		ipv4_neigh_lookup,
180 };
181 
182 #define ECN_OR_COST(class)	TC_PRIO_##class
183 
184 const __u8 ip_tos2prio[16] = {
185 	TC_PRIO_BESTEFFORT,
186 	ECN_OR_COST(BESTEFFORT),
187 	TC_PRIO_BESTEFFORT,
188 	ECN_OR_COST(BESTEFFORT),
189 	TC_PRIO_BULK,
190 	ECN_OR_COST(BULK),
191 	TC_PRIO_BULK,
192 	ECN_OR_COST(BULK),
193 	TC_PRIO_INTERACTIVE,
194 	ECN_OR_COST(INTERACTIVE),
195 	TC_PRIO_INTERACTIVE,
196 	ECN_OR_COST(INTERACTIVE),
197 	TC_PRIO_INTERACTIVE_BULK,
198 	ECN_OR_COST(INTERACTIVE_BULK),
199 	TC_PRIO_INTERACTIVE_BULK,
200 	ECN_OR_COST(INTERACTIVE_BULK)
201 };
202 EXPORT_SYMBOL(ip_tos2prio);
203 
204 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
205 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
206 
207 static inline int rt_genid(struct net *net)
208 {
209 	return atomic_read(&net->ipv4.rt_genid);
210 }
211 
212 #ifdef CONFIG_PROC_FS
213 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
214 {
215 	if (*pos)
216 		return NULL;
217 	return SEQ_START_TOKEN;
218 }
219 
220 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
221 {
222 	++*pos;
223 	return NULL;
224 }
225 
226 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
227 {
228 }
229 
230 static int rt_cache_seq_show(struct seq_file *seq, void *v)
231 {
232 	if (v == SEQ_START_TOKEN)
233 		seq_printf(seq, "%-127s\n",
234 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
235 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
236 			   "HHUptod\tSpecDst");
237 	return 0;
238 }
239 
240 static const struct seq_operations rt_cache_seq_ops = {
241 	.start  = rt_cache_seq_start,
242 	.next   = rt_cache_seq_next,
243 	.stop   = rt_cache_seq_stop,
244 	.show   = rt_cache_seq_show,
245 };
246 
247 static int rt_cache_seq_open(struct inode *inode, struct file *file)
248 {
249 	return seq_open(file, &rt_cache_seq_ops);
250 }
251 
252 static const struct file_operations rt_cache_seq_fops = {
253 	.owner	 = THIS_MODULE,
254 	.open	 = rt_cache_seq_open,
255 	.read	 = seq_read,
256 	.llseek	 = seq_lseek,
257 	.release = seq_release,
258 };
259 
260 
261 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
262 {
263 	int cpu;
264 
265 	if (*pos == 0)
266 		return SEQ_START_TOKEN;
267 
268 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
269 		if (!cpu_possible(cpu))
270 			continue;
271 		*pos = cpu+1;
272 		return &per_cpu(rt_cache_stat, cpu);
273 	}
274 	return NULL;
275 }
276 
277 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
278 {
279 	int cpu;
280 
281 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
282 		if (!cpu_possible(cpu))
283 			continue;
284 		*pos = cpu+1;
285 		return &per_cpu(rt_cache_stat, cpu);
286 	}
287 	return NULL;
288 
289 }
290 
291 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
292 {
293 
294 }
295 
296 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
297 {
298 	struct rt_cache_stat *st = v;
299 
300 	if (v == SEQ_START_TOKEN) {
301 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
302 		return 0;
303 	}
304 
305 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
306 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
307 		   dst_entries_get_slow(&ipv4_dst_ops),
308 		   st->in_hit,
309 		   st->in_slow_tot,
310 		   st->in_slow_mc,
311 		   st->in_no_route,
312 		   st->in_brd,
313 		   st->in_martian_dst,
314 		   st->in_martian_src,
315 
316 		   st->out_hit,
317 		   st->out_slow_tot,
318 		   st->out_slow_mc,
319 
320 		   st->gc_total,
321 		   st->gc_ignored,
322 		   st->gc_goal_miss,
323 		   st->gc_dst_overflow,
324 		   st->in_hlist_search,
325 		   st->out_hlist_search
326 		);
327 	return 0;
328 }
329 
330 static const struct seq_operations rt_cpu_seq_ops = {
331 	.start  = rt_cpu_seq_start,
332 	.next   = rt_cpu_seq_next,
333 	.stop   = rt_cpu_seq_stop,
334 	.show   = rt_cpu_seq_show,
335 };
336 
337 
338 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
339 {
340 	return seq_open(file, &rt_cpu_seq_ops);
341 }
342 
343 static const struct file_operations rt_cpu_seq_fops = {
344 	.owner	 = THIS_MODULE,
345 	.open	 = rt_cpu_seq_open,
346 	.read	 = seq_read,
347 	.llseek	 = seq_lseek,
348 	.release = seq_release,
349 };
350 
351 #ifdef CONFIG_IP_ROUTE_CLASSID
352 static int rt_acct_proc_show(struct seq_file *m, void *v)
353 {
354 	struct ip_rt_acct *dst, *src;
355 	unsigned int i, j;
356 
357 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
358 	if (!dst)
359 		return -ENOMEM;
360 
361 	for_each_possible_cpu(i) {
362 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
363 		for (j = 0; j < 256; j++) {
364 			dst[j].o_bytes   += src[j].o_bytes;
365 			dst[j].o_packets += src[j].o_packets;
366 			dst[j].i_bytes   += src[j].i_bytes;
367 			dst[j].i_packets += src[j].i_packets;
368 		}
369 	}
370 
371 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
372 	kfree(dst);
373 	return 0;
374 }
375 
376 static int rt_acct_proc_open(struct inode *inode, struct file *file)
377 {
378 	return single_open(file, rt_acct_proc_show, NULL);
379 }
380 
381 static const struct file_operations rt_acct_proc_fops = {
382 	.owner		= THIS_MODULE,
383 	.open		= rt_acct_proc_open,
384 	.read		= seq_read,
385 	.llseek		= seq_lseek,
386 	.release	= single_release,
387 };
388 #endif
389 
390 static int __net_init ip_rt_do_proc_init(struct net *net)
391 {
392 	struct proc_dir_entry *pde;
393 
394 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
395 			&rt_cache_seq_fops);
396 	if (!pde)
397 		goto err1;
398 
399 	pde = proc_create("rt_cache", S_IRUGO,
400 			  net->proc_net_stat, &rt_cpu_seq_fops);
401 	if (!pde)
402 		goto err2;
403 
404 #ifdef CONFIG_IP_ROUTE_CLASSID
405 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
406 	if (!pde)
407 		goto err3;
408 #endif
409 	return 0;
410 
411 #ifdef CONFIG_IP_ROUTE_CLASSID
412 err3:
413 	remove_proc_entry("rt_cache", net->proc_net_stat);
414 #endif
415 err2:
416 	remove_proc_entry("rt_cache", net->proc_net);
417 err1:
418 	return -ENOMEM;
419 }
420 
421 static void __net_exit ip_rt_do_proc_exit(struct net *net)
422 {
423 	remove_proc_entry("rt_cache", net->proc_net_stat);
424 	remove_proc_entry("rt_cache", net->proc_net);
425 #ifdef CONFIG_IP_ROUTE_CLASSID
426 	remove_proc_entry("rt_acct", net->proc_net);
427 #endif
428 }
429 
430 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
431 	.init = ip_rt_do_proc_init,
432 	.exit = ip_rt_do_proc_exit,
433 };
434 
435 static int __init ip_rt_proc_init(void)
436 {
437 	return register_pernet_subsys(&ip_rt_proc_ops);
438 }
439 
440 #else
441 static inline int ip_rt_proc_init(void)
442 {
443 	return 0;
444 }
445 #endif /* CONFIG_PROC_FS */
446 
447 static inline bool rt_is_expired(const struct rtable *rth)
448 {
449 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
450 }
451 
452 /*
453  * Perturbation of rt_genid by a small quantity [1..256].
454  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
455  * many times (2^24) without reusing a recent rt_genid.
456  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
457  */
458 static void rt_cache_invalidate(struct net *net)
459 {
460 	unsigned char shuffle;
461 
462 	get_random_bytes(&shuffle, sizeof(shuffle));
463 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
464 }
465 
466 /*
467  * delay < 0  : invalidate cache (fast: entries will be deleted later)
468  * delay >= 0 : invalidate & flush cache (can be long)
469  */
470 void rt_cache_flush(struct net *net, int delay)
471 {
472 	rt_cache_invalidate(net);
473 }
474 
475 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
476 					   struct sk_buff *skb,
477 					   const void *daddr)
478 {
479 	struct net_device *dev = dst->dev;
480 	const __be32 *pkey = daddr;
481 	const struct rtable *rt;
482 	struct neighbour *n;
483 
484 	rt = (const struct rtable *) dst;
485 	if (rt->rt_gateway)
486 		pkey = (const __be32 *) &rt->rt_gateway;
487 	else if (skb)
488 		pkey = &ip_hdr(skb)->daddr;
489 
490 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
491 	if (n)
492 		return n;
493 	return neigh_create(&arp_tbl, pkey, dev);
494 }
495 
496 /*
497  * Peer allocation may fail only in serious out-of-memory conditions.  However
498  * we can still generate some output.
499  * Random ID selection looks a bit dangerous because we have no way to
500  * guarantee that the chosen ID is unique within a reasonable period of time.
501  * But a broken packet identifier may be better than no packet at all.
502  */
503 static void ip_select_fb_ident(struct iphdr *iph)
504 {
505 	static DEFINE_SPINLOCK(ip_fb_id_lock);
506 	static u32 ip_fallback_id;
507 	u32 salt;
508 
509 	spin_lock_bh(&ip_fb_id_lock);
510 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
511 	iph->id = htons(salt & 0xFFFF);
512 	ip_fallback_id = salt;
513 	spin_unlock_bh(&ip_fb_id_lock);
514 }
515 
516 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
517 {
518 	struct net *net = dev_net(dst->dev);
519 	struct inet_peer *peer;
520 
521 	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
522 	if (peer) {
523 		iph->id = htons(inet_getid(peer, more));
524 		inet_putpeer(peer);
525 		return;
526 	}
527 
528 	ip_select_fb_ident(iph);
529 }
530 EXPORT_SYMBOL(__ip_select_ident);
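/* For context, a hedged sketch of the usual entry point: callers
 * typically reach this via the ip_select_ident() helper in
 * include/net/ip.h, which (roughly) prefers the cheap per-socket
 * counter for DF packets and uses the inetpeer path above otherwise:
 *
 *	if (iph->frag_off & htons(IP_DF))
 *		iph->id = htons(inet_sk(sk)->inet_id++);  // sketch only
 *	else
 *		__ip_select_ident(iph, dst, 0);
 */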
531 
532 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
533 			     const struct iphdr *iph,
534 			     int oif, u8 tos,
535 			     u8 prot, u32 mark, int flow_flags)
536 {
537 	if (sk) {
538 		const struct inet_sock *inet = inet_sk(sk);
539 
540 		oif = sk->sk_bound_dev_if;
541 		mark = sk->sk_mark;
542 		tos = RT_CONN_FLAGS(sk);
543 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
544 	}
545 	flowi4_init_output(fl4, oif, mark, tos,
546 			   RT_SCOPE_UNIVERSE, prot,
547 			   flow_flags,
548 			   iph->daddr, iph->saddr, 0, 0);
549 }
550 
551 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
552 			       const struct sock *sk)
553 {
554 	const struct iphdr *iph = ip_hdr(skb);
555 	int oif = skb->dev->ifindex;
556 	u8 tos = RT_TOS(iph->tos);
557 	u8 prot = iph->protocol;
558 	u32 mark = skb->mark;
559 
560 	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
561 }
562 
563 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
564 {
565 	const struct inet_sock *inet = inet_sk(sk);
566 	const struct ip_options_rcu *inet_opt;
567 	__be32 daddr = inet->inet_daddr;
568 
569 	rcu_read_lock();
570 	inet_opt = rcu_dereference(inet->inet_opt);
571 	if (inet_opt && inet_opt->opt.srr)
572 		daddr = inet_opt->opt.faddr;
573 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
574 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
575 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
576 			   inet_sk_flowi_flags(sk),
577 			   daddr, inet->inet_saddr, 0, 0);
578 	rcu_read_unlock();
579 }
580 
581 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
582 				 const struct sk_buff *skb)
583 {
584 	if (skb)
585 		build_skb_flow_key(fl4, skb, sk);
586 	else
587 		build_sk_flow_key(fl4, sk);
588 }
589 
590 static DEFINE_SEQLOCK(fnhe_seqlock);
591 
592 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
593 {
594 	struct fib_nh_exception *fnhe, *oldest;
595 
596 	oldest = rcu_dereference(hash->chain);
597 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
598 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
599 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
600 			oldest = fnhe;
601 	}
602 	return oldest;
603 }
604 
605 static inline u32 fnhe_hashfun(__be32 daddr)
606 {
607 	u32 hval;
608 
609 	hval = (__force u32) daddr;
610 	hval ^= (hval >> 11) ^ (hval >> 22);
611 
612 	return hval & (FNHE_HASH_SIZE - 1);
613 }
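/* Illustrative note (not from the original file): the two shifts fold
 * the high bits of daddr down before masking, so addresses differing
 * only in their high octets still spread across buckets:
 *
 *	hval = daddr;				// all 32 bits
 *	hval ^= (hval >> 11) ^ (hval >> 22);	// mix high bits low
 *	bucket = hval & (FNHE_HASH_SIZE - 1);	// power-of-two table
 */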
614 
615 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
616 				  u32 pmtu, unsigned long expires)
617 {
618 	struct fnhe_hash_bucket *hash;
619 	struct fib_nh_exception *fnhe;
620 	int depth;
621 	u32 hval = fnhe_hashfun(daddr);
622 
623 	write_seqlock_bh(&fnhe_seqlock);
624 
625 	hash = nh->nh_exceptions;
626 	if (!hash) {
627 		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
628 		if (!hash)
629 			goto out_unlock;
630 		nh->nh_exceptions = hash;
631 	}
632 
633 	hash += hval;
634 
635 	depth = 0;
636 	for (fnhe = rcu_dereference(hash->chain); fnhe;
637 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
638 		if (fnhe->fnhe_daddr == daddr)
639 			break;
640 		depth++;
641 	}
642 
643 	if (fnhe) {
644 		if (gw)
645 			fnhe->fnhe_gw = gw;
646 		if (pmtu) {
647 			fnhe->fnhe_pmtu = pmtu;
648 			fnhe->fnhe_expires = expires;
649 		}
650 	} else {
651 		if (depth > FNHE_RECLAIM_DEPTH)
652 			fnhe = fnhe_oldest(hash);
653 		else {
654 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
655 			if (!fnhe)
656 				goto out_unlock;
657 
658 			fnhe->fnhe_next = hash->chain;
659 			rcu_assign_pointer(hash->chain, fnhe);
660 		}
661 		fnhe->fnhe_daddr = daddr;
662 		fnhe->fnhe_gw = gw;
663 		fnhe->fnhe_pmtu = pmtu;
664 		fnhe->fnhe_expires = expires;
665 	}
666 
667 	fnhe->fnhe_stamp = jiffies;
668 
669 out_unlock:
670 	write_sequnlock_bh(&fnhe_seqlock);
671 	return;
672 }
673 
674 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
675 			     bool kill_route)
676 {
677 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
678 	__be32 old_gw = ip_hdr(skb)->saddr;
679 	struct net_device *dev = skb->dev;
680 	struct in_device *in_dev;
681 	struct fib_result res;
682 	struct neighbour *n;
683 	struct net *net;
684 
685 	switch (icmp_hdr(skb)->code & 7) {
686 	case ICMP_REDIR_NET:
687 	case ICMP_REDIR_NETTOS:
688 	case ICMP_REDIR_HOST:
689 	case ICMP_REDIR_HOSTTOS:
690 		break;
691 
692 	default:
693 		return;
694 	}
695 
696 	if (rt->rt_gateway != old_gw)
697 		return;
698 
699 	in_dev = __in_dev_get_rcu(dev);
700 	if (!in_dev)
701 		return;
702 
703 	net = dev_net(dev);
704 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
705 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
706 	    ipv4_is_zeronet(new_gw))
707 		goto reject_redirect;
708 
709 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
710 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
711 			goto reject_redirect;
712 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
713 			goto reject_redirect;
714 	} else {
715 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
716 			goto reject_redirect;
717 	}
718 
719 	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
720 	if (n) {
721 		if (!(n->nud_state & NUD_VALID)) {
722 			neigh_event_send(n, NULL);
723 		} else {
724 			if (fib_lookup(net, fl4, &res) == 0) {
725 				struct fib_nh *nh = &FIB_RES_NH(res);
726 
727 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
728 						      0, 0);
729 			}
730 			if (kill_route)
731 				rt->dst.obsolete = DST_OBSOLETE_KILL;
732 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
733 		}
734 		neigh_release(n);
735 	}
736 	return;
737 
738 reject_redirect:
739 #ifdef CONFIG_IP_ROUTE_VERBOSE
740 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
741 		const struct iphdr *iph = (const struct iphdr *) skb->data;
742 		__be32 daddr = iph->daddr;
743 		__be32 saddr = iph->saddr;
744 
745 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
746 				     "  Advised path = %pI4 -> %pI4\n",
747 				     &old_gw, dev->name, &new_gw,
748 				     &saddr, &daddr);
749 	}
750 #endif
751 	;
752 }
753 
754 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
755 {
756 	struct rtable *rt;
757 	struct flowi4 fl4;
758 
759 	rt = (struct rtable *) dst;
760 
761 	ip_rt_build_flow_key(&fl4, sk, skb);
762 	__ip_do_redirect(rt, skb, &fl4, true);
763 }
764 
765 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
766 {
767 	struct rtable *rt = (struct rtable *)dst;
768 	struct dst_entry *ret = dst;
769 
770 	if (rt) {
771 		if (dst->obsolete > 0) {
772 			ip_rt_put(rt);
773 			ret = NULL;
774 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
775 			   rt->dst.expires) {
776 			ip_rt_put(rt);
777 			ret = NULL;
778 		}
779 	}
780 	return ret;
781 }
782 
783 /*
784  * Algorithm:
785  *	1. The first ip_rt_redirect_number redirects are sent
786  *	   with exponential backoff, then we stop sending them entirely,
787  *	   assuming that the host ignores our redirects.
788  *	2. If we did not see packets requiring redirects
789  *	   during ip_rt_redirect_silence, we assume that the host
790  *	   forgot the redirected route and start to send redirects again.
791  *
792  * This algorithm is much cheaper and more intelligent than dumb load limiting
793  * in icmp.c.
794  *
795  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
796  * and "frag. need" (breaks PMTU discovery) in icmp.c.
797  */
798 
799 void ip_rt_send_redirect(struct sk_buff *skb)
800 {
801 	struct rtable *rt = skb_rtable(skb);
802 	struct in_device *in_dev;
803 	struct inet_peer *peer;
804 	struct net *net;
805 	int log_martians;
806 
807 	rcu_read_lock();
808 	in_dev = __in_dev_get_rcu(rt->dst.dev);
809 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
810 		rcu_read_unlock();
811 		return;
812 	}
813 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
814 	rcu_read_unlock();
815 
816 	net = dev_net(rt->dst.dev);
817 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
818 	if (!peer) {
819 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
820 		return;
821 	}
822 
823 	/* No redirected packets during ip_rt_redirect_silence;
824 	 * reset the algorithm.
825 	 */
826 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
827 		peer->rate_tokens = 0;
828 
829 	/* Too many ignored redirects; do not send anything, just set
830 	 * peer->rate_last to the time of the last seen redirected packet.
831 	 */
832 	if (peer->rate_tokens >= ip_rt_redirect_number) {
833 		peer->rate_last = jiffies;
834 		goto out_put_peer;
835 	}
836 
837 	/* Check for load limit; set rate_last to the latest sent
838 	 * redirect.
839 	 */
840 	if (peer->rate_tokens == 0 ||
841 	    time_after(jiffies,
842 		       (peer->rate_last +
843 			(ip_rt_redirect_load << peer->rate_tokens)))) {
844 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
845 		peer->rate_last = jiffies;
846 		++peer->rate_tokens;
847 #ifdef CONFIG_IP_ROUTE_VERBOSE
848 		if (log_martians &&
849 		    peer->rate_tokens == ip_rt_redirect_number)
850 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
851 					     &ip_hdr(skb)->saddr, inet_iif(skb),
852 					     &ip_hdr(skb)->daddr, &rt->rt_gateway);
853 #endif
854 	}
855 out_put_peer:
856 	inet_putpeer(peer);
857 }
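/* Worked example of the backoff above, assuming the default tunables
 * (ip_rt_redirect_load = HZ/50, ip_rt_redirect_number = 9): the k-th
 * redirect is sent only after rate_last + (HZ/50 << k), so the gap
 * doubles each time; after 9 unanswered redirects we go silent until
 * ip_rt_redirect_silence = (HZ/50) << 10 passes with no new triggers.
 */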
858 
859 static int ip_error(struct sk_buff *skb)
860 {
861 	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
862 	struct rtable *rt = skb_rtable(skb);
863 	struct inet_peer *peer;
864 	unsigned long now;
865 	struct net *net;
866 	bool send;
867 	int code;
868 
869 	net = dev_net(rt->dst.dev);
	/* __in_dev_get_rcu() above may return NULL when IP is disabled
	 * on the device; there is nothing useful to report, so drop.
	 */
	if (!in_dev)
		goto out;
870 	if (!IN_DEV_FORWARD(in_dev)) {
871 		switch (rt->dst.error) {
872 		case EHOSTUNREACH:
873 			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
874 			break;
875 
876 		case ENETUNREACH:
877 			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
878 			break;
879 		}
880 		goto out;
881 	}
882 
883 	switch (rt->dst.error) {
884 	case EINVAL:
885 	default:
886 		goto out;
887 	case EHOSTUNREACH:
888 		code = ICMP_HOST_UNREACH;
889 		break;
890 	case ENETUNREACH:
891 		code = ICMP_NET_UNREACH;
892 		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
893 		break;
894 	case EACCES:
895 		code = ICMP_PKT_FILTERED;
896 		break;
897 	}
898 
899 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
900 
901 	send = true;
902 	if (peer) {
903 		now = jiffies;
904 		peer->rate_tokens += now - peer->rate_last;
905 		if (peer->rate_tokens > ip_rt_error_burst)
906 			peer->rate_tokens = ip_rt_error_burst;
907 		peer->rate_last = now;
908 		if (peer->rate_tokens >= ip_rt_error_cost)
909 			peer->rate_tokens -= ip_rt_error_cost;
910 		else
911 			send = false;
912 		inet_putpeer(peer);
913 	}
914 	if (send)
915 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
916 
917 out:	kfree_skb(skb);
918 	return 0;
919 }
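/* Worked example of the token bucket above, assuming the defaults
 * (ip_rt_error_cost = HZ, ip_rt_error_burst = 5 * HZ): tokens refill
 * at one per jiffy, cap at five seconds' worth, and each ICMP error
 * costs HZ, so a burst of about five errors is allowed and roughly
 * one per second thereafter.
 */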
920 
921 static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
922 {
923 	struct fib_result res;
924 
925 	if (mtu < ip_rt_min_pmtu)
926 		mtu = ip_rt_min_pmtu;
927 
928 	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
929 		struct fib_nh *nh = &FIB_RES_NH(res);
930 
931 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
932 				      jiffies + ip_rt_mtu_expires);
933 	}
934 	return mtu;
935 }
936 
937 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
938 			      struct sk_buff *skb, u32 mtu)
939 {
940 	struct rtable *rt = (struct rtable *) dst;
941 	struct flowi4 fl4;
942 
943 	ip_rt_build_flow_key(&fl4, sk, skb);
944 	mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
945 
946 	if (!rt->rt_pmtu) {
947 		dst->obsolete = DST_OBSOLETE_KILL;
948 	} else {
949 		rt->rt_pmtu = mtu;
950 		dst_set_expires(&rt->dst, ip_rt_mtu_expires);
951 	}
952 }
953 
954 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
955 		      int oif, u32 mark, u8 protocol, int flow_flags)
956 {
957 	const struct iphdr *iph = (const struct iphdr *) skb->data;
958 	struct flowi4 fl4;
959 	struct rtable *rt;
960 
961 	__build_flow_key(&fl4, NULL, iph, oif,
962 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
963 	rt = __ip_route_output_key(net, &fl4);
964 	if (!IS_ERR(rt)) {
965 		__ip_rt_update_pmtu(rt, &fl4, mtu);
966 		ip_rt_put(rt);
967 	}
968 }
969 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
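/* A hedged usage sketch (hypothetical values, not a real caller): a
 * handler that has just parsed an ICMP_FRAG_NEEDED for an unconnected
 * flow could record the new path MTU with:
 *
 *	ipv4_update_pmtu(skb, net, new_mtu, 0, 0, IPPROTO_UDP, 0);
 *
 * Connected sockets would use ipv4_sk_update_pmtu() below instead.
 */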
970 
971 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
972 {
973 	const struct iphdr *iph = (const struct iphdr *) skb->data;
974 	struct flowi4 fl4;
975 	struct rtable *rt;
976 
977 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
978 	rt = __ip_route_output_key(sock_net(sk), &fl4);
979 	if (!IS_ERR(rt)) {
980 		__ip_rt_update_pmtu(rt, &fl4, mtu);
981 		ip_rt_put(rt);
982 	}
983 }
984 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
985 
986 void ipv4_redirect(struct sk_buff *skb, struct net *net,
987 		   int oif, u32 mark, u8 protocol, int flow_flags)
988 {
989 	const struct iphdr *iph = (const struct iphdr *) skb->data;
990 	struct flowi4 fl4;
991 	struct rtable *rt;
992 
993 	__build_flow_key(&fl4, NULL, iph, oif,
994 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
995 	rt = __ip_route_output_key(net, &fl4);
996 	if (!IS_ERR(rt)) {
997 		__ip_do_redirect(rt, skb, &fl4, false);
998 		ip_rt_put(rt);
999 	}
1000 }
1001 EXPORT_SYMBOL_GPL(ipv4_redirect);
1002 
1003 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1004 {
1005 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1006 	struct flowi4 fl4;
1007 	struct rtable *rt;
1008 
1009 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1010 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1011 	if (!IS_ERR(rt)) {
1012 		__ip_do_redirect(rt, skb, &fl4, false);
1013 		ip_rt_put(rt);
1014 	}
1015 }
1016 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1017 
1018 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1019 {
1020 	struct rtable *rt = (struct rtable *) dst;
1021 
1022 	/* All IPv4 dsts are created with ->obsolete set to the value
1023 	 * DST_OBSOLETE_FORCE_CHK, which forces every validation call
1024 	 * down into this function.
1025 	 *
1026 	 * When a PMTU/redirect information update invalidates a
1027 	 * route, this is indicated by setting obsolete to
1028 	 * DST_OBSOLETE_KILL.
1029 	 */
1030 	if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1031 		return NULL;
1032 	return dst;
1033 }
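/* For context, a rough sketch of the caller side: dst_check() in
 * include/net/dst.h drives the logic above, approximately:
 *
 *	if (dst->obsolete)
 *		dst = dst->ops->check(dst, cookie);	// may return NULL
 *
 * so FORCE_CHK routes are revalidated on every use, and KILLed or
 * expired ones are dropped and looked up again by the caller.
 */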
1034 
1035 static void ipv4_link_failure(struct sk_buff *skb)
1036 {
1037 	struct rtable *rt;
1038 
1039 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1040 
1041 	rt = skb_rtable(skb);
1042 	if (rt)
1043 		dst_set_expires(&rt->dst, 0);
1044 }
1045 
1046 static int ip_rt_bug(struct sk_buff *skb)
1047 {
1048 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1049 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1050 		 skb->dev ? skb->dev->name : "?");
1051 	kfree_skb(skb);
1052 	WARN_ON(1);
1053 	return 0;
1054 }
1055 
1056 /*
1057    We do not cache the source address of the outgoing interface,
1058    because it is used only by IP RR, TS and SRR options,
1059    so it is out of the fast path.
1060 
1061    BTW remember: "addr" is allowed to be unaligned
1062    in IP options!
1063  */
1064 
1065 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1066 {
1067 	__be32 src;
1068 
1069 	if (rt_is_output_route(rt))
1070 		src = ip_hdr(skb)->saddr;
1071 	else {
1072 		struct fib_result res;
1073 		struct flowi4 fl4;
1074 		struct iphdr *iph;
1075 
1076 		iph = ip_hdr(skb);
1077 
1078 		memset(&fl4, 0, sizeof(fl4));
1079 		fl4.daddr = iph->daddr;
1080 		fl4.saddr = iph->saddr;
1081 		fl4.flowi4_tos = RT_TOS(iph->tos);
1082 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1083 		fl4.flowi4_iif = skb->dev->ifindex;
1084 		fl4.flowi4_mark = skb->mark;
1085 
1086 		rcu_read_lock();
1087 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1088 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1089 		else
1090 			src = inet_select_addr(rt->dst.dev,
1091 					       rt_nexthop(rt, iph->daddr),
1092 					       RT_SCOPE_UNIVERSE);
1093 		rcu_read_unlock();
1094 	}
1095 	memcpy(addr, &src, 4);
1096 }
1097 
1098 #ifdef CONFIG_IP_ROUTE_CLASSID
1099 static void set_class_tag(struct rtable *rt, u32 tag)
1100 {
1101 	if (!(rt->dst.tclassid & 0xFFFF))
1102 		rt->dst.tclassid |= tag & 0xFFFF;
1103 	if (!(rt->dst.tclassid & 0xFFFF0000))
1104 		rt->dst.tclassid |= tag & 0xFFFF0000;
1105 }
1106 #endif
1107 
1108 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1109 {
1110 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1111 
1112 	if (advmss == 0) {
1113 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1114 			       ip_rt_min_advmss);
1115 		if (advmss > 65535 - 40)
1116 			advmss = 65535 - 40;
1117 	}
1118 	return advmss;
1119 }
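/* Worked example: with RTAX_ADVMSS unset and a 1500-byte device MTU,
 * this returns 1500 - 40 = 1460 (40 bytes for minimal IPv4 + TCP
 * headers), bounded below by ip_rt_min_advmss and above by 65495.
 */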
1120 
1121 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1122 {
1123 	const struct rtable *rt = (const struct rtable *) dst;
1124 	unsigned int mtu = rt->rt_pmtu;
1125 
1126 	if (mtu && time_after_eq(jiffies, rt->dst.expires))
1127 		mtu = 0;
1128 
1129 	if (!mtu)
1130 		mtu = dst_metric_raw(dst, RTAX_MTU);
1131 
1132 	if (mtu && rt_is_output_route(rt))
1133 		return mtu;
1134 
1135 	mtu = dst->dev->mtu;
1136 
1137 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1138 		if (rt->rt_gateway && mtu > 576)
1139 			mtu = 576;
1140 	}
1141 
1142 	if (mtu > IP_MAX_MTU)
1143 		mtu = IP_MAX_MTU;
1144 
1145 	return mtu;
1146 }
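/* Summary of the precedence above: on output routes a live (unexpired)
 * learned PMTU wins, then a metric RTAX_MTU; input routes fall through
 * to the device MTU, which is clamped to 576 for locked-metric
 * gatewayed routes and to IP_MAX_MTU (0xFFF0) overall.
 */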
1147 
1148 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1149 {
1150 	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1151 	struct fib_nh_exception *fnhe;
1152 	u32 hval;
1153 
1154 	if (!hash)
1155 		return NULL;
1156 
1157 	hval = fnhe_hashfun(daddr);
1158 
1159 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1160 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1161 		if (fnhe->fnhe_daddr == daddr)
1162 			return fnhe;
1163 	}
1164 	return NULL;
1165 }
1166 
1167 static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1168 			      __be32 daddr)
1169 {
1170 	__be32 fnhe_daddr, gw;
1171 	unsigned long expires;
1172 	unsigned int seq;
1173 	u32 pmtu;
1174 
1175 restart:
1176 	seq = read_seqbegin(&fnhe_seqlock);
1177 	fnhe_daddr = fnhe->fnhe_daddr;
1178 	gw = fnhe->fnhe_gw;
1179 	pmtu = fnhe->fnhe_pmtu;
1180 	expires = fnhe->fnhe_expires;
1181 	if (read_seqretry(&fnhe_seqlock, seq))
1182 		goto restart;
1183 
1184 	if (daddr != fnhe_daddr)
1185 		return;
1186 
1187 	if (pmtu) {
1188 		unsigned long diff = expires - jiffies;
1189 
1190 		if (time_before(jiffies, expires)) {
1191 			rt->rt_pmtu = pmtu;
1192 			dst_set_expires(&rt->dst, diff);
1193 		}
1194 	}
1195 	if (gw) {
1196 		rt->rt_flags |= RTCF_REDIRECTED;
1197 		rt->rt_gateway = gw;
1198 	}
1199 	fnhe->fnhe_stamp = jiffies;
1200 }
1201 
1202 static inline void rt_free(struct rtable *rt)
1203 {
1204 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
1205 }
1206 
1207 static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1208 {
1209 	struct rtable *orig, *prev, **p = &nh->nh_rth_output;
1210 
1211 	if (rt_is_input_route(rt))
1212 		p = &nh->nh_rth_input;
1213 
1214 	orig = *p;
1215 
1216 	prev = cmpxchg(p, orig, rt);
1217 	if (prev == orig) {
1218 		if (orig)
1219 			rt_free(orig);
1220 	} else {
1221 		/* Routes we intend to cache in the FIB nexthop have
1222 		 * the DST_NOCACHE bit clear.  However, if we are
1223 		 * unsuccessful at storing this route into the cache
1224 		 * we really need to set it.
1225 		 */
1226 		rt->dst.flags |= DST_NOCACHE;
1227 	}
1228 }
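/* Illustrative race sketch (made-up interleaving): cmpxchg() installs
 * rt only if *p still holds the value we sampled:
 *
 *	CPU0: orig = *p;                CPU1: orig = *p;
 *	CPU0: cmpxchg(p, orig, rt0) ok  CPU1: cmpxchg(p, orig, rt1) fails
 *
 * so CPU1's route gets DST_NOCACHE and is freed once its users drop
 * their references, never leaking.
 */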
1229 
1230 static bool rt_cache_valid(const struct rtable *rt)
1231 {
1232 	return	rt &&
1233 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1234 		!rt_is_expired(rt);
1235 }
1236 
1237 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1238 			   const struct fib_result *res,
1239 			   struct fib_nh_exception *fnhe,
1240 			   struct fib_info *fi, u16 type, u32 itag)
1241 {
1242 	if (fi) {
1243 		struct fib_nh *nh = &FIB_RES_NH(*res);
1244 
1245 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1246 			rt->rt_gateway = nh->nh_gw;
1247 		if (unlikely(fnhe))
1248 			rt_bind_exception(rt, fnhe, daddr);
1249 		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1250 #ifdef CONFIG_IP_ROUTE_CLASSID
1251 		rt->dst.tclassid = nh->nh_tclassid;
1252 #endif
1253 		if (!(rt->dst.flags & DST_NOCACHE))
1254 			rt_cache_route(nh, rt);
1255 	}
1256 
1257 #ifdef CONFIG_IP_ROUTE_CLASSID
1258 #ifdef CONFIG_IP_MULTIPLE_TABLES
1259 	set_class_tag(rt, res->tclassid);
1260 #endif
1261 	set_class_tag(rt, itag);
1262 #endif
1263 }
1264 
1265 static struct rtable *rt_dst_alloc(struct net_device *dev,
1266 				   bool nopolicy, bool noxfrm, bool will_cache)
1267 {
1268 	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1269 			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1270 			 (nopolicy ? DST_NOPOLICY : 0) |
1271 			 (noxfrm ? DST_NOXFRM : 0));
1272 }
1273 
1274 /* called in rcu_read_lock() section */
1275 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1276 				u8 tos, struct net_device *dev, int our)
1277 {
1278 	struct rtable *rth;
1279 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1280 	u32 itag = 0;
1281 	int err;
1282 
1283 	/* Primary sanity checks. */
1284 
1285 	if (in_dev == NULL)
1286 		return -EINVAL;
1287 
1288 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1289 	    skb->protocol != htons(ETH_P_IP))
1290 		goto e_inval;
1291 
1292 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1293 		if (ipv4_is_loopback(saddr))
1294 			goto e_inval;
1295 
1296 	if (ipv4_is_zeronet(saddr)) {
1297 		if (!ipv4_is_local_multicast(daddr))
1298 			goto e_inval;
1299 	} else {
1300 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1301 					  in_dev, &itag);
1302 		if (err < 0)
1303 			goto e_err;
1304 	}
1305 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1306 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1307 	if (!rth)
1308 		goto e_nobufs;
1309 
1310 #ifdef CONFIG_IP_ROUTE_CLASSID
1311 	rth->dst.tclassid = itag;
1312 #endif
1313 	rth->dst.output = ip_rt_bug;
1314 
1315 	rth->rt_genid	= rt_genid(dev_net(dev));
1316 	rth->rt_flags	= RTCF_MULTICAST;
1317 	rth->rt_type	= RTN_MULTICAST;
1318 	rth->rt_is_input= 1;
1319 	rth->rt_iif	= 0;
1320 	rth->rt_pmtu	= 0;
1321 	rth->rt_gateway	= 0;
1322 	if (our) {
1323 		rth->dst.input= ip_local_deliver;
1324 		rth->rt_flags |= RTCF_LOCAL;
1325 	}
1326 
1327 #ifdef CONFIG_IP_MROUTE
1328 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1329 		rth->dst.input = ip_mr_input;
1330 #endif
1331 	RT_CACHE_STAT_INC(in_slow_mc);
1332 
1333 	skb_dst_set(skb, &rth->dst);
1334 	return 0;
1335 
1336 e_nobufs:
1337 	return -ENOBUFS;
1338 e_inval:
1339 	return -EINVAL;
1340 e_err:
1341 	return err;
1342 }
1343 
1344 
1345 static void ip_handle_martian_source(struct net_device *dev,
1346 				     struct in_device *in_dev,
1347 				     struct sk_buff *skb,
1348 				     __be32 daddr,
1349 				     __be32 saddr)
1350 {
1351 	RT_CACHE_STAT_INC(in_martian_src);
1352 #ifdef CONFIG_IP_ROUTE_VERBOSE
1353 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1354 		/*
1355 		 *	RFC1812 recommendation: if the source is martian,
1356 		 *	the only hint we can log is the MAC header.
1357 		 */
1358 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1359 			&daddr, &saddr, dev->name);
1360 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1361 			print_hex_dump(KERN_WARNING, "ll header: ",
1362 				       DUMP_PREFIX_OFFSET, 16, 1,
1363 				       skb_mac_header(skb),
1364 				       dev->hard_header_len, true);
1365 		}
1366 	}
1367 #endif
1368 }
1369 
1370 /* called in rcu_read_lock() section */
1371 static int __mkroute_input(struct sk_buff *skb,
1372 			   const struct fib_result *res,
1373 			   struct in_device *in_dev,
1374 			   __be32 daddr, __be32 saddr, u32 tos)
1375 {
1376 	struct rtable *rth;
1377 	int err;
1378 	struct in_device *out_dev;
1379 	unsigned int flags = 0;
1380 	bool do_cache;
1381 	u32 itag;
1382 
1383 	/* get a working reference to the output device */
1384 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1385 	if (out_dev == NULL) {
1386 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1387 		return -EINVAL;
1388 	}
1389 
1390 
1391 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1392 				  in_dev->dev, in_dev, &itag);
1393 	if (err < 0) {
1394 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1395 					 saddr);
1396 
1397 		goto cleanup;
1398 	}
1399 
1400 	if (out_dev == in_dev && err &&
1401 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1402 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1403 		flags |= RTCF_DOREDIRECT;
1404 
1405 	if (skb->protocol != htons(ETH_P_IP)) {
1406 		/* Not IP (i.e. ARP). Do not create a route if it is
1407 		 * invalid for proxy arp. DNAT routes are always valid.
1408 		 *
1409 		 * The proxy arp feature has been extended to allow ARP
1410 		 * replies back out the same interface, to support
1411 		 * Private VLAN switch technologies. See arp.c.
1412 		 */
1413 		if (out_dev == in_dev &&
1414 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1415 			err = -EINVAL;
1416 			goto cleanup;
1417 		}
1418 	}
1419 
1420 	do_cache = false;
1421 	if (res->fi) {
1422 		if (!itag) {
1423 			rth = FIB_RES_NH(*res).nh_rth_input;
1424 			if (rt_cache_valid(rth)) {
1425 				skb_dst_set_noref(skb, &rth->dst);
1426 				goto out;
1427 			}
1428 			do_cache = true;
1429 		}
1430 	}
1431 
1432 	rth = rt_dst_alloc(out_dev->dev,
1433 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1434 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1435 	if (!rth) {
1436 		err = -ENOBUFS;
1437 		goto cleanup;
1438 	}
1439 
1440 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1441 	rth->rt_flags = flags;
1442 	rth->rt_type = res->type;
1443 	rth->rt_is_input = 1;
1444 	rth->rt_iif 	= 0;
1445 	rth->rt_pmtu	= 0;
1446 	rth->rt_gateway	= 0;
1447 
1448 	rth->dst.input = ip_forward;
1449 	rth->dst.output = ip_output;
1450 
1451 	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1452 	skb_dst_set(skb, &rth->dst);
1453 out:
1454 	err = 0;
1455  cleanup:
1456 	return err;
1457 }
1458 
1459 static int ip_mkroute_input(struct sk_buff *skb,
1460 			    struct fib_result *res,
1461 			    const struct flowi4 *fl4,
1462 			    struct in_device *in_dev,
1463 			    __be32 daddr, __be32 saddr, u32 tos)
1464 {
1465 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1466 	if (res->fi && res->fi->fib_nhs > 1)
1467 		fib_select_multipath(res);
1468 #endif
1469 
1470 	/* create a routing cache entry */
1471 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1472 }
1473 
1474 /*
1475  *	NOTE. We drop all packets that have a local source
1476  *	address, because every properly looped-back packet
1477  *	must already have the correct destination attached by the output routine.
1478  *
1479  *	This approach solves two big problems:
1480  *	1. Non-simplex devices are handled properly.
1481  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1482  *	Called with rcu_read_lock().
1483  */
1484 
1485 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1486 			       u8 tos, struct net_device *dev)
1487 {
1488 	struct fib_result res;
1489 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1490 	struct flowi4	fl4;
1491 	unsigned int	flags = 0;
1492 	u32		itag = 0;
1493 	struct rtable	*rth;
1494 	int		err = -EINVAL;
1495 	struct net    *net = dev_net(dev);
1496 	bool do_cache;
1497 
1498 	/* IP on this device is disabled. */
1499 
1500 	if (!in_dev)
1501 		goto out;
1502 
1503 	/* Check for the weirdest martians, which cannot be detected
1504 	   by fib_lookup.
1505 	 */
1506 
1507 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1508 		goto martian_source;
1509 
1510 	res.fi = NULL;
1511 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1512 		goto brd_input;
1513 
1514 	/* Accept zero addresses only to limited broadcast;
1515 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1516 	 */
1517 	if (ipv4_is_zeronet(saddr))
1518 		goto martian_source;
1519 
1520 	if (ipv4_is_zeronet(daddr))
1521 		goto martian_destination;
1522 
1523 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
1524 		if (ipv4_is_loopback(daddr))
1525 			goto martian_destination;
1526 
1527 		if (ipv4_is_loopback(saddr))
1528 			goto martian_source;
1529 	}
1530 
1531 	/*
1532 	 *	Now we are ready to route packet.
1533 	 */
1534 	fl4.flowi4_oif = 0;
1535 	fl4.flowi4_iif = dev->ifindex;
1536 	fl4.flowi4_mark = skb->mark;
1537 	fl4.flowi4_tos = tos;
1538 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1539 	fl4.daddr = daddr;
1540 	fl4.saddr = saddr;
1541 	err = fib_lookup(net, &fl4, &res);
1542 	if (err != 0)
1543 		goto no_route;
1544 
1545 	RT_CACHE_STAT_INC(in_slow_tot);
1546 
1547 	if (res.type == RTN_BROADCAST)
1548 		goto brd_input;
1549 
1550 	if (res.type == RTN_LOCAL) {
1551 		err = fib_validate_source(skb, saddr, daddr, tos,
1552 					  net->loopback_dev->ifindex,
1553 					  dev, in_dev, &itag);
1554 		if (err < 0)
1555 			goto martian_source_keep_err;
1556 		goto local_input;
1557 	}
1558 
1559 	if (!IN_DEV_FORWARD(in_dev))
1560 		goto no_route;
1561 	if (res.type != RTN_UNICAST)
1562 		goto martian_destination;
1563 
1564 	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1565 out:	return err;
1566 
1567 brd_input:
1568 	if (skb->protocol != htons(ETH_P_IP))
1569 		goto e_inval;
1570 
1571 	if (!ipv4_is_zeronet(saddr)) {
1572 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1573 					  in_dev, &itag);
1574 		if (err < 0)
1575 			goto martian_source_keep_err;
1576 	}
1577 	flags |= RTCF_BROADCAST;
1578 	res.type = RTN_BROADCAST;
1579 	RT_CACHE_STAT_INC(in_brd);
1580 
1581 local_input:
1582 	do_cache = false;
1583 	if (res.fi) {
1584 		if (!itag) {
1585 			rth = FIB_RES_NH(res).nh_rth_input;
1586 			if (rt_cache_valid(rth)) {
1587 				skb_dst_set_noref(skb, &rth->dst);
1588 				err = 0;
1589 				goto out;
1590 			}
1591 			do_cache = true;
1592 		}
1593 	}
1594 
1595 	rth = rt_dst_alloc(net->loopback_dev,
1596 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1597 	if (!rth)
1598 		goto e_nobufs;
1599 
1600 	rth->dst.input= ip_local_deliver;
1601 	rth->dst.output= ip_rt_bug;
1602 #ifdef CONFIG_IP_ROUTE_CLASSID
1603 	rth->dst.tclassid = itag;
1604 #endif
1605 
1606 	rth->rt_genid = rt_genid(net);
1607 	rth->rt_flags 	= flags|RTCF_LOCAL;
1608 	rth->rt_type	= res.type;
1609 	rth->rt_is_input = 1;
1610 	rth->rt_iif	= 0;
1611 	rth->rt_pmtu	= 0;
1612 	rth->rt_gateway	= 0;
1613 	if (res.type == RTN_UNREACHABLE) {
1614 		rth->dst.input= ip_error;
1615 		rth->dst.error= -err;
1616 		rth->rt_flags 	&= ~RTCF_LOCAL;
1617 	}
1618 	if (do_cache)
1619 		rt_cache_route(&FIB_RES_NH(res), rth);
1620 	skb_dst_set(skb, &rth->dst);
1621 	err = 0;
1622 	goto out;
1623 
1624 no_route:
1625 	RT_CACHE_STAT_INC(in_no_route);
1626 	res.type = RTN_UNREACHABLE;
1627 	if (err == -ESRCH)
1628 		err = -ENETUNREACH;
1629 	goto local_input;
1630 
1631 	/*
1632 	 *	Do not cache martian addresses: they should be logged (RFC1812)
1633 	 */
1634 martian_destination:
1635 	RT_CACHE_STAT_INC(in_martian_dst);
1636 #ifdef CONFIG_IP_ROUTE_VERBOSE
1637 	if (IN_DEV_LOG_MARTIANS(in_dev))
1638 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1639 				     &daddr, &saddr, dev->name);
1640 #endif
1641 
1642 e_inval:
1643 	err = -EINVAL;
1644 	goto out;
1645 
1646 e_nobufs:
1647 	err = -ENOBUFS;
1648 	goto out;
1649 
1650 martian_source:
1651 	err = -EINVAL;
1652 martian_source_keep_err:
1653 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1654 	goto out;
1655 }
1656 
1657 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1658 			 u8 tos, struct net_device *dev)
1659 {
1660 	int res;
1661 
1662 	rcu_read_lock();
1663 
1664 	/* Multicast recognition logic was moved from the route cache to here.
1665 	   The problem was that too many Ethernet cards have broken/missing
1666 	   hardware multicast filters :-( As a result, a host on a multicast
1667 	   network acquires a lot of useless route cache entries, e.g. from
1668 	   SDR messages from all over the world. Now we try to get rid of them.
1669 	   Really, provided the software IP multicast filter is organized
1670 	   reasonably (at least, hashed), it does not result in a slowdown
1671 	   compared with route cache reject entries.
1672 	   Note that multicast routers are not affected, because a
1673 	   route cache entry is eventually created anyway.
1674 	 */
1675 	if (ipv4_is_multicast(daddr)) {
1676 		struct in_device *in_dev = __in_dev_get_rcu(dev);
1677 
1678 		if (in_dev) {
1679 			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1680 						  ip_hdr(skb)->protocol);
1681 			if (our
1682 #ifdef CONFIG_IP_MROUTE
1683 				||
1684 			    (!ipv4_is_local_multicast(daddr) &&
1685 			     IN_DEV_MFORWARD(in_dev))
1686 #endif
1687 			   ) {
1688 				int res = ip_route_input_mc(skb, daddr, saddr,
1689 							    tos, dev, our);
1690 				rcu_read_unlock();
1691 				return res;
1692 			}
1693 		}
1694 		rcu_read_unlock();
1695 		return -EINVAL;
1696 	}
1697 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1698 	rcu_read_unlock();
1699 	return res;
1700 }
1701 EXPORT_SYMBOL(ip_route_input_noref);
1702 
1703 /* called with rcu_read_lock() */
1704 static struct rtable *__mkroute_output(const struct fib_result *res,
1705 				       const struct flowi4 *fl4, int orig_oif,
1706 				       struct net_device *dev_out,
1707 				       unsigned int flags)
1708 {
1709 	struct fib_info *fi = res->fi;
1710 	struct fib_nh_exception *fnhe;
1711 	struct in_device *in_dev;
1712 	u16 type = res->type;
1713 	struct rtable *rth;
1714 
1715 	in_dev = __in_dev_get_rcu(dev_out);
1716 	if (!in_dev)
1717 		return ERR_PTR(-EINVAL);
1718 
1719 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1720 		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1721 			return ERR_PTR(-EINVAL);
1722 
1723 	if (ipv4_is_lbcast(fl4->daddr))
1724 		type = RTN_BROADCAST;
1725 	else if (ipv4_is_multicast(fl4->daddr))
1726 		type = RTN_MULTICAST;
1727 	else if (ipv4_is_zeronet(fl4->daddr))
1728 		return ERR_PTR(-EINVAL);
1729 
1730 	if (dev_out->flags & IFF_LOOPBACK)
1731 		flags |= RTCF_LOCAL;
1732 
1733 	if (type == RTN_BROADCAST) {
1734 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
1735 		fi = NULL;
1736 	} else if (type == RTN_MULTICAST) {
1737 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
1738 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1739 				     fl4->flowi4_proto))
1740 			flags &= ~RTCF_LOCAL;
1741 		/* If a multicast route does not exist, use
1742 		 * the default one, but do not gateway in this case.
1743 		 * Yes, it is a hack.
1744 		 */
1745 		if (fi && res->prefixlen < 4)
1746 			fi = NULL;
1747 	}
1748 
1749 	fnhe = NULL;
1750 	if (fi) {
1751 		fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1752 		if (!fnhe) {
1753 			rth = FIB_RES_NH(*res).nh_rth_output;
1754 			if (rt_cache_valid(rth)) {
1755 				dst_hold(&rth->dst);
1756 				return rth;
1757 			}
1758 		}
1759 	}
1760 	rth = rt_dst_alloc(dev_out,
1761 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1762 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
1763 			   fi && !fnhe);
1764 	if (!rth)
1765 		return ERR_PTR(-ENOBUFS);
1766 
1767 	rth->dst.output = ip_output;
1768 
1769 	rth->rt_genid = rt_genid(dev_net(dev_out));
1770 	rth->rt_flags	= flags;
1771 	rth->rt_type	= type;
1772 	rth->rt_is_input = 0;
1773 	rth->rt_iif	= orig_oif ? : 0;
1774 	rth->rt_pmtu	= 0;
1775 	rth->rt_gateway = 0;
1776 
1777 	RT_CACHE_STAT_INC(out_slow_tot);
1778 
1779 	if (flags & RTCF_LOCAL)
1780 		rth->dst.input = ip_local_deliver;
1781 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1782 		if (flags & RTCF_LOCAL &&
1783 		    !(dev_out->flags & IFF_LOOPBACK)) {
1784 			rth->dst.output = ip_mc_output;
1785 			RT_CACHE_STAT_INC(out_slow_mc);
1786 		}
1787 #ifdef CONFIG_IP_MROUTE
1788 		if (type == RTN_MULTICAST) {
1789 			if (IN_DEV_MFORWARD(in_dev) &&
1790 			    !ipv4_is_local_multicast(fl4->daddr)) {
1791 				rth->dst.input = ip_mr_input;
1792 				rth->dst.output = ip_mc_output;
1793 			}
1794 		}
1795 #endif
1796 	}
1797 
1798 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1799 
1800 	return rth;
1801 }
1802 
1803 /*
1804  * Major route resolver routine.
1805  */
1806 
1807 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1808 {
1809 	struct net_device *dev_out = NULL;
1810 	__u8 tos = RT_FL_TOS(fl4);
1811 	unsigned int flags = 0;
1812 	struct fib_result res;
1813 	struct rtable *rth;
1814 	int orig_oif;
1815 
1816 	res.tclassid	= 0;
1817 	res.fi		= NULL;
1818 	res.table	= NULL;
1819 
1820 	orig_oif = fl4->flowi4_oif;
1821 
1822 	fl4->flowi4_iif = net->loopback_dev->ifindex;
1823 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1824 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1825 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1826 
1827 	rcu_read_lock();
1828 	if (fl4->saddr) {
1829 		rth = ERR_PTR(-EINVAL);
1830 		if (ipv4_is_multicast(fl4->saddr) ||
1831 		    ipv4_is_lbcast(fl4->saddr) ||
1832 		    ipv4_is_zeronet(fl4->saddr))
1833 			goto out;
1834 
1835 		/* I removed the check for oif == dev_out->oif here.
1836 		   It was wrong for two reasons:
1837 		   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
1838 		      is assigned to multiple interfaces.
1839 		   2. Moreover, we are allowed to send packets with a saddr
1840 		      of another iface. --ANK
1841 		 */
1842 
1843 		if (fl4->flowi4_oif == 0 &&
1844 		    (ipv4_is_multicast(fl4->daddr) ||
1845 		     ipv4_is_lbcast(fl4->daddr))) {
1846 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1847 			dev_out = __ip_dev_find(net, fl4->saddr, false);
1848 			if (dev_out == NULL)
1849 				goto out;
1850 
1851 			/* Special hack: user can direct multicasts
1852 			   and limited broadcast via the necessary interface
1853 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1854 			   This hack is not just for fun, it allows
1855 			   vic, vat and friends to work.
1856 			   They bind a socket to loopback, set ttl to zero
1857 			   and expect that it will work.
1858 			   From the viewpoint of the routing cache they are broken,
1859 			   because we are not allowed to build a multicast path
1860 			   with a loopback source addr (look: the routing cache
1861 			   cannot know that ttl is zero, so the packet
1862 			   will not leave this host and the route is valid).
1863 			   Luckily, this hack is a good workaround.
1864 			 */
1865 
1866 			fl4->flowi4_oif = dev_out->ifindex;
1867 			goto make_route;
1868 		}
1869 
1870 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1871 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1872 			if (!__ip_dev_find(net, fl4->saddr, false))
1873 				goto out;
1874 		}
1875 	}
1876 
1877 
1878 	if (fl4->flowi4_oif) {
1879 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1880 		rth = ERR_PTR(-ENODEV);
1881 		if (dev_out == NULL)
1882 			goto out;
1883 
1884 		/* RACE: Check return value of inet_select_addr instead. */
1885 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1886 			rth = ERR_PTR(-ENETUNREACH);
1887 			goto out;
1888 		}
1889 		if (ipv4_is_local_multicast(fl4->daddr) ||
1890 		    ipv4_is_lbcast(fl4->daddr)) {
1891 			if (!fl4->saddr)
1892 				fl4->saddr = inet_select_addr(dev_out, 0,
1893 							      RT_SCOPE_LINK);
1894 			goto make_route;
1895 		}
1896 		if (fl4->saddr) {
1897 			if (ipv4_is_multicast(fl4->daddr))
1898 				fl4->saddr = inet_select_addr(dev_out, 0,
1899 							      fl4->flowi4_scope);
1900 			else if (!fl4->daddr)
1901 				fl4->saddr = inet_select_addr(dev_out, 0,
1902 							      RT_SCOPE_HOST);
1903 		}
1904 	}
1905 
1906 	if (!fl4->daddr) {
1907 		fl4->daddr = fl4->saddr;
1908 		if (!fl4->daddr)
1909 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1910 		dev_out = net->loopback_dev;
1911 		fl4->flowi4_oif = net->loopback_dev->ifindex;
1912 		res.type = RTN_LOCAL;
1913 		flags |= RTCF_LOCAL;
1914 		goto make_route;
1915 	}
1916 
1917 	if (fib_lookup(net, fl4, &res)) {
1918 		res.fi = NULL;
1919 		res.table = NULL;
1920 		if (fl4->flowi4_oif) {
1921 			/* Apparently, routing tables are wrong. Assume
1922 			   that the destination is on-link.
1923 
1924 			   WHY? DW.
1925 			   Because we are allowed to send to an iface
1926 			   even if it has NO routes and NO assigned
1927 			   addresses. When oif is specified, routing
1928 			   tables are looked up with only one purpose:
1929 			   to catch if the destination is gatewayed, rather than
1930 			   direct. Moreover, if MSG_DONTROUTE is set,
1931 			   we send the packet, ignoring both routing tables
1932 			   and ifaddr state. --ANK
1933 
1934 
1935 			   We could make this work even if oif is unknown,
1936 			   likely as IPv6 does, but we do not.
1937 			 */
1938 
1939 			if (fl4->saddr == 0)
1940 				fl4->saddr = inet_select_addr(dev_out, 0,
1941 							      RT_SCOPE_LINK);
1942 			res.type = RTN_UNICAST;
1943 			goto make_route;
1944 		}
1945 		rth = ERR_PTR(-ENETUNREACH);
1946 		goto out;
1947 	}
1948 
1949 	if (res.type == RTN_LOCAL) {
1950 		if (!fl4->saddr) {
1951 			if (res.fi->fib_prefsrc)
1952 				fl4->saddr = res.fi->fib_prefsrc;
1953 			else
1954 				fl4->saddr = fl4->daddr;
1955 		}
1956 		dev_out = net->loopback_dev;
1957 		fl4->flowi4_oif = dev_out->ifindex;
1958 		res.fi = NULL;
1959 		flags |= RTCF_LOCAL;
1960 		goto make_route;
1961 	}
1962 
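	/* A fib_lookup() result is in hand: pick one of the multipath
	 * next hops or one of several default routes where applicable,
	 * then derive the source address and output device from it. */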
1963 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1964 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
1965 		fib_select_multipath(&res);
1966 	else
1967 #endif
1968 	if (!res.prefixlen &&
1969 	    res.table->tb_num_default > 1 &&
1970 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
1971 		fib_select_default(&res);
1972 
1973 	if (!fl4->saddr)
1974 		fl4->saddr = FIB_RES_PREFSRC(net, res);
1975 
1976 	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;

1980 make_route:
1981 	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
1982 
1983 out:
1984 	rcu_read_unlock();
1985 	return rth;
1986 }
1987 EXPORT_SYMBOL_GPL(__ip_route_output_key);
1988 
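/* dst_ops for blackhole routes: every method is a no-op or returns
 * NULL, so such a dst can be held and released normally but never
 * revalidates, re-routes, or updates PMTU state. */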
1989 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
1990 {
1991 	return NULL;
1992 }
1993 
1994 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
1995 {
1996 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1997 
1998 	return mtu ? : dst->dev->mtu;
1999 }
2000 
2001 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2002 					  struct sk_buff *skb, u32 mtu)
2003 {
2004 }
2005 
2006 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2007 				       struct sk_buff *skb)
2008 {
2009 }
2010 
2011 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2012 					  unsigned long old)
2013 {
2014 	return NULL;
2015 }
2016 
2017 static struct dst_ops ipv4_dst_blackhole_ops = {
2018 	.family			=	AF_INET,
2019 	.protocol		=	cpu_to_be16(ETH_P_IP),
2020 	.check			=	ipv4_blackhole_dst_check,
2021 	.mtu			=	ipv4_blackhole_mtu,
2022 	.default_advmss		=	ipv4_default_advmss,
2023 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2024 	.redirect		=	ipv4_rt_blackhole_redirect,
2025 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2026 	.neigh_lookup		=	ipv4_neigh_lookup,
2027 };
2028 
2029 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2030 {
2031 	struct rtable *ort = (struct rtable *) dst_orig;
2032 	struct rtable *rt;
2033 
2034 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2035 	if (rt) {
2036 		struct dst_entry *new = &rt->dst;
2037 
2038 		new->__use = 1;
2039 		new->input = dst_discard;
2040 		new->output = dst_discard;
2041 
2042 		new->dev = ort->dst.dev;
2043 		if (new->dev)
2044 			dev_hold(new->dev);
2045 
2046 		rt->rt_is_input = ort->rt_is_input;
2047 		rt->rt_iif = ort->rt_iif;
2048 		rt->rt_pmtu = ort->rt_pmtu;
2049 
2050 		rt->rt_genid = rt_genid(net);
2051 		rt->rt_flags = ort->rt_flags;
2052 		rt->rt_type = ort->rt_type;
2053 		rt->rt_gateway = ort->rt_gateway;
2054 
2055 		dst_free(new);
2056 	}
2057 
2058 	dst_release(dst_orig);
2059 
2060 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2061 }
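
/* Note (usage outside this file, as best understood): the xfrm code
 * uses ipv4_blackhole_route() to swap a socket's real route for one
 * that silently discards packets, e.g. while IPsec state is still
 * being resolved; the copied fields keep the dst self-consistent. */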
2062 
2063 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2064 				    struct sock *sk)
2065 {
2066 	struct rtable *rt = __ip_route_output_key(net, flp4);
2067 
2068 	if (IS_ERR(rt))
2069 		return rt;
2070 
2071 	if (flp4->flowi4_proto)
2072 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2073 						   flowi4_to_flowi(flp4),
2074 						   sk, 0);
2075 
2076 	return rt;
2077 }
2078 EXPORT_SYMBOL_GPL(ip_route_output_flow);
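
/* A minimal caller sketch (field values assumed, not from this file):
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */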
2079 
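/* Build an RTM_NEWROUTE netlink message describing @rt and @fl4;
 * used below to answer RTM_GETROUTE queries. */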
2080 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2081 			struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
2082 			u32 seq, int event, int nowait, unsigned int flags)
2083 {
2084 	struct rtable *rt = skb_rtable(skb);
2085 	struct rtmsg *r;
2086 	struct nlmsghdr *nlh;
2087 	unsigned long expires = 0;
2088 	u32 error;
2089 	u32 metrics[RTAX_MAX];
2090 
2091 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2092 	if (nlh == NULL)
2093 		return -EMSGSIZE;
2094 
2095 	r = nlmsg_data(nlh);
2096 	r->rtm_family	 = AF_INET;
2097 	r->rtm_dst_len	= 32;
2098 	r->rtm_src_len	= 0;
2099 	r->rtm_tos	= fl4->flowi4_tos;
2100 	r->rtm_table	= RT_TABLE_MAIN;
2101 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2102 		goto nla_put_failure;
2103 	r->rtm_type	= rt->rt_type;
2104 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2105 	r->rtm_protocol = RTPROT_UNSPEC;
2106 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2107 	if (rt->rt_flags & RTCF_NOTIFY)
2108 		r->rtm_flags |= RTM_F_NOTIFY;
2109 
2110 	if (nla_put_be32(skb, RTA_DST, dst))
2111 		goto nla_put_failure;
2112 	if (src) {
2113 		r->rtm_src_len = 32;
2114 		if (nla_put_be32(skb, RTA_SRC, src))
2115 			goto nla_put_failure;
2116 	}
2117 	if (rt->dst.dev &&
2118 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2119 		goto nla_put_failure;
2120 #ifdef CONFIG_IP_ROUTE_CLASSID
2121 	if (rt->dst.tclassid &&
2122 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2123 		goto nla_put_failure;
2124 #endif
2125 	if (!rt_is_input_route(rt) &&
2126 	    fl4->saddr != src) {
2127 		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2128 			goto nla_put_failure;
2129 	}
2130 	if (rt->rt_gateway &&
2131 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2132 		goto nla_put_failure;
2133 
2134 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2135 	if (rt->rt_pmtu)
2136 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2137 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2138 		goto nla_put_failure;
2139 
	if (fl4->flowi4_mark &&
	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;
2143 
2144 	error = rt->dst.error;
2145 	expires = rt->dst.expires;
2146 	if (expires) {
2147 		if (time_before(jiffies, expires))
2148 			expires -= jiffies;
2149 		else
2150 			expires = 0;
2151 	}
2152 
2153 	if (rt_is_input_route(rt)) {
2154 		if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2155 			goto nla_put_failure;
2156 	}
2157 
2158 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2159 		goto nla_put_failure;
2160 
2161 	return nlmsg_end(skb, nlh);
2162 
2163 nla_put_failure:
2164 	nlmsg_cancel(skb, nlh);
2165 	return -EMSGSIZE;
2166 }
2167 
2168 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2169 {
2170 	struct net *net = sock_net(in_skb->sk);
2171 	struct rtmsg *rtm;
2172 	struct nlattr *tb[RTA_MAX+1];
2173 	struct rtable *rt = NULL;
2174 	struct flowi4 fl4;
2175 	__be32 dst = 0;
2176 	__be32 src = 0;
2177 	u32 iif;
2178 	int err;
2179 	int mark;
2180 	struct sk_buff *skb;
2181 
2182 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2183 	if (err < 0)
2184 		goto errout;
2185 
2186 	rtm = nlmsg_data(nlh);
2187 
2188 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2189 	if (skb == NULL) {
2190 		err = -ENOBUFS;
2191 		goto errout;
2192 	}
2193 
	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
2197 	skb_reset_mac_header(skb);
2198 	skb_reset_network_header(skb);
2199 
2200 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2201 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2202 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2203 
2204 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2205 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2206 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2207 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2208 
2209 	memset(&fl4, 0, sizeof(fl4));
2210 	fl4.daddr = dst;
2211 	fl4.saddr = src;
2212 	fl4.flowi4_tos = rtm->rtm_tos;
2213 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2214 	fl4.flowi4_mark = mark;
2215 
2216 	if (iif) {
2217 		struct net_device *dev;
2218 
2219 		dev = __dev_get_by_index(net, iif);
2220 		if (dev == NULL) {
2221 			err = -ENODEV;
2222 			goto errout_free;
2223 		}
2224 
2225 		skb->protocol	= htons(ETH_P_IP);
2226 		skb->dev	= dev;
2227 		skb->mark	= mark;
2228 		local_bh_disable();
2229 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2230 		local_bh_enable();
2231 
2232 		rt = skb_rtable(skb);
2233 		if (err == 0 && rt->dst.error)
2234 			err = -rt->dst.error;
2235 	} else {
2236 		rt = ip_route_output_key(net, &fl4);
2237 
2238 		err = 0;
2239 		if (IS_ERR(rt))
2240 			err = PTR_ERR(rt);
2241 	}
2242 
2243 	if (err)
2244 		goto errout_free;
2245 
2246 	skb_dst_set(skb, &rt->dst);
2247 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2248 		rt->rt_flags |= RTCF_NOTIFY;
2249 
2250 	err = rt_fill_info(net, dst, src, &fl4, skb,
2251 			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2252 			   RTM_NEWROUTE, 0, 0);
2253 	if (err <= 0)
2254 		goto errout_free;
2255 
2256 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2257 errout:
2258 	return err;
2259 
2260 errout_free:
2261 	kfree_skb(skb);
2262 	goto errout;
2263 }
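
/* This is the handler behind, e.g., "ip route get 203.0.113.1":
 * iproute2 sends RTM_GETROUTE, the kernel resolves the route with
 * ip_route_input()/ip_route_output_key() and replies with the
 * message built by rt_fill_info() above. */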
2264 
2265 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2266 {
2267 	return skb->len;
2268 }
2269 
2270 void ip_rt_multicast_event(struct in_device *in_dev)
2271 {
2272 	rt_cache_flush(dev_net(in_dev->dev), 0);
2273 }
2274 
2275 #ifdef CONFIG_SYSCTL
2276 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2277 					void __user *buffer,
2278 					size_t *lenp, loff_t *ppos)
2279 {
2280 	if (write) {
2281 		int flush_delay;
2282 		ctl_table ctl;
2283 		struct net *net;
2284 
2285 		memcpy(&ctl, __ctl, sizeof(ctl));
2286 		ctl.data = &flush_delay;
2287 		proc_dointvec(&ctl, write, buffer, lenp, ppos);
2288 
2289 		net = (struct net *)__ctl->extra1;
2290 		rt_cache_flush(net, flush_delay);
2291 		return 0;
2292 	}
2293 
2294 	return -EINVAL;
2295 }
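
/* The handler above backs the write-only file
 * /proc/sys/net/ipv4/route/flush; the written integer is parsed as
 * the flush delay, e.g.:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 */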
2296 
2297 static ctl_table ipv4_route_table[] = {
2298 	{
2299 		.procname	= "gc_thresh",
2300 		.data		= &ipv4_dst_ops.gc_thresh,
2301 		.maxlen		= sizeof(int),
2302 		.mode		= 0644,
2303 		.proc_handler	= proc_dointvec,
2304 	},
2305 	{
2306 		.procname	= "max_size",
2307 		.data		= &ip_rt_max_size,
2308 		.maxlen		= sizeof(int),
2309 		.mode		= 0644,
2310 		.proc_handler	= proc_dointvec,
2311 	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
2316 		.data		= &ip_rt_gc_min_interval,
2317 		.maxlen		= sizeof(int),
2318 		.mode		= 0644,
2319 		.proc_handler	= proc_dointvec_jiffies,
2320 	},
2321 	{
2322 		.procname	= "gc_min_interval_ms",
2323 		.data		= &ip_rt_gc_min_interval,
2324 		.maxlen		= sizeof(int),
2325 		.mode		= 0644,
2326 		.proc_handler	= proc_dointvec_ms_jiffies,
2327 	},
2328 	{
2329 		.procname	= "gc_timeout",
2330 		.data		= &ip_rt_gc_timeout,
2331 		.maxlen		= sizeof(int),
2332 		.mode		= 0644,
2333 		.proc_handler	= proc_dointvec_jiffies,
2334 	},
2335 	{
2336 		.procname	= "gc_interval",
2337 		.data		= &ip_rt_gc_interval,
2338 		.maxlen		= sizeof(int),
2339 		.mode		= 0644,
2340 		.proc_handler	= proc_dointvec_jiffies,
2341 	},
2342 	{
2343 		.procname	= "redirect_load",
2344 		.data		= &ip_rt_redirect_load,
2345 		.maxlen		= sizeof(int),
2346 		.mode		= 0644,
2347 		.proc_handler	= proc_dointvec,
2348 	},
2349 	{
2350 		.procname	= "redirect_number",
2351 		.data		= &ip_rt_redirect_number,
2352 		.maxlen		= sizeof(int),
2353 		.mode		= 0644,
2354 		.proc_handler	= proc_dointvec,
2355 	},
2356 	{
2357 		.procname	= "redirect_silence",
2358 		.data		= &ip_rt_redirect_silence,
2359 		.maxlen		= sizeof(int),
2360 		.mode		= 0644,
2361 		.proc_handler	= proc_dointvec,
2362 	},
2363 	{
2364 		.procname	= "error_cost",
2365 		.data		= &ip_rt_error_cost,
2366 		.maxlen		= sizeof(int),
2367 		.mode		= 0644,
2368 		.proc_handler	= proc_dointvec,
2369 	},
2370 	{
2371 		.procname	= "error_burst",
2372 		.data		= &ip_rt_error_burst,
2373 		.maxlen		= sizeof(int),
2374 		.mode		= 0644,
2375 		.proc_handler	= proc_dointvec,
2376 	},
2377 	{
2378 		.procname	= "gc_elasticity",
2379 		.data		= &ip_rt_gc_elasticity,
2380 		.maxlen		= sizeof(int),
2381 		.mode		= 0644,
2382 		.proc_handler	= proc_dointvec,
2383 	},
2384 	{
2385 		.procname	= "mtu_expires",
2386 		.data		= &ip_rt_mtu_expires,
2387 		.maxlen		= sizeof(int),
2388 		.mode		= 0644,
2389 		.proc_handler	= proc_dointvec_jiffies,
2390 	},
2391 	{
2392 		.procname	= "min_pmtu",
2393 		.data		= &ip_rt_min_pmtu,
2394 		.maxlen		= sizeof(int),
2395 		.mode		= 0644,
2396 		.proc_handler	= proc_dointvec,
2397 	},
2398 	{
2399 		.procname	= "min_adv_mss",
2400 		.data		= &ip_rt_min_advmss,
2401 		.maxlen		= sizeof(int),
2402 		.mode		= 0644,
2403 		.proc_handler	= proc_dointvec,
2404 	},
2405 	{ }
2406 };
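
/* Everything in the table above is exposed under
 * /proc/sys/net/ipv4/route/ (registered in ip_static_sysctl_init()
 * below), e.g.:
 *
 *	sysctl net.ipv4.route.gc_timeout
 */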
2407 
2408 static struct ctl_table ipv4_route_flush_table[] = {
2409 	{
2410 		.procname	= "flush",
2411 		.maxlen		= sizeof(int),
2412 		.mode		= 0200,
2413 		.proc_handler	= ipv4_sysctl_rtcache_flush,
2414 	},
2415 	{ },
2416 };
2417 
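/* Per-netns sysctl registration: init_net uses the static flush table
 * directly, while every other namespace gets a kmemdup() copy so that
 * tbl[0].extra1 can point at its own struct net. */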
2418 static __net_init int sysctl_route_net_init(struct net *net)
2419 {
2420 	struct ctl_table *tbl;
2421 
2422 	tbl = ipv4_route_flush_table;
2423 	if (!net_eq(net, &init_net)) {
2424 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2425 		if (tbl == NULL)
2426 			goto err_dup;
2427 	}
2428 	tbl[0].extra1 = net;
2429 
2430 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2431 	if (net->ipv4.route_hdr == NULL)
2432 		goto err_reg;
2433 	return 0;
2434 
2435 err_reg:
2436 	if (tbl != ipv4_route_flush_table)
2437 		kfree(tbl);
2438 err_dup:
2439 	return -ENOMEM;
2440 }
2441 
2442 static __net_exit void sysctl_route_net_exit(struct net *net)
2443 {
2444 	struct ctl_table *tbl;
2445 
2446 	tbl = net->ipv4.route_hdr->ctl_table_arg;
2447 	unregister_net_sysctl_table(net->ipv4.route_hdr);
2448 	BUG_ON(tbl == ipv4_route_flush_table);
2449 	kfree(tbl);
2450 }
2451 
2452 static __net_initdata struct pernet_operations sysctl_route_ops = {
2453 	.init = sysctl_route_net_init,
2454 	.exit = sysctl_route_net_exit,
2455 };
2456 #endif
2457 
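/* Seed the per-netns generation counters randomly so that stale
 * cached state can never accidentally carry a matching generation id
 * from before this namespace existed. */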
2458 static __net_init int rt_genid_init(struct net *net)
2459 {
2460 	get_random_bytes(&net->ipv4.rt_genid,
2461 			 sizeof(net->ipv4.rt_genid));
2462 	get_random_bytes(&net->ipv4.dev_addr_genid,
2463 			 sizeof(net->ipv4.dev_addr_genid));
2464 	return 0;
2465 }
2466 
2467 static __net_initdata struct pernet_operations rt_genid_ops = {
2468 	.init = rt_genid_init,
2469 };
2470 
2471 static int __net_init ipv4_inetpeer_init(struct net *net)
2472 {
2473 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2474 
2475 	if (!bp)
2476 		return -ENOMEM;
2477 	inet_peer_base_init(bp);
2478 	net->ipv4.peers = bp;
2479 	return 0;
2480 }
2481 
2482 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2483 {
2484 	struct inet_peer_base *bp = net->ipv4.peers;
2485 
2486 	net->ipv4.peers = NULL;
2487 	inetpeer_invalidate_tree(bp);
2488 	kfree(bp);
2489 }
2490 
2491 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2492 	.init	=	ipv4_inetpeer_init,
2493 	.exit	=	ipv4_inetpeer_exit,
2494 };
2495 
2496 #ifdef CONFIG_IP_ROUTE_CLASSID
2497 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2498 #endif /* CONFIG_IP_ROUTE_CLASSID */
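
/* ip_rt_acct above is the per-CPU realm/classid accounting area;
 * ip_rt_init() below allocates 256 entries, apparently one per
 * possible 8-bit realm value used as an index. */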
2499 
2500 int __init ip_rt_init(void)
2501 {
2502 	int rc = 0;
2503 
2504 #ifdef CONFIG_IP_ROUTE_CLASSID
2505 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2506 	if (!ip_rt_acct)
2507 		panic("IP: failed to allocate ip_rt_acct\n");
2508 #endif
2509 
2510 	ipv4_dst_ops.kmem_cachep =
2511 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2512 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2513 
2514 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2515 
2516 	if (dst_entries_init(&ipv4_dst_ops) < 0)
2517 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
2518 
2519 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2520 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2521 
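	/* Effectively disable dst garbage collection: the threshold can
	 * never be reached and the size cap never rejects an entry. */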
2522 	ipv4_dst_ops.gc_thresh = ~0;
2523 	ip_rt_max_size = INT_MAX;
2524 
2525 	devinet_init();
2526 	ip_fib_init();
2527 
2528 	if (ip_rt_proc_init())
2529 		pr_err("Unable to create route proc files\n");
2530 #ifdef CONFIG_XFRM
2531 	xfrm_init();
2532 	xfrm4_init(ip_rt_max_size);
2533 #endif
2534 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2535 
2536 #ifdef CONFIG_SYSCTL
2537 	register_pernet_subsys(&sysctl_route_ops);
2538 #endif
2539 	register_pernet_subsys(&rt_genid_ops);
2540 	register_pernet_subsys(&ipv4_inetpeer_ops);
2541 	return rc;
2542 }
2543 
2544 #ifdef CONFIG_SYSCTL
2545 /*
2546  * We really need to sanitize the damn ipv4 init order, then all
2547  * this nonsense will go away.
2548  */
2549 void __init ip_static_sysctl_init(void)
2550 {
2551 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2552 }
2553 #endif
2554