xref: /linux/net/ipv4/route.c (revision d96fc832bcb6269d96e33d506f33033d7ed08598)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD
35  *					though our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
112 #endif
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
116 
117 #include "fib_lookup.h"
118 
119 #define RT_FL_TOS(oldflp4) \
120 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
121 
122 #define RT_GC_TIMEOUT (300*HZ)
123 
124 static int ip_rt_max_size;
125 static int ip_rt_redirect_number __read_mostly	= 9;
126 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly	= HZ;
129 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
130 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly	= 256;
133 
134 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
135 /*
136  *	Interface to generic destination cache.
137  */
138 
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
141 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143 static void		 ipv4_link_failure(struct sk_buff *skb);
144 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
145 					   struct sk_buff *skb, u32 mtu);
146 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147 					struct sk_buff *skb);
148 static void		ipv4_dst_destroy(struct dst_entry *dst);
149 
150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151 {
152 	WARN_ON(1);
153 	return NULL;
154 }
155 
156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 					   struct sk_buff *skb,
158 					   const void *daddr);
159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
160 
161 static struct dst_ops ipv4_dst_ops = {
162 	.family =		AF_INET,
163 	.check =		ipv4_dst_check,
164 	.default_advmss =	ipv4_default_advmss,
165 	.mtu =			ipv4_mtu,
166 	.cow_metrics =		ipv4_cow_metrics,
167 	.destroy =		ipv4_dst_destroy,
168 	.negative_advice =	ipv4_negative_advice,
169 	.link_failure =		ipv4_link_failure,
170 	.update_pmtu =		ip_rt_update_pmtu,
171 	.redirect =		ip_do_redirect,
172 	.local_out =		__ip_local_out,
173 	.neigh_lookup =		ipv4_neigh_lookup,
174 	.confirm_neigh =	ipv4_confirm_neigh,
175 };
176 
177 #define ECN_OR_COST(class)	TC_PRIO_##class
178 
179 const __u8 ip_tos2prio[16] = {
180 	TC_PRIO_BESTEFFORT,
181 	ECN_OR_COST(BESTEFFORT),
182 	TC_PRIO_BESTEFFORT,
183 	ECN_OR_COST(BESTEFFORT),
184 	TC_PRIO_BULK,
185 	ECN_OR_COST(BULK),
186 	TC_PRIO_BULK,
187 	ECN_OR_COST(BULK),
188 	TC_PRIO_INTERACTIVE,
189 	ECN_OR_COST(INTERACTIVE),
190 	TC_PRIO_INTERACTIVE,
191 	ECN_OR_COST(INTERACTIVE),
192 	TC_PRIO_INTERACTIVE_BULK,
193 	ECN_OR_COST(INTERACTIVE_BULK),
194 	TC_PRIO_INTERACTIVE_BULK,
195 	ECN_OR_COST(INTERACTIVE_BULK)
196 };
197 EXPORT_SYMBOL(ip_tos2prio);
198 
199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
201 
202 #ifdef CONFIG_PROC_FS
203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204 {
205 	if (*pos)
206 		return NULL;
207 	return SEQ_START_TOKEN;
208 }
209 
210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211 {
212 	++*pos;
213 	return NULL;
214 }
215 
216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
217 {
218 }
219 
220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
221 {
222 	if (v == SEQ_START_TOKEN)
223 		seq_printf(seq, "%-127s\n",
224 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226 			   "HHUptod\tSpecDst");
227 	return 0;
228 }
229 
230 static const struct seq_operations rt_cache_seq_ops = {
231 	.start  = rt_cache_seq_start,
232 	.next   = rt_cache_seq_next,
233 	.stop   = rt_cache_seq_stop,
234 	.show   = rt_cache_seq_show,
235 };
236 
237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
238 {
239 	return seq_open(file, &rt_cache_seq_ops);
240 }
241 
242 static const struct file_operations rt_cache_seq_fops = {
243 	.open	 = rt_cache_seq_open,
244 	.read	 = seq_read,
245 	.llseek	 = seq_lseek,
246 	.release = seq_release,
247 };
248 
249 
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252 	int cpu;
253 
254 	if (*pos == 0)
255 		return SEQ_START_TOKEN;
256 
257 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258 		if (!cpu_possible(cpu))
259 			continue;
260 		*pos = cpu+1;
261 		return &per_cpu(rt_cache_stat, cpu);
262 	}
263 	return NULL;
264 }
265 
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268 	int cpu;
269 
270 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271 		if (!cpu_possible(cpu))
272 			continue;
273 		*pos = cpu+1;
274 		return &per_cpu(rt_cache_stat, cpu);
275 	}
276 	return NULL;
277 
278 }
279 
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282 
283 }
284 
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287 	struct rt_cache_stat *st = v;
288 
289 	if (v == SEQ_START_TOKEN) {
290 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 		return 0;
292 	}
293 
294 	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
295 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 		   dst_entries_get_slow(&ipv4_dst_ops),
297 		   0, /* st->in_hit */
298 		   st->in_slow_tot,
299 		   st->in_slow_mc,
300 		   st->in_no_route,
301 		   st->in_brd,
302 		   st->in_martian_dst,
303 		   st->in_martian_src,
304 
305 		   0, /* st->out_hit */
306 		   st->out_slow_tot,
307 		   st->out_slow_mc,
308 
309 		   0, /* st->gc_total */
310 		   0, /* st->gc_ignored */
311 		   0, /* st->gc_goal_miss */
312 		   0, /* st->gc_dst_overflow */
313 		   0, /* st->in_hlist_search */
314 		   0  /* st->out_hlist_search */
315 		);
316 	return 0;
317 }
318 
319 static const struct seq_operations rt_cpu_seq_ops = {
320 	.start  = rt_cpu_seq_start,
321 	.next   = rt_cpu_seq_next,
322 	.stop   = rt_cpu_seq_stop,
323 	.show   = rt_cpu_seq_show,
324 };
325 
326 
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329 	return seq_open(file, &rt_cpu_seq_ops);
330 }
331 
332 static const struct file_operations rt_cpu_seq_fops = {
333 	.open	 = rt_cpu_seq_open,
334 	.read	 = seq_read,
335 	.llseek	 = seq_lseek,
336 	.release = seq_release,
337 };
338 
339 #ifdef CONFIG_IP_ROUTE_CLASSID
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 {
342 	struct ip_rt_acct *dst, *src;
343 	unsigned int i, j;
344 
345 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346 	if (!dst)
347 		return -ENOMEM;
348 
349 	for_each_possible_cpu(i) {
350 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351 		for (j = 0; j < 256; j++) {
352 			dst[j].o_bytes   += src[j].o_bytes;
353 			dst[j].o_packets += src[j].o_packets;
354 			dst[j].i_bytes   += src[j].i_bytes;
355 			dst[j].i_packets += src[j].i_packets;
356 		}
357 	}
358 
359 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360 	kfree(dst);
361 	return 0;
362 }
363 
364 static int rt_acct_proc_open(struct inode *inode, struct file *file)
365 {
366 	return single_open(file, rt_acct_proc_show, NULL);
367 }
368 
369 static const struct file_operations rt_acct_proc_fops = {
370 	.open		= rt_acct_proc_open,
371 	.read		= seq_read,
372 	.llseek		= seq_lseek,
373 	.release	= single_release,
374 };
375 #endif
376 
377 static int __net_init ip_rt_do_proc_init(struct net *net)
378 {
379 	struct proc_dir_entry *pde;
380 
381 	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
382 			  &rt_cache_seq_fops);
383 	if (!pde)
384 		goto err1;
385 
386 	pde = proc_create("rt_cache", S_IRUGO,
387 			  net->proc_net_stat, &rt_cpu_seq_fops);
388 	if (!pde)
389 		goto err2;
390 
391 #ifdef CONFIG_IP_ROUTE_CLASSID
392 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
393 	if (!pde)
394 		goto err3;
395 #endif
396 	return 0;
397 
398 #ifdef CONFIG_IP_ROUTE_CLASSID
399 err3:
400 	remove_proc_entry("rt_cache", net->proc_net_stat);
401 #endif
402 err2:
403 	remove_proc_entry("rt_cache", net->proc_net);
404 err1:
405 	return -ENOMEM;
406 }
407 
408 static void __net_exit ip_rt_do_proc_exit(struct net *net)
409 {
410 	remove_proc_entry("rt_cache", net->proc_net_stat);
411 	remove_proc_entry("rt_cache", net->proc_net);
412 #ifdef CONFIG_IP_ROUTE_CLASSID
413 	remove_proc_entry("rt_acct", net->proc_net);
414 #endif
415 }
416 
417 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
418 	.init = ip_rt_do_proc_init,
419 	.exit = ip_rt_do_proc_exit,
420 	.async = true,
421 };
422 
423 static int __init ip_rt_proc_init(void)
424 {
425 	return register_pernet_subsys(&ip_rt_proc_ops);
426 }
427 
428 #else
429 static inline int ip_rt_proc_init(void)
430 {
431 	return 0;
432 }
433 #endif /* CONFIG_PROC_FS */
434 
435 static inline bool rt_is_expired(const struct rtable *rth)
436 {
437 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
438 }
439 
440 void rt_cache_flush(struct net *net)
441 {
442 	rt_genid_bump_ipv4(net);
443 }
444 
445 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
446 					   struct sk_buff *skb,
447 					   const void *daddr)
448 {
449 	struct net_device *dev = dst->dev;
450 	const __be32 *pkey = daddr;
451 	const struct rtable *rt;
452 	struct neighbour *n;
453 
454 	rt = (const struct rtable *) dst;
455 	if (rt->rt_gateway)
456 		pkey = (const __be32 *) &rt->rt_gateway;
457 	else if (skb)
458 		pkey = &ip_hdr(skb)->daddr;
459 
460 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
461 	if (n)
462 		return n;
463 	return neigh_create(&arp_tbl, pkey, dev);
464 }
465 
466 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
467 {
468 	struct net_device *dev = dst->dev;
469 	const __be32 *pkey = daddr;
470 	const struct rtable *rt;
471 
472 	rt = (const struct rtable *)dst;
473 	if (rt->rt_gateway)
474 		pkey = (const __be32 *)&rt->rt_gateway;
475 	else if (!daddr ||
476 		 (rt->rt_flags &
477 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
478 		return;
479 
480 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
481 }
482 
483 #define IP_IDENTS_SZ 2048u
484 
485 static atomic_t *ip_idents __read_mostly;
486 static u32 *ip_tstamps __read_mostly;
487 
488 /* In order to protect privacy, we add a perturbation to identifiers
489  * if one generator is seldom used. This makes it hard for an attacker
490  * to infer how many packets were sent between two points in time.
491  */
492 u32 ip_idents_reserve(u32 hash, int segs)
493 {
494 	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
495 	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
496 	u32 old = READ_ONCE(*p_tstamp);
497 	u32 now = (u32)jiffies;
498 	u32 new, delta = 0;
499 
500 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
501 		delta = prandom_u32_max(now - old);
502 
503 	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
504 	do {
505 		old = (u32)atomic_read(p_id);
506 		new = old + delta + segs;
507 	} while (atomic_cmpxchg(p_id, old, new) != old);
508 
509 	return new - segs;
510 }
511 EXPORT_SYMBOL(ip_idents_reserve);
512 
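/* Pick an IP ID for this header: hash (daddr, saddr, protocol) with a
 * per-boot random key to select one of the IP_IDENTS_SZ generators, then
 * reserve 'segs' consecutive IDs from it and store the first in iph->id.
 */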
513 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
514 {
515 	static u32 ip_idents_hashrnd __read_mostly;
516 	u32 hash, id;
517 
518 	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
519 
520 	hash = jhash_3words((__force u32)iph->daddr,
521 			    (__force u32)iph->saddr,
522 			    iph->protocol ^ net_hash_mix(net),
523 			    ip_idents_hashrnd);
524 	id = ip_idents_reserve(hash, segs);
525 	iph->id = htons(id);
526 }
527 EXPORT_SYMBOL(__ip_select_ident);
528 
529 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
530 			     const struct sock *sk,
531 			     const struct iphdr *iph,
532 			     int oif, u8 tos,
533 			     u8 prot, u32 mark, int flow_flags)
534 {
535 	if (sk) {
536 		const struct inet_sock *inet = inet_sk(sk);
537 
538 		oif = sk->sk_bound_dev_if;
539 		mark = sk->sk_mark;
540 		tos = RT_CONN_FLAGS(sk);
541 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
542 	}
543 	flowi4_init_output(fl4, oif, mark, tos,
544 			   RT_SCOPE_UNIVERSE, prot,
545 			   flow_flags,
546 			   iph->daddr, iph->saddr, 0, 0,
547 			   sock_net_uid(net, sk));
548 }
549 
550 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
551 			       const struct sock *sk)
552 {
553 	const struct net *net = dev_net(skb->dev);
554 	const struct iphdr *iph = ip_hdr(skb);
555 	int oif = skb->dev->ifindex;
556 	u8 tos = RT_TOS(iph->tos);
557 	u8 prot = iph->protocol;
558 	u32 mark = skb->mark;
559 
560 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
561 }
562 
563 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
564 {
565 	const struct inet_sock *inet = inet_sk(sk);
566 	const struct ip_options_rcu *inet_opt;
567 	__be32 daddr = inet->inet_daddr;
568 
569 	rcu_read_lock();
570 	inet_opt = rcu_dereference(inet->inet_opt);
571 	if (inet_opt && inet_opt->opt.srr)
572 		daddr = inet_opt->opt.faddr;
573 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
574 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
575 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
576 			   inet_sk_flowi_flags(sk),
577 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
578 	rcu_read_unlock();
579 }
580 
581 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
582 				 const struct sk_buff *skb)
583 {
584 	if (skb)
585 		build_skb_flow_key(fl4, skb, sk);
586 	else
587 		build_sk_flow_key(fl4, sk);
588 }
589 
590 static DEFINE_SPINLOCK(fnhe_lock);
591 
592 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
593 {
594 	struct rtable *rt;
595 
596 	rt = rcu_dereference(fnhe->fnhe_rth_input);
597 	if (rt) {
598 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
599 		dst_dev_put(&rt->dst);
600 		dst_release(&rt->dst);
601 	}
602 	rt = rcu_dereference(fnhe->fnhe_rth_output);
603 	if (rt) {
604 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
605 		dst_dev_put(&rt->dst);
606 		dst_release(&rt->dst);
607 	}
608 }
609 
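/* Return the entry with the oldest fnhe_stamp in this bucket so it can be
 * reused; any routes cached in it are flushed first.
 */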
610 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
611 {
612 	struct fib_nh_exception *fnhe, *oldest;
613 
614 	oldest = rcu_dereference(hash->chain);
615 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
616 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
617 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
618 			oldest = fnhe;
619 	}
620 	fnhe_flush_routes(oldest);
621 	return oldest;
622 }
623 
624 static inline u32 fnhe_hashfun(__be32 daddr)
625 {
626 	static u32 fnhe_hashrnd __read_mostly;
627 	u32 hval;
628 
629 	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
630 	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
631 	return hash_32(hval, FNHE_HASH_SHIFT);
632 }
633 
634 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
635 {
636 	rt->rt_pmtu = fnhe->fnhe_pmtu;
637 	rt->dst.expires = fnhe->fnhe_expires;
638 
639 	if (fnhe->fnhe_gw) {
640 		rt->rt_flags |= RTCF_REDIRECTED;
641 		rt->rt_gateway = fnhe->fnhe_gw;
642 		rt->rt_uses_gateway = 1;
643 	}
644 }
645 
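/* Create or refresh the nexthop exception for daddr under fnhe_lock,
 * recording a redirect gateway and/or learned PMTU together with its
 * expiry.  Cached routes are updated in place for an existing entry, or
 * marked DST_OBSOLETE_KILL for a new one so they get re-validated.
 */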
646 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
647 				  u32 pmtu, unsigned long expires)
648 {
649 	struct fnhe_hash_bucket *hash;
650 	struct fib_nh_exception *fnhe;
651 	struct rtable *rt;
652 	u32 genid, hval;
653 	unsigned int i;
654 	int depth;
655 
656 	genid = fnhe_genid(dev_net(nh->nh_dev));
657 	hval = fnhe_hashfun(daddr);
658 
659 	spin_lock_bh(&fnhe_lock);
660 
661 	hash = rcu_dereference(nh->nh_exceptions);
662 	if (!hash) {
663 		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
664 		if (!hash)
665 			goto out_unlock;
666 		rcu_assign_pointer(nh->nh_exceptions, hash);
667 	}
668 
669 	hash += hval;
670 
671 	depth = 0;
672 	for (fnhe = rcu_dereference(hash->chain); fnhe;
673 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
674 		if (fnhe->fnhe_daddr == daddr)
675 			break;
676 		depth++;
677 	}
678 
679 	if (fnhe) {
680 		if (fnhe->fnhe_genid != genid)
681 			fnhe->fnhe_genid = genid;
682 		if (gw)
683 			fnhe->fnhe_gw = gw;
684 		if (pmtu)
685 			fnhe->fnhe_pmtu = pmtu;
686 		fnhe->fnhe_expires = max(1UL, expires);
687 		/* Update all cached dsts too */
688 		rt = rcu_dereference(fnhe->fnhe_rth_input);
689 		if (rt)
690 			fill_route_from_fnhe(rt, fnhe);
691 		rt = rcu_dereference(fnhe->fnhe_rth_output);
692 		if (rt)
693 			fill_route_from_fnhe(rt, fnhe);
694 	} else {
695 		if (depth > FNHE_RECLAIM_DEPTH)
696 			fnhe = fnhe_oldest(hash);
697 		else {
698 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
699 			if (!fnhe)
700 				goto out_unlock;
701 
702 			fnhe->fnhe_next = hash->chain;
703 			rcu_assign_pointer(hash->chain, fnhe);
704 		}
705 		fnhe->fnhe_genid = genid;
706 		fnhe->fnhe_daddr = daddr;
707 		fnhe->fnhe_gw = gw;
708 		fnhe->fnhe_pmtu = pmtu;
709 		fnhe->fnhe_expires = expires;
710 
711 		/* Exception created; mark the cached routes for the nexthop
712 		 * stale, so anyone caching them rechecks whether this exception
713 		 * applies to them.
714 		 */
715 		rt = rcu_dereference(nh->nh_rth_input);
716 		if (rt)
717 			rt->dst.obsolete = DST_OBSOLETE_KILL;
718 
719 		for_each_possible_cpu(i) {
720 			struct rtable __rcu **prt;
721 			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
722 			rt = rcu_dereference(*prt);
723 			if (rt)
724 				rt->dst.obsolete = DST_OBSOLETE_KILL;
725 		}
726 	}
727 
728 	fnhe->fnhe_stamp = jiffies;
729 
730 out_unlock:
731 	spin_unlock_bh(&fnhe_lock);
732 }
733 
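/* Handle an ICMP redirect for this route: accept the new gateway only if
 * the message came from the current gateway and the new address is a sane
 * on-link unicast address, then record it as a nexthop exception and, if
 * kill_route, mark the cached dst obsolete.
 */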
734 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
735 			     bool kill_route)
736 {
737 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
738 	__be32 old_gw = ip_hdr(skb)->saddr;
739 	struct net_device *dev = skb->dev;
740 	struct in_device *in_dev;
741 	struct fib_result res;
742 	struct neighbour *n;
743 	struct net *net;
744 
745 	switch (icmp_hdr(skb)->code & 7) {
746 	case ICMP_REDIR_NET:
747 	case ICMP_REDIR_NETTOS:
748 	case ICMP_REDIR_HOST:
749 	case ICMP_REDIR_HOSTTOS:
750 		break;
751 
752 	default:
753 		return;
754 	}
755 
756 	if (rt->rt_gateway != old_gw)
757 		return;
758 
759 	in_dev = __in_dev_get_rcu(dev);
760 	if (!in_dev)
761 		return;
762 
763 	net = dev_net(dev);
764 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
765 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
766 	    ipv4_is_zeronet(new_gw))
767 		goto reject_redirect;
768 
769 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
770 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
771 			goto reject_redirect;
772 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
773 			goto reject_redirect;
774 	} else {
775 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
776 			goto reject_redirect;
777 	}
778 
779 	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
780 	if (!n)
781 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
782 	if (!IS_ERR(n)) {
783 		if (!(n->nud_state & NUD_VALID)) {
784 			neigh_event_send(n, NULL);
785 		} else {
786 			if (fib_lookup(net, fl4, &res, 0) == 0) {
787 				struct fib_nh *nh = &FIB_RES_NH(res);
788 
789 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
790 						0, jiffies + ip_rt_gc_timeout);
791 			}
792 			if (kill_route)
793 				rt->dst.obsolete = DST_OBSOLETE_KILL;
794 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
795 		}
796 		neigh_release(n);
797 	}
798 	return;
799 
800 reject_redirect:
801 #ifdef CONFIG_IP_ROUTE_VERBOSE
802 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
803 		const struct iphdr *iph = (const struct iphdr *) skb->data;
804 		__be32 daddr = iph->daddr;
805 		__be32 saddr = iph->saddr;
806 
807 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
808 				     "  Advised path = %pI4 -> %pI4\n",
809 				     &old_gw, dev->name, &new_gw,
810 				     &saddr, &daddr);
811 	}
812 #endif
813 	;
814 }
815 
816 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
817 {
818 	struct rtable *rt;
819 	struct flowi4 fl4;
820 	const struct iphdr *iph = (const struct iphdr *) skb->data;
821 	struct net *net = dev_net(skb->dev);
822 	int oif = skb->dev->ifindex;
823 	u8 tos = RT_TOS(iph->tos);
824 	u8 prot = iph->protocol;
825 	u32 mark = skb->mark;
826 
827 	rt = (struct rtable *) dst;
828 
829 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
830 	__ip_do_redirect(rt, skb, &fl4, true);
831 }
832 
833 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
834 {
835 	struct rtable *rt = (struct rtable *)dst;
836 	struct dst_entry *ret = dst;
837 
838 	if (rt) {
839 		if (dst->obsolete > 0) {
840 			ip_rt_put(rt);
841 			ret = NULL;
842 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
843 			   rt->dst.expires) {
844 			ip_rt_put(rt);
845 			ret = NULL;
846 		}
847 	}
848 	return ret;
849 }
850 
851 /*
852  * Algorithm:
853  *	1. The first ip_rt_redirect_number redirects are sent
854  *	   with exponential backoff, then we stop sending them altogether,
855  *	   assuming that the host ignores our redirects.
856  *	2. If we did not see packets requiring redirects
857  *	   during ip_rt_redirect_silence, we assume that the host
858  *	   has forgotten the redirected route and we start sending redirects again.
859  *
860  * This algorithm is much cheaper and more intelligent than dumb load limiting
861  * in icmp.c.
862  *
863  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
864  * and "frag. need" (breaks PMTU discovery) in icmp.c.
865  */
866 
867 void ip_rt_send_redirect(struct sk_buff *skb)
868 {
869 	struct rtable *rt = skb_rtable(skb);
870 	struct in_device *in_dev;
871 	struct inet_peer *peer;
872 	struct net *net;
873 	int log_martians;
874 	int vif;
875 
876 	rcu_read_lock();
877 	in_dev = __in_dev_get_rcu(rt->dst.dev);
878 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
879 		rcu_read_unlock();
880 		return;
881 	}
882 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
883 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
884 	rcu_read_unlock();
885 
886 	net = dev_net(rt->dst.dev);
887 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
888 	if (!peer) {
889 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
890 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
891 		return;
892 	}
893 
894 	/* No redirected packets during ip_rt_redirect_silence;
895 	 * reset the algorithm.
896 	 */
897 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
898 		peer->rate_tokens = 0;
899 
900 	/* Too many ignored redirects; do not send anything,
901 	 * just record in peer->rate_last when the last one was seen.
902 	 */
903 	if (peer->rate_tokens >= ip_rt_redirect_number) {
904 		peer->rate_last = jiffies;
905 		goto out_put_peer;
906 	}
907 
908 	/* Check for load limit; set rate_last to the latest sent
909 	 * redirect.
910 	 */
911 	if (peer->rate_tokens == 0 ||
912 	    time_after(jiffies,
913 		       (peer->rate_last +
914 			(ip_rt_redirect_load << peer->rate_tokens)))) {
915 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
916 
917 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
918 		peer->rate_last = jiffies;
919 		++peer->rate_tokens;
920 #ifdef CONFIG_IP_ROUTE_VERBOSE
921 		if (log_martians &&
922 		    peer->rate_tokens == ip_rt_redirect_number)
923 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
924 					     &ip_hdr(skb)->saddr, inet_iif(skb),
925 					     &ip_hdr(skb)->daddr, &gw);
926 #endif
927 	}
928 out_put_peer:
929 	inet_putpeer(peer);
930 }
931 
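/* dst input handler used for error routes: account the failure, map
 * dst->error to an ICMP code and, subject to the inetpeer token bucket
 * (ip_rt_error_cost/ip_rt_error_burst), send a destination unreachable
 * message before dropping the packet.
 */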
932 static int ip_error(struct sk_buff *skb)
933 {
934 	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
935 	struct rtable *rt = skb_rtable(skb);
936 	struct inet_peer *peer;
937 	unsigned long now;
938 	struct net *net;
939 	bool send;
940 	int code;
941 
942 	/* IP on this device is disabled. */
943 	if (!in_dev)
944 		goto out;
945 
946 	net = dev_net(rt->dst.dev);
947 	if (!IN_DEV_FORWARD(in_dev)) {
948 		switch (rt->dst.error) {
949 		case EHOSTUNREACH:
950 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
951 			break;
952 
953 		case ENETUNREACH:
954 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
955 			break;
956 		}
957 		goto out;
958 	}
959 
960 	switch (rt->dst.error) {
961 	case EINVAL:
962 	default:
963 		goto out;
964 	case EHOSTUNREACH:
965 		code = ICMP_HOST_UNREACH;
966 		break;
967 	case ENETUNREACH:
968 		code = ICMP_NET_UNREACH;
969 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
970 		break;
971 	case EACCES:
972 		code = ICMP_PKT_FILTERED;
973 		break;
974 	}
975 
976 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
977 			       l3mdev_master_ifindex(skb->dev), 1);
978 
979 	send = true;
980 	if (peer) {
981 		now = jiffies;
982 		peer->rate_tokens += now - peer->rate_last;
983 		if (peer->rate_tokens > ip_rt_error_burst)
984 			peer->rate_tokens = ip_rt_error_burst;
985 		peer->rate_last = now;
986 		if (peer->rate_tokens >= ip_rt_error_cost)
987 			peer->rate_tokens -= ip_rt_error_cost;
988 		else
989 			send = false;
990 		inet_putpeer(peer);
991 	}
992 	if (send)
993 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
994 
995 out:	kfree_skb(skb);
996 	return 0;
997 }
998 
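/* Record a learned path MTU for this flow as a nexthop exception that
 * expires after ip_rt_mtu_expires.  Updates are ignored when the MTU
 * metric is locked or the new value would not shrink the current MTU;
 * accepted values are clamped up to ip_rt_min_pmtu.
 */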
999 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1000 {
1001 	struct dst_entry *dst = &rt->dst;
1002 	struct fib_result res;
1003 
1004 	if (dst_metric_locked(dst, RTAX_MTU))
1005 		return;
1006 
1007 	if (ipv4_mtu(dst) < mtu)
1008 		return;
1009 
1010 	if (mtu < ip_rt_min_pmtu)
1011 		mtu = ip_rt_min_pmtu;
1012 
1013 	if (rt->rt_pmtu == mtu &&
1014 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1015 		return;
1016 
1017 	rcu_read_lock();
1018 	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1019 		struct fib_nh *nh = &FIB_RES_NH(res);
1020 
1021 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1022 				      jiffies + ip_rt_mtu_expires);
1023 	}
1024 	rcu_read_unlock();
1025 }
1026 
1027 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1028 			      struct sk_buff *skb, u32 mtu)
1029 {
1030 	struct rtable *rt = (struct rtable *) dst;
1031 	struct flowi4 fl4;
1032 
1033 	ip_rt_build_flow_key(&fl4, sk, skb);
1034 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1035 }
1036 
1037 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1038 		      int oif, u32 mark, u8 protocol, int flow_flags)
1039 {
1040 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1041 	struct flowi4 fl4;
1042 	struct rtable *rt;
1043 
1044 	if (!mark)
1045 		mark = IP4_REPLY_MARK(net, skb->mark);
1046 
1047 	__build_flow_key(net, &fl4, NULL, iph, oif,
1048 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1049 	rt = __ip_route_output_key(net, &fl4);
1050 	if (!IS_ERR(rt)) {
1051 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1052 		ip_rt_put(rt);
1053 	}
1054 }
1055 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1056 
1057 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1058 {
1059 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1060 	struct flowi4 fl4;
1061 	struct rtable *rt;
1062 
1063 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1064 
1065 	if (!fl4.flowi4_mark)
1066 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1067 
1068 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1069 	if (!IS_ERR(rt)) {
1070 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1071 		ip_rt_put(rt);
1072 	}
1073 }
1074 
1075 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1076 {
1077 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1078 	struct flowi4 fl4;
1079 	struct rtable *rt;
1080 	struct dst_entry *odst = NULL;
1081 	bool new = false;
1082 	struct net *net = sock_net(sk);
1083 
1084 	bh_lock_sock(sk);
1085 
1086 	if (!ip_sk_accept_pmtu(sk))
1087 		goto out;
1088 
1089 	odst = sk_dst_get(sk);
1090 
1091 	if (sock_owned_by_user(sk) || !odst) {
1092 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1093 		goto out;
1094 	}
1095 
1096 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1097 
1098 	rt = (struct rtable *)odst;
1099 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1100 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1101 		if (IS_ERR(rt))
1102 			goto out;
1103 
1104 		new = true;
1105 	}
1106 
1107 	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1108 
1109 	if (!dst_check(&rt->dst, 0)) {
1110 		if (new)
1111 			dst_release(&rt->dst);
1112 
1113 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1114 		if (IS_ERR(rt))
1115 			goto out;
1116 
1117 		new = true;
1118 	}
1119 
1120 	if (new)
1121 		sk_dst_set(sk, &rt->dst);
1122 
1123 out:
1124 	bh_unlock_sock(sk);
1125 	dst_release(odst);
1126 }
1127 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1128 
1129 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1130 		   int oif, u32 mark, u8 protocol, int flow_flags)
1131 {
1132 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1133 	struct flowi4 fl4;
1134 	struct rtable *rt;
1135 
1136 	__build_flow_key(net, &fl4, NULL, iph, oif,
1137 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1138 	rt = __ip_route_output_key(net, &fl4);
1139 	if (!IS_ERR(rt)) {
1140 		__ip_do_redirect(rt, skb, &fl4, false);
1141 		ip_rt_put(rt);
1142 	}
1143 }
1144 EXPORT_SYMBOL_GPL(ipv4_redirect);
1145 
1146 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1147 {
1148 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1149 	struct flowi4 fl4;
1150 	struct rtable *rt;
1151 	struct net *net = sock_net(sk);
1152 
1153 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1154 	rt = __ip_route_output_key(net, &fl4);
1155 	if (!IS_ERR(rt)) {
1156 		__ip_do_redirect(rt, skb, &fl4, false);
1157 		ip_rt_put(rt);
1158 	}
1159 }
1160 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1161 
1162 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1163 {
1164 	struct rtable *rt = (struct rtable *) dst;
1165 
1166 	/* All IPV4 dsts are created with ->obsolete set to the value
1167 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1168 	 * into this function always.
1169 	 *
1170 	 * When a PMTU/redirect information update invalidates a route,
1171 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1172 	 * DST_OBSOLETE_DEAD by dst_free().
1173 	 */
1174 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1175 		return NULL;
1176 	return dst;
1177 }
1178 
1179 static void ipv4_link_failure(struct sk_buff *skb)
1180 {
1181 	struct rtable *rt;
1182 
1183 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1184 
1185 	rt = skb_rtable(skb);
1186 	if (rt)
1187 		dst_set_expires(&rt->dst, 0);
1188 }
1189 
1190 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1191 {
1192 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1193 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1194 		 skb->dev ? skb->dev->name : "?");
1195 	kfree_skb(skb);
1196 	WARN_ON(1);
1197 	return 0;
1198 }
1199 
1200 /*
1201    We do not cache the source address of the outgoing interface,
1202    because it is used only by the IP RR, TS and SRR options,
1203    so it is out of the fast path.
1204 
1205    BTW remember: "addr" is allowed to be unaligned
1206    in IP options!
1207  */
1208 
1209 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1210 {
1211 	__be32 src;
1212 
1213 	if (rt_is_output_route(rt))
1214 		src = ip_hdr(skb)->saddr;
1215 	else {
1216 		struct fib_result res;
1217 		struct flowi4 fl4;
1218 		struct iphdr *iph;
1219 
1220 		iph = ip_hdr(skb);
1221 
1222 		memset(&fl4, 0, sizeof(fl4));
1223 		fl4.daddr = iph->daddr;
1224 		fl4.saddr = iph->saddr;
1225 		fl4.flowi4_tos = RT_TOS(iph->tos);
1226 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1227 		fl4.flowi4_iif = skb->dev->ifindex;
1228 		fl4.flowi4_mark = skb->mark;
1229 
1230 		rcu_read_lock();
1231 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1232 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1233 		else
1234 			src = inet_select_addr(rt->dst.dev,
1235 					       rt_nexthop(rt, iph->daddr),
1236 					       RT_SCOPE_UNIVERSE);
1237 		rcu_read_unlock();
1238 	}
1239 	memcpy(addr, &src, 4);
1240 }
1241 
1242 #ifdef CONFIG_IP_ROUTE_CLASSID
1243 static void set_class_tag(struct rtable *rt, u32 tag)
1244 {
1245 	if (!(rt->dst.tclassid & 0xFFFF))
1246 		rt->dst.tclassid |= tag & 0xFFFF;
1247 	if (!(rt->dst.tclassid & 0xFFFF0000))
1248 		rt->dst.tclassid |= tag & 0xFFFF0000;
1249 }
1250 #endif
1251 
1252 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1253 {
1254 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1255 	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1256 				    ip_rt_min_advmss);
1257 
1258 	return min(advmss, IPV4_MAX_PMTU - header_size);
1259 }
1260 
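/* Effective MTU for this dst: a still-valid learned rt_pmtu wins, then the
 * route's RTAX_MTU metric, then the device MTU (clamped to 576 for locked
 * routes that use a gateway).  The result is capped at IP_MAX_MTU and
 * reduced by any lwtunnel encapsulation headroom.
 */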
1261 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1262 {
1263 	const struct rtable *rt = (const struct rtable *) dst;
1264 	unsigned int mtu = rt->rt_pmtu;
1265 
1266 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1267 		mtu = dst_metric_raw(dst, RTAX_MTU);
1268 
1269 	if (mtu)
1270 		return mtu;
1271 
1272 	mtu = READ_ONCE(dst->dev->mtu);
1273 
1274 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1275 		if (rt->rt_uses_gateway && mtu > 576)
1276 			mtu = 576;
1277 	}
1278 
1279 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1280 
1281 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1282 }
1283 
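/* Look up the nexthop exception recorded for daddr, if any.  Runs under
 * RCU; returns NULL when the nexthop has no exception hash or no entry
 * matches.
 */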
1284 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1285 {
1286 	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1287 	struct fib_nh_exception *fnhe;
1288 	u32 hval;
1289 
1290 	if (!hash)
1291 		return NULL;
1292 
1293 	hval = fnhe_hashfun(daddr);
1294 
1295 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1296 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1297 		if (fnhe->fnhe_daddr == daddr)
1298 			return fnhe;
1299 	}
1300 	return NULL;
1301 }
1302 
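/* Bind a new route to its nexthop exception: reset the entry if its genid
 * is stale, copy gateway/PMTU/expiry into the route and, when do_cache is
 * set, publish the route in fnhe_rth_input/output, releasing the one it
 * replaces.  Returns true only if the route was cached.
 */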
1303 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1304 			      __be32 daddr, const bool do_cache)
1305 {
1306 	bool ret = false;
1307 
1308 	spin_lock_bh(&fnhe_lock);
1309 
1310 	if (daddr == fnhe->fnhe_daddr) {
1311 		struct rtable __rcu **porig;
1312 		struct rtable *orig;
1313 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1314 
1315 		if (rt_is_input_route(rt))
1316 			porig = &fnhe->fnhe_rth_input;
1317 		else
1318 			porig = &fnhe->fnhe_rth_output;
1319 		orig = rcu_dereference(*porig);
1320 
1321 		if (fnhe->fnhe_genid != genid) {
1322 			fnhe->fnhe_genid = genid;
1323 			fnhe->fnhe_gw = 0;
1324 			fnhe->fnhe_pmtu = 0;
1325 			fnhe->fnhe_expires = 0;
1326 			fnhe_flush_routes(fnhe);
1327 			orig = NULL;
1328 		}
1329 		fill_route_from_fnhe(rt, fnhe);
1330 		if (!rt->rt_gateway)
1331 			rt->rt_gateway = daddr;
1332 
1333 		if (do_cache) {
1334 			dst_hold(&rt->dst);
1335 			rcu_assign_pointer(*porig, rt);
1336 			if (orig) {
1337 				dst_dev_put(&orig->dst);
1338 				dst_release(&orig->dst);
1339 			}
1340 			ret = true;
1341 		}
1342 
1343 		fnhe->fnhe_stamp = jiffies;
1344 	}
1345 	spin_unlock_bh(&fnhe_lock);
1346 
1347 	return ret;
1348 }
1349 
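/* Try to cache the route on the nexthop: input routes go to nh_rth_input,
 * output routes to this CPU's nh_pcpu_rth_output slot.  A cmpxchg()
 * installs the new route and releases the one it displaces; returns false
 * if we lost the race.
 */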
1350 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1351 {
1352 	struct rtable *orig, *prev, **p;
1353 	bool ret = true;
1354 
1355 	if (rt_is_input_route(rt)) {
1356 		p = (struct rtable **)&nh->nh_rth_input;
1357 	} else {
1358 		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1359 	}
1360 	orig = *p;
1361 
1362 	/* hold dst before doing cmpxchg() to avoid race condition
1363 	 * on this dst
1364 	 */
1365 	dst_hold(&rt->dst);
1366 	prev = cmpxchg(p, orig, rt);
1367 	if (prev == orig) {
1368 		if (orig) {
1369 			dst_dev_put(&orig->dst);
1370 			dst_release(&orig->dst);
1371 		}
1372 	} else {
1373 		dst_release(&rt->dst);
1374 		ret = false;
1375 	}
1376 
1377 	return ret;
1378 }
1379 
1380 struct uncached_list {
1381 	spinlock_t		lock;
1382 	struct list_head	head;
1383 };
1384 
1385 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1386 
1387 static void rt_add_uncached_list(struct rtable *rt)
1388 {
1389 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1390 
1391 	rt->rt_uncached_list = ul;
1392 
1393 	spin_lock_bh(&ul->lock);
1394 	list_add_tail(&rt->rt_uncached, &ul->head);
1395 	spin_unlock_bh(&ul->lock);
1396 }
1397 
1398 static void ipv4_dst_destroy(struct dst_entry *dst)
1399 {
1400 	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1401 	struct rtable *rt = (struct rtable *) dst;
1402 
1403 	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1404 		kfree(p);
1405 
1406 	if (!list_empty(&rt->rt_uncached)) {
1407 		struct uncached_list *ul = rt->rt_uncached_list;
1408 
1409 		spin_lock_bh(&ul->lock);
1410 		list_del(&rt->rt_uncached);
1411 		spin_unlock_bh(&ul->lock);
1412 	}
1413 }
1414 
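/* Walk the per-cpu uncached lists and re-point any route still using the
 * given device at the namespace's loopback device, transferring the device
 * reference so references to the original device can be dropped.
 */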
1415 void rt_flush_dev(struct net_device *dev)
1416 {
1417 	struct net *net = dev_net(dev);
1418 	struct rtable *rt;
1419 	int cpu;
1420 
1421 	for_each_possible_cpu(cpu) {
1422 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1423 
1424 		spin_lock_bh(&ul->lock);
1425 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1426 			if (rt->dst.dev != dev)
1427 				continue;
1428 			rt->dst.dev = net->loopback_dev;
1429 			dev_hold(rt->dst.dev);
1430 			dev_put(dev);
1431 		}
1432 		spin_unlock_bh(&ul->lock);
1433 	}
1434 }
1435 
1436 static bool rt_cache_valid(const struct rtable *rt)
1437 {
1438 	return	rt &&
1439 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1440 		!rt_is_expired(rt);
1441 }
1442 
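/* Fill in the nexthop-derived fields of a freshly allocated route
 * (gateway, metrics, classid, lwtunnel state) and try to cache it, either
 * in the matching exception entry or on the nexthop itself.  Routes that
 * end up uncached are put on the per-cpu uncached list instead.
 */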
1443 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1444 			   const struct fib_result *res,
1445 			   struct fib_nh_exception *fnhe,
1446 			   struct fib_info *fi, u16 type, u32 itag,
1447 			   const bool do_cache)
1448 {
1449 	bool cached = false;
1450 
1451 	if (fi) {
1452 		struct fib_nh *nh = &FIB_RES_NH(*res);
1453 
1454 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1455 			rt->rt_gateway = nh->nh_gw;
1456 			rt->rt_uses_gateway = 1;
1457 		}
1458 		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1459 		if (fi->fib_metrics != &dst_default_metrics) {
1460 			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1461 			refcount_inc(&fi->fib_metrics->refcnt);
1462 		}
1463 #ifdef CONFIG_IP_ROUTE_CLASSID
1464 		rt->dst.tclassid = nh->nh_tclassid;
1465 #endif
1466 		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1467 		if (unlikely(fnhe))
1468 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1469 		else if (do_cache)
1470 			cached = rt_cache_route(nh, rt);
1471 		if (unlikely(!cached)) {
1472 			/* Routes we intend to cache in nexthop exception or
1473 			 * FIB nexthop have the DST_NOCACHE bit clear.
1474 			 * However, if we are unsuccessful at storing this
1475 			 * route into the cache we really need to set it.
1476 			 */
1477 			if (!rt->rt_gateway)
1478 				rt->rt_gateway = daddr;
1479 			rt_add_uncached_list(rt);
1480 		}
1481 	} else
1482 		rt_add_uncached_list(rt);
1483 
1484 #ifdef CONFIG_IP_ROUTE_CLASSID
1485 #ifdef CONFIG_IP_MULTIPLE_TABLES
1486 	set_class_tag(rt, res->tclassid);
1487 #endif
1488 	set_class_tag(rt, itag);
1489 #endif
1490 }
1491 
1492 struct rtable *rt_dst_alloc(struct net_device *dev,
1493 			    unsigned int flags, u16 type,
1494 			    bool nopolicy, bool noxfrm, bool will_cache)
1495 {
1496 	struct rtable *rt;
1497 
1498 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1499 		       (will_cache ? 0 : DST_HOST) |
1500 		       (nopolicy ? DST_NOPOLICY : 0) |
1501 		       (noxfrm ? DST_NOXFRM : 0));
1502 
1503 	if (rt) {
1504 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1505 		rt->rt_flags = flags;
1506 		rt->rt_type = type;
1507 		rt->rt_is_input = 0;
1508 		rt->rt_iif = 0;
1509 		rt->rt_pmtu = 0;
1510 		rt->rt_gateway = 0;
1511 		rt->rt_uses_gateway = 0;
1512 		INIT_LIST_HEAD(&rt->rt_uncached);
1513 
1514 		rt->dst.output = ip_output;
1515 		if (flags & RTCF_LOCAL)
1516 			rt->dst.input = ip_local_deliver;
1517 	}
1518 
1519 	return rt;
1520 }
1521 EXPORT_SYMBOL(rt_dst_alloc);
1522 
1523 /* called in rcu_read_lock() section */
1524 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1525 			  u8 tos, struct net_device *dev,
1526 			  struct in_device *in_dev, u32 *itag)
1527 {
1528 	int err;
1529 
1530 	/* Primary sanity checks. */
1531 	if (!in_dev)
1532 		return -EINVAL;
1533 
1534 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1535 	    skb->protocol != htons(ETH_P_IP))
1536 		return -EINVAL;
1537 
1538 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1539 		return -EINVAL;
1540 
1541 	if (ipv4_is_zeronet(saddr)) {
1542 		if (!ipv4_is_local_multicast(daddr))
1543 			return -EINVAL;
1544 	} else {
1545 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1546 					  in_dev, itag);
1547 		if (err < 0)
1548 			return err;
1549 	}
1550 	return 0;
1551 }
1552 
1553 /* called in rcu_read_lock() section */
1554 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1555 			     u8 tos, struct net_device *dev, int our)
1556 {
1557 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1558 	unsigned int flags = RTCF_MULTICAST;
1559 	struct rtable *rth;
1560 	u32 itag = 0;
1561 	int err;
1562 
1563 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1564 	if (err)
1565 		return err;
1566 
1567 	if (our)
1568 		flags |= RTCF_LOCAL;
1569 
1570 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1571 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1572 	if (!rth)
1573 		return -ENOBUFS;
1574 
1575 #ifdef CONFIG_IP_ROUTE_CLASSID
1576 	rth->dst.tclassid = itag;
1577 #endif
1578 	rth->dst.output = ip_rt_bug;
1579 	rth->rt_is_input = 1;
1580 
1581 #ifdef CONFIG_IP_MROUTE
1582 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1583 		rth->dst.input = ip_mr_input;
1584 #endif
1585 	RT_CACHE_STAT_INC(in_slow_mc);
1586 
1587 	skb_dst_set(skb, &rth->dst);
1588 	return 0;
1589 }
1590 
1591 
1592 static void ip_handle_martian_source(struct net_device *dev,
1593 				     struct in_device *in_dev,
1594 				     struct sk_buff *skb,
1595 				     __be32 daddr,
1596 				     __be32 saddr)
1597 {
1598 	RT_CACHE_STAT_INC(in_martian_src);
1599 #ifdef CONFIG_IP_ROUTE_VERBOSE
1600 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1601 		/*
1602 		 *	RFC1812 recommendation: if the source is martian,
1603 		 *	the only hint is the MAC header.
1604 		 */
1605 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1606 			&daddr, &saddr, dev->name);
1607 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1608 			print_hex_dump(KERN_WARNING, "ll header: ",
1609 				       DUMP_PREFIX_OFFSET, 16, 1,
1610 				       skb_mac_header(skb),
1611 				       dev->hard_header_len, true);
1612 		}
1613 	}
1614 #endif
1615 }
1616 
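/* Unlink and free the exception entry for daddr from the nexthop's hash
 * under fnhe_lock, flushing any routes cached in it.  Used when a cached
 * exception route is found to have expired.
 */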
1617 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1618 {
1619 	struct fnhe_hash_bucket *hash;
1620 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1621 	u32 hval = fnhe_hashfun(daddr);
1622 
1623 	spin_lock_bh(&fnhe_lock);
1624 
1625 	hash = rcu_dereference_protected(nh->nh_exceptions,
1626 					 lockdep_is_held(&fnhe_lock));
1627 	hash += hval;
1628 
1629 	fnhe_p = &hash->chain;
1630 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1631 	while (fnhe) {
1632 		if (fnhe->fnhe_daddr == daddr) {
1633 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1634 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1635 			fnhe_flush_routes(fnhe);
1636 			kfree_rcu(fnhe, rcu);
1637 			break;
1638 		}
1639 		fnhe_p = &fnhe->fnhe_next;
1640 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1641 						 lockdep_is_held(&fnhe_lock));
1642 	}
1643 
1644 	spin_unlock_bh(&fnhe_lock);
1645 }
1646 
1647 /* called in rcu_read_lock() section */
1648 static int __mkroute_input(struct sk_buff *skb,
1649 			   const struct fib_result *res,
1650 			   struct in_device *in_dev,
1651 			   __be32 daddr, __be32 saddr, u32 tos)
1652 {
1653 	struct fib_nh_exception *fnhe;
1654 	struct rtable *rth;
1655 	int err;
1656 	struct in_device *out_dev;
1657 	bool do_cache;
1658 	u32 itag = 0;
1659 
1660 	/* get a working reference to the output device */
1661 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1662 	if (!out_dev) {
1663 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1664 		return -EINVAL;
1665 	}
1666 
1667 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1668 				  in_dev->dev, in_dev, &itag);
1669 	if (err < 0) {
1670 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1671 					 saddr);
1672 
1673 		goto cleanup;
1674 	}
1675 
1676 	do_cache = res->fi && !itag;
1677 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1678 	    skb->protocol == htons(ETH_P_IP) &&
1679 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1680 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1681 		IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1682 
1683 	if (skb->protocol != htons(ETH_P_IP)) {
1684 		/* Not IP (i.e. ARP). Do not create a route if it is
1685 		 * invalid for proxy arp. DNAT routes are always valid.
1686 		 *
1687 		 * The proxy arp feature has been extended to allow ARP
1688 		 * replies back on the same interface, to support
1689 		 * private VLAN switch technologies. See arp.c.
1690 		 */
1691 		if (out_dev == in_dev &&
1692 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1693 			err = -EINVAL;
1694 			goto cleanup;
1695 		}
1696 	}
1697 
1698 	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1699 	if (do_cache) {
1700 		if (fnhe) {
1701 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1702 			if (rth && rth->dst.expires &&
1703 			    time_after(jiffies, rth->dst.expires)) {
1704 				ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1705 				fnhe = NULL;
1706 			} else {
1707 				goto rt_cache;
1708 			}
1709 		}
1710 
1711 		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1712 
1713 rt_cache:
1714 		if (rt_cache_valid(rth)) {
1715 			skb_dst_set_noref(skb, &rth->dst);
1716 			goto out;
1717 		}
1718 	}
1719 
1720 	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1721 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1722 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1723 	if (!rth) {
1724 		err = -ENOBUFS;
1725 		goto cleanup;
1726 	}
1727 
1728 	rth->rt_is_input = 1;
1729 	RT_CACHE_STAT_INC(in_slow_tot);
1730 
1731 	rth->dst.input = ip_forward;
1732 
1733 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1734 		       do_cache);
1735 	lwtunnel_set_redirect(&rth->dst);
1736 	skb_dst_set(skb, &rth->dst);
1737 out:
1738 	err = 0;
1739  cleanup:
1740 	return err;
1741 }
1742 
1743 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1744 /* To make ICMP packets follow the right flow, the multipath hash is
1745  * calculated from the inner IP addresses.
1746  */
1747 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1748 				 struct flow_keys *hash_keys)
1749 {
1750 	const struct iphdr *outer_iph = ip_hdr(skb);
1751 	const struct iphdr *inner_iph;
1752 	const struct icmphdr *icmph;
1753 	struct iphdr _inner_iph;
1754 	struct icmphdr _icmph;
1755 
1756 	hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1757 	hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1758 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1759 		return;
1760 
1761 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1762 		return;
1763 
1764 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1765 				   &_icmph);
1766 	if (!icmph)
1767 		return;
1768 
1769 	if (icmph->type != ICMP_DEST_UNREACH &&
1770 	    icmph->type != ICMP_REDIRECT &&
1771 	    icmph->type != ICMP_TIME_EXCEEDED &&
1772 	    icmph->type != ICMP_PARAMETERPROB)
1773 		return;
1774 
1775 	inner_iph = skb_header_pointer(skb,
1776 				       outer_iph->ihl * 4 + sizeof(_icmph),
1777 				       sizeof(_inner_iph), &_inner_iph);
1778 	if (!inner_iph)
1779 		return;
1780 	hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1781 	hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1782 }
1783 
1784 /* if skb is set it will be used and fl4 can be NULL */
1785 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1786 		       const struct sk_buff *skb)
1787 {
1788 	struct net *net = fi->fib_net;
1789 	struct flow_keys hash_keys;
1790 	u32 mhash;
1791 
1792 	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1793 	case 0:
1794 		memset(&hash_keys, 0, sizeof(hash_keys));
1795 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1796 		if (skb) {
1797 			ip_multipath_l3_keys(skb, &hash_keys);
1798 		} else {
1799 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1800 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1801 		}
1802 		break;
1803 	case 1:
1804 		/* skb is currently provided only when forwarding */
1805 		if (skb) {
1806 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1807 			struct flow_keys keys;
1808 
1809 			/* short-circuit if we already have L4 hash present */
1810 			if (skb->l4_hash)
1811 				return skb_get_hash_raw(skb) >> 1;
1812 			memset(&hash_keys, 0, sizeof(hash_keys));
1813 			skb_flow_dissect_flow_keys(skb, &keys, flag);
1814 
1815 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1816 			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1817 			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1818 			hash_keys.ports.src = keys.ports.src;
1819 			hash_keys.ports.dst = keys.ports.dst;
1820 			hash_keys.basic.ip_proto = keys.basic.ip_proto;
1821 		} else {
1822 			memset(&hash_keys, 0, sizeof(hash_keys));
1823 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1824 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1825 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1826 			hash_keys.ports.src = fl4->fl4_sport;
1827 			hash_keys.ports.dst = fl4->fl4_dport;
1828 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1829 		}
1830 		break;
1831 	}
1832 	mhash = flow_hash_from_keys(&hash_keys);
1833 
1834 	return mhash >> 1;
1835 }
1836 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1837 
1838 static int ip_mkroute_input(struct sk_buff *skb,
1839 			    struct fib_result *res,
1840 			    struct in_device *in_dev,
1841 			    __be32 daddr, __be32 saddr, u32 tos)
1842 {
1843 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1844 	if (res->fi && res->fi->fib_nhs > 1) {
1845 		int h = fib_multipath_hash(res->fi, NULL, skb);
1846 
1847 		fib_select_multipath(res, h);
1848 	}
1849 #endif
1850 
1851 	/* create a routing cache entry */
1852 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1853 }
1854 
1855 /*
1856  *	NOTE. We drop all packets that have local source
1857  *	addresses, because every properly looped-back packet
1858  *	must already have the correct destination attached by the output routine.
1859  *
1860  *	This approach solves two big problems:
1861  *	1. Non-simplex devices are handled properly.
1862  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1863  *	Called with rcu_read_lock().
1864  */
1865 
1866 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1867 			       u8 tos, struct net_device *dev,
1868 			       struct fib_result *res)
1869 {
1870 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1871 	struct ip_tunnel_info *tun_info;
1872 	struct flowi4	fl4;
1873 	unsigned int	flags = 0;
1874 	u32		itag = 0;
1875 	struct rtable	*rth;
1876 	int		err = -EINVAL;
1877 	struct net    *net = dev_net(dev);
1878 	bool do_cache;
1879 
1880 	/* IP on this device is disabled. */
1881 
1882 	if (!in_dev)
1883 		goto out;
1884 
1885 	/* Check for the weirdest martians, which may not be detected
1886 	   by fib_lookup.
1887 	 */
1888 
1889 	tun_info = skb_tunnel_info(skb);
1890 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1891 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1892 	else
1893 		fl4.flowi4_tun_key.tun_id = 0;
1894 	skb_dst_drop(skb);
1895 
1896 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1897 		goto martian_source;
1898 
1899 	res->fi = NULL;
1900 	res->table = NULL;
1901 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1902 		goto brd_input;
1903 
1904 	/* Accept zero addresses only for limited broadcast;
1905 	 * it is not clear whether this should be fixed. Waiting for complaints :-)
1906 	 */
1907 	if (ipv4_is_zeronet(saddr))
1908 		goto martian_source;
1909 
1910 	if (ipv4_is_zeronet(daddr))
1911 		goto martian_destination;
1912 
1913 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1914 	 * more than once, and calls it only if daddr and/or saddr is a loopback address.
1915 	 */
1916 	if (ipv4_is_loopback(daddr)) {
1917 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1918 			goto martian_destination;
1919 	} else if (ipv4_is_loopback(saddr)) {
1920 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1921 			goto martian_source;
1922 	}
1923 
1924 	/*
1925 	 *	Now we are ready to route the packet.
1926 	 */
1927 	fl4.flowi4_oif = 0;
1928 	fl4.flowi4_iif = dev->ifindex;
1929 	fl4.flowi4_mark = skb->mark;
1930 	fl4.flowi4_tos = tos;
1931 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1932 	fl4.flowi4_flags = 0;
1933 	fl4.daddr = daddr;
1934 	fl4.saddr = saddr;
1935 	fl4.flowi4_uid = sock_net_uid(net, NULL);
1936 	err = fib_lookup(net, &fl4, res, 0);
1937 	if (err != 0) {
1938 		if (!IN_DEV_FORWARD(in_dev))
1939 			err = -EHOSTUNREACH;
1940 		goto no_route;
1941 	}
1942 
1943 	if (res->type == RTN_BROADCAST)
1944 		goto brd_input;
1945 
1946 	if (res->type == RTN_LOCAL) {
1947 		err = fib_validate_source(skb, saddr, daddr, tos,
1948 					  0, dev, in_dev, &itag);
1949 		if (err < 0)
1950 			goto martian_source;
1951 		goto local_input;
1952 	}
1953 
1954 	if (!IN_DEV_FORWARD(in_dev)) {
1955 		err = -EHOSTUNREACH;
1956 		goto no_route;
1957 	}
1958 	if (res->type != RTN_UNICAST)
1959 		goto martian_destination;
1960 
1961 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1962 out:	return err;
1963 
1964 brd_input:
1965 	if (skb->protocol != htons(ETH_P_IP))
1966 		goto e_inval;
1967 
1968 	if (!ipv4_is_zeronet(saddr)) {
1969 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1970 					  in_dev, &itag);
1971 		if (err < 0)
1972 			goto martian_source;
1973 	}
1974 	flags |= RTCF_BROADCAST;
1975 	res->type = RTN_BROADCAST;
1976 	RT_CACHE_STAT_INC(in_brd);
1977 
1978 local_input:
1979 	do_cache = false;
1980 	if (res->fi) {
1981 		if (!itag) {
1982 			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1983 			if (rt_cache_valid(rth)) {
1984 				skb_dst_set_noref(skb, &rth->dst);
1985 				err = 0;
1986 				goto out;
1987 			}
1988 			do_cache = true;
1989 		}
1990 	}
1991 
1992 	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
1993 			   flags | RTCF_LOCAL, res->type,
1994 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1995 	if (!rth)
1996 		goto e_nobufs;
1997 
1998 	rth->dst.output = ip_rt_bug;
1999 #ifdef CONFIG_IP_ROUTE_CLASSID
2000 	rth->dst.tclassid = itag;
2001 #endif
2002 	rth->rt_is_input = 1;
2003 
2004 	RT_CACHE_STAT_INC(in_slow_tot);
2005 	if (res->type == RTN_UNREACHABLE) {
2006 		rth->dst.input = ip_error;
2007 		rth->dst.error = -err;
2008 		rth->rt_flags &= ~RTCF_LOCAL;
2009 	}
2010 
2011 	if (do_cache) {
2012 		struct fib_nh *nh = &FIB_RES_NH(*res);
2013 
2014 		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2015 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2016 			WARN_ON(rth->dst.input == lwtunnel_input);
2017 			rth->dst.lwtstate->orig_input = rth->dst.input;
2018 			rth->dst.input = lwtunnel_input;
2019 		}
2020 
2021 		if (unlikely(!rt_cache_route(nh, rth)))
2022 			rt_add_uncached_list(rth);
2023 	}
2024 	skb_dst_set(skb, &rth->dst);
2025 	err = 0;
2026 	goto out;
2027 
2028 no_route:
2029 	RT_CACHE_STAT_INC(in_no_route);
2030 	res->type = RTN_UNREACHABLE;
2031 	res->fi = NULL;
2032 	res->table = NULL;
2033 	goto local_input;
2034 
2035 	/*
2036 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2037 	 */
2038 martian_destination:
2039 	RT_CACHE_STAT_INC(in_martian_dst);
2040 #ifdef CONFIG_IP_ROUTE_VERBOSE
2041 	if (IN_DEV_LOG_MARTIANS(in_dev))
2042 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2043 				     &daddr, &saddr, dev->name);
2044 #endif
2045 
2046 e_inval:
2047 	err = -EINVAL;
2048 	goto out;
2049 
2050 e_nobufs:
2051 	err = -ENOBUFS;
2052 	goto out;
2053 
2054 martian_source:
2055 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2056 	goto out;
2057 }
2058 
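/* Public entry point for input route lookups: masks @tos with IPTOS_RT_MASK
 * and runs ip_route_input_rcu() under rcu_read_lock().
 */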
2059 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2060 			 u8 tos, struct net_device *dev)
2061 {
2062 	struct fib_result res;
2063 	int err;
2064 
2065 	tos &= IPTOS_RT_MASK;
2066 	rcu_read_lock();
2067 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2068 	rcu_read_unlock();
2069 
2070 	return err;
2071 }
2072 EXPORT_SYMBOL(ip_route_input_noref);
2073 
2074 /* called with rcu_read_lock held */
2075 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2076 		       u8 tos, struct net_device *dev, struct fib_result *res)
2077 {
2078 	/* Multicast recognition logic has been moved from the route cache to here.
2079 	   The problem was that too many Ethernet cards have broken/missing
2080 	   hardware multicast filters :-( As a result, a host on a multicast
2081 	   network acquires a lot of useless route cache entries, e.g. for
2082 	   SDR messages from all over the world. Now we try to get rid of them.
2083 	   Really, provided the software IP multicast filter is organized
2084 	   reasonably (at least, hashed), it does not result in a slowdown
2085 	   compared with route cache reject entries.
2086 	   Note that multicast routers are not affected, because a
2087 	   route cache entry is created for them eventually.
2088 	 */
2089 	if (ipv4_is_multicast(daddr)) {
2090 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2091 		int our = 0;
2092 		int err = -EINVAL;
2093 
2094 		if (in_dev)
2095 			our = ip_check_mc_rcu(in_dev, daddr, saddr,
2096 					      ip_hdr(skb)->protocol);
2097 
2098 		/* check l3 master if no match yet */
2099 		if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2100 			struct in_device *l3_in_dev;
2101 
2102 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2103 			if (l3_in_dev)
2104 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2105 						      ip_hdr(skb)->protocol);
2106 		}
2107 
2108 		if (our
2109 #ifdef CONFIG_IP_MROUTE
2110 			||
2111 		    (!ipv4_is_local_multicast(daddr) &&
2112 		     IN_DEV_MFORWARD(in_dev))
2113 #endif
2114 		   ) {
2115 			err = ip_route_input_mc(skb, daddr, saddr,
2116 						tos, dev, our);
2117 		}
2118 		return err;
2119 	}
2120 
2121 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2122 }
2123 
2124 /* called with rcu_read_lock() */
2125 static struct rtable *__mkroute_output(const struct fib_result *res,
2126 				       const struct flowi4 *fl4, int orig_oif,
2127 				       struct net_device *dev_out,
2128 				       unsigned int flags)
2129 {
2130 	struct fib_info *fi = res->fi;
2131 	struct fib_nh_exception *fnhe;
2132 	struct in_device *in_dev;
2133 	u16 type = res->type;
2134 	struct rtable *rth;
2135 	bool do_cache;
2136 
2137 	in_dev = __in_dev_get_rcu(dev_out);
2138 	if (!in_dev)
2139 		return ERR_PTR(-EINVAL);
2140 
2141 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2142 		if (ipv4_is_loopback(fl4->saddr) &&
2143 		    !(dev_out->flags & IFF_LOOPBACK) &&
2144 		    !netif_is_l3_master(dev_out))
2145 			return ERR_PTR(-EINVAL);
2146 
2147 	if (ipv4_is_lbcast(fl4->daddr))
2148 		type = RTN_BROADCAST;
2149 	else if (ipv4_is_multicast(fl4->daddr))
2150 		type = RTN_MULTICAST;
2151 	else if (ipv4_is_zeronet(fl4->daddr))
2152 		return ERR_PTR(-EINVAL);
2153 
2154 	if (dev_out->flags & IFF_LOOPBACK)
2155 		flags |= RTCF_LOCAL;
2156 
2157 	do_cache = true;
2158 	if (type == RTN_BROADCAST) {
2159 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2160 		fi = NULL;
2161 	} else if (type == RTN_MULTICAST) {
2162 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2163 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2164 				     fl4->flowi4_proto))
2165 			flags &= ~RTCF_LOCAL;
2166 		else
2167 			do_cache = false;
2168 		/* If a multicast route does not exist, use the
2169 		 * default one, but do not gateway in this case.
2170 		 * Yes, it is a hack.
2171 		 */
2172 		if (fi && res->prefixlen < 4)
2173 			fi = NULL;
2174 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2175 		   (orig_oif != dev_out->ifindex)) {
2176 		/* For local routes that require a particular output interface
2177 		 * we do not want to cache the result.  Caching the result
2178 		 * causes incorrect behaviour when there are multiple source
2179 		 * addresses on the interface: if the intended recipient is
2180 		 * waiting on that interface for the packet, it won't be
2181 		 * received, because the packet will be delivered on the
2182 		 * loopback interface and the IP_PKTINFO ipi_ifindex will be
2183 		 * set to the loopback interface as well.
2184 		 */
2185 		fi = NULL;
2186 	}
2187 
2188 	fnhe = NULL;
2189 	do_cache &= fi != NULL;
2190 	if (do_cache) {
2191 		struct rtable __rcu **prth;
2192 		struct fib_nh *nh = &FIB_RES_NH(*res);
2193 
2194 		fnhe = find_exception(nh, fl4->daddr);
2195 		if (fnhe) {
2196 			prth = &fnhe->fnhe_rth_output;
2197 			rth = rcu_dereference(*prth);
2198 			if (rth && rth->dst.expires &&
2199 			    time_after(jiffies, rth->dst.expires)) {
2200 				ip_del_fnhe(nh, fl4->daddr);
2201 				fnhe = NULL;
2202 			} else {
2203 				goto rt_cache;
2204 			}
2205 		}
2206 
2207 		if (unlikely(fl4->flowi4_flags &
2208 			     FLOWI_FLAG_KNOWN_NH &&
2209 			     !(nh->nh_gw &&
2210 			       nh->nh_scope == RT_SCOPE_LINK))) {
2211 			do_cache = false;
2212 			goto add;
2213 		}
2214 		prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2215 		rth = rcu_dereference(*prth);
2216 
2217 rt_cache:
2218 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2219 			return rth;
2220 	}
2221 
2222 add:
2223 	rth = rt_dst_alloc(dev_out, flags, type,
2224 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2225 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2226 			   do_cache);
2227 	if (!rth)
2228 		return ERR_PTR(-ENOBUFS);
2229 
2230 	rth->rt_iif = orig_oif;
2231 
2232 	RT_CACHE_STAT_INC(out_slow_tot);
2233 
2234 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2235 		if (flags & RTCF_LOCAL &&
2236 		    !(dev_out->flags & IFF_LOOPBACK)) {
2237 			rth->dst.output = ip_mc_output;
2238 			RT_CACHE_STAT_INC(out_slow_mc);
2239 		}
2240 #ifdef CONFIG_IP_MROUTE
2241 		if (type == RTN_MULTICAST) {
2242 			if (IN_DEV_MFORWARD(in_dev) &&
2243 			    !ipv4_is_local_multicast(fl4->daddr)) {
2244 				rth->dst.input = ip_mr_input;
2245 				rth->dst.output = ip_mc_output;
2246 			}
2247 		}
2248 #endif
2249 	}
2250 
2251 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2252 	lwtunnel_set_redirect(&rth->dst);
2253 
2254 	return rth;
2255 }
2256 
2257 /*
2258  * Major route resolver routine.
2259  */
2260 
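/* Initializes the flow (tos mask, loopback iif, ONLINK scope) and then
 * performs the actual lookup under rcu_read_lock() via
 * ip_route_output_key_hash_rcu().
 */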
2261 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2262 					const struct sk_buff *skb)
2263 {
2264 	__u8 tos = RT_FL_TOS(fl4);
2265 	struct fib_result res;
2266 	struct rtable *rth;
2267 
2268 	res.tclassid	= 0;
2269 	res.fi		= NULL;
2270 	res.table	= NULL;
2271 
2272 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2273 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2274 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2275 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2276 
2277 	rcu_read_lock();
2278 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2279 	rcu_read_unlock();
2280 
2281 	return rth;
2282 }
2283 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2284 
2285 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2286 					    struct fib_result *res,
2287 					    const struct sk_buff *skb)
2288 {
2289 	struct net_device *dev_out = NULL;
2290 	int orig_oif = fl4->flowi4_oif;
2291 	unsigned int flags = 0;
2292 	struct rtable *rth;
2293 	int err = -ENETUNREACH;
2294 
2295 	if (fl4->saddr) {
2296 		rth = ERR_PTR(-EINVAL);
2297 		if (ipv4_is_multicast(fl4->saddr) ||
2298 		    ipv4_is_lbcast(fl4->saddr) ||
2299 		    ipv4_is_zeronet(fl4->saddr))
2300 			goto out;
2301 
2302 		/* I removed the check for oif == dev_out->oif here.
2303 		   It was wrong for two reasons:
2304 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2305 		      is assigned to multiple interfaces.
2306 		   2. Moreover, we are allowed to send packets with the saddr
2307 		      of another iface. --ANK
2308 		 */
2309 
2310 		if (fl4->flowi4_oif == 0 &&
2311 		    (ipv4_is_multicast(fl4->daddr) ||
2312 		     ipv4_is_lbcast(fl4->daddr))) {
2313 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2314 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2315 			if (!dev_out)
2316 				goto out;
2317 
2318 			/* Special hack: the user can direct multicasts
2319 			   and limited broadcast via the desired interface
2320 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2321 			   This hack is not just for fun, it allows
2322 			   vic, vat and friends to work.
2323 			   They bind the socket to loopback, set the ttl to zero
2324 			   and expect that it will work.
2325 			   From the viewpoint of the routing cache they are broken,
2326 			   because we are not allowed to build a multicast path
2327 			   with a loopback source addr (the routing cache
2328 			   cannot know that the ttl is zero, so that the packet
2329 			   will never leave this host and the route is valid).
2330 			   Luckily, this hack is a good workaround.
2331 			 */
2332 
2333 			fl4->flowi4_oif = dev_out->ifindex;
2334 			goto make_route;
2335 		}
2336 
2337 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2338 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2339 			if (!__ip_dev_find(net, fl4->saddr, false))
2340 				goto out;
2341 		}
2342 	}
2343 
2344 
2345 	if (fl4->flowi4_oif) {
2346 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2347 		rth = ERR_PTR(-ENODEV);
2348 		if (!dev_out)
2349 			goto out;
2350 
2351 		/* RACE: Check return value of inet_select_addr instead. */
2352 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2353 			rth = ERR_PTR(-ENETUNREACH);
2354 			goto out;
2355 		}
2356 		if (ipv4_is_local_multicast(fl4->daddr) ||
2357 		    ipv4_is_lbcast(fl4->daddr) ||
2358 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2359 			if (!fl4->saddr)
2360 				fl4->saddr = inet_select_addr(dev_out, 0,
2361 							      RT_SCOPE_LINK);
2362 			goto make_route;
2363 		}
2364 		if (!fl4->saddr) {
2365 			if (ipv4_is_multicast(fl4->daddr))
2366 				fl4->saddr = inet_select_addr(dev_out, 0,
2367 							      fl4->flowi4_scope);
2368 			else if (!fl4->daddr)
2369 				fl4->saddr = inet_select_addr(dev_out, 0,
2370 							      RT_SCOPE_HOST);
2371 		}
2372 	}
2373 
2374 	if (!fl4->daddr) {
2375 		fl4->daddr = fl4->saddr;
2376 		if (!fl4->daddr)
2377 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2378 		dev_out = net->loopback_dev;
2379 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2380 		res->type = RTN_LOCAL;
2381 		flags |= RTCF_LOCAL;
2382 		goto make_route;
2383 	}
2384 
2385 	err = fib_lookup(net, fl4, res, 0);
2386 	if (err) {
2387 		res->fi = NULL;
2388 		res->table = NULL;
2389 		if (fl4->flowi4_oif &&
2390 		    (ipv4_is_multicast(fl4->daddr) ||
2391 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2392 			/* Apparently, the routing tables are wrong. Assume
2393 			   that the destination is on-link.
2394 
2395 			   WHY? DW.
2396 			   Because we are allowed to send to an iface
2397 			   even if it has NO routes and NO assigned
2398 			   addresses. When oif is specified, the routing
2399 			   tables are looked up with only one purpose:
2400 			   to catch whether the destination is gatewayed rather
2401 			   than direct. Moreover, if MSG_DONTROUTE is set,
2402 			   we send the packet, ignoring both routing tables
2403 			   and ifaddr state. --ANK
2404 
2405 
2406 			   We could do this even when oif is unknown,
2407 			   as is likely done for IPv6, but we do not.
2408 			 */
2409 
2410 			if (fl4->saddr == 0)
2411 				fl4->saddr = inet_select_addr(dev_out, 0,
2412 							      RT_SCOPE_LINK);
2413 			res->type = RTN_UNICAST;
2414 			goto make_route;
2415 		}
2416 		rth = ERR_PTR(err);
2417 		goto out;
2418 	}
2419 
2420 	if (res->type == RTN_LOCAL) {
2421 		if (!fl4->saddr) {
2422 			if (res->fi->fib_prefsrc)
2423 				fl4->saddr = res->fi->fib_prefsrc;
2424 			else
2425 				fl4->saddr = fl4->daddr;
2426 		}
2427 
2428 		/* L3 master device is the loopback for that domain */
2429 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2430 			net->loopback_dev;
2431 
2432 		/* make sure orig_oif points to fib result device even
2433 		 * though packet rx/tx happens over loopback or l3mdev
2434 		 */
2435 		orig_oif = FIB_RES_OIF(*res);
2436 
2437 		fl4->flowi4_oif = dev_out->ifindex;
2438 		flags |= RTCF_LOCAL;
2439 		goto make_route;
2440 	}
2441 
2442 	fib_select_path(net, res, fl4, skb);
2443 
2444 	dev_out = FIB_RES_DEV(*res);
2445 	fl4->flowi4_oif = dev_out->ifindex;
2446 
2447 
2448 make_route:
2449 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2450 
2451 out:
2452 	return rth;
2453 }
2454 
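/* Blackhole dst_ops: ->check always fails, PMTU updates, redirects and
 * metric COW are all no-ops, so a blackhole route carries no mutable state
 * and is never revalidated as usable.
 */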
2455 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2456 {
2457 	return NULL;
2458 }
2459 
2460 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2461 {
2462 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2463 
2464 	return mtu ? : dst->dev->mtu;
2465 }
2466 
2467 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2468 					  struct sk_buff *skb, u32 mtu)
2469 {
2470 }
2471 
2472 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2473 				       struct sk_buff *skb)
2474 {
2475 }
2476 
2477 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2478 					  unsigned long old)
2479 {
2480 	return NULL;
2481 }
2482 
2483 static struct dst_ops ipv4_dst_blackhole_ops = {
2484 	.family			=	AF_INET,
2485 	.check			=	ipv4_blackhole_dst_check,
2486 	.mtu			=	ipv4_blackhole_mtu,
2487 	.default_advmss		=	ipv4_default_advmss,
2488 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2489 	.redirect		=	ipv4_rt_blackhole_redirect,
2490 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2491 	.neigh_lookup		=	ipv4_neigh_lookup,
2492 };
2493 
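/* Clone @dst_orig into a blackhole route: the copy keeps the original
 * route's identity (flags, type, gateway) but discards every packet via
 * dst_discard/dst_discard_out, and the reference to @dst_orig is dropped.
 */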
2494 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2495 {
2496 	struct rtable *ort = (struct rtable *) dst_orig;
2497 	struct rtable *rt;
2498 
2499 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2500 	if (rt) {
2501 		struct dst_entry *new = &rt->dst;
2502 
2503 		new->__use = 1;
2504 		new->input = dst_discard;
2505 		new->output = dst_discard_out;
2506 
2507 		new->dev = net->loopback_dev;
2508 		if (new->dev)
2509 			dev_hold(new->dev);
2510 
2511 		rt->rt_is_input = ort->rt_is_input;
2512 		rt->rt_iif = ort->rt_iif;
2513 		rt->rt_pmtu = ort->rt_pmtu;
2514 
2515 		rt->rt_genid = rt_genid_ipv4(net);
2516 		rt->rt_flags = ort->rt_flags;
2517 		rt->rt_type = ort->rt_type;
2518 		rt->rt_gateway = ort->rt_gateway;
2519 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2520 
2521 		INIT_LIST_HEAD(&rt->rt_uncached);
2522 	}
2523 
2524 	dst_release(dst_orig);
2525 
2526 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2527 }
2528 
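/* Like __ip_route_output_key(), but when a protocol is given the result is
 * additionally passed through xfrm_lookup_route() so that xfrm policy is
 * applied to the returned route.
 */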
2529 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2530 				    const struct sock *sk)
2531 {
2532 	struct rtable *rt = __ip_route_output_key(net, flp4);
2533 
2534 	if (IS_ERR(rt))
2535 		return rt;
2536 
2537 	if (flp4->flowi4_proto)
2538 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2539 							flowi4_to_flowi(flp4),
2540 							sk, 0);
2541 
2542 	return rt;
2543 }
2544 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2545 
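/* Build an RTM_NEWROUTE message for the route attached to @skb, reporting
 * the flow in @fl4 together with cached metrics, PMTU expiry and flags.
 */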
2546 /* called with rcu_read_lock held */
2547 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2548 			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2549 			u32 seq)
2550 {
2551 	struct rtable *rt = skb_rtable(skb);
2552 	struct rtmsg *r;
2553 	struct nlmsghdr *nlh;
2554 	unsigned long expires = 0;
2555 	u32 error;
2556 	u32 metrics[RTAX_MAX];
2557 
2558 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2559 	if (!nlh)
2560 		return -EMSGSIZE;
2561 
2562 	r = nlmsg_data(nlh);
2563 	r->rtm_family	 = AF_INET;
2564 	r->rtm_dst_len	= 32;
2565 	r->rtm_src_len	= 0;
2566 	r->rtm_tos	= fl4->flowi4_tos;
2567 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2568 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2569 		goto nla_put_failure;
2570 	r->rtm_type	= rt->rt_type;
2571 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2572 	r->rtm_protocol = RTPROT_UNSPEC;
2573 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2574 	if (rt->rt_flags & RTCF_NOTIFY)
2575 		r->rtm_flags |= RTM_F_NOTIFY;
2576 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2577 		r->rtm_flags |= RTCF_DOREDIRECT;
2578 
2579 	if (nla_put_in_addr(skb, RTA_DST, dst))
2580 		goto nla_put_failure;
2581 	if (src) {
2582 		r->rtm_src_len = 32;
2583 		if (nla_put_in_addr(skb, RTA_SRC, src))
2584 			goto nla_put_failure;
2585 	}
2586 	if (rt->dst.dev &&
2587 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2588 		goto nla_put_failure;
2589 #ifdef CONFIG_IP_ROUTE_CLASSID
2590 	if (rt->dst.tclassid &&
2591 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2592 		goto nla_put_failure;
2593 #endif
2594 	if (!rt_is_input_route(rt) &&
2595 	    fl4->saddr != src) {
2596 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2597 			goto nla_put_failure;
2598 	}
2599 	if (rt->rt_uses_gateway &&
2600 	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2601 		goto nla_put_failure;
2602 
2603 	expires = rt->dst.expires;
2604 	if (expires) {
2605 		unsigned long now = jiffies;
2606 
2607 		if (time_before(now, expires))
2608 			expires -= now;
2609 		else
2610 			expires = 0;
2611 	}
2612 
2613 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2614 	if (rt->rt_pmtu && expires)
2615 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2616 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2617 		goto nla_put_failure;
2618 
2619 	if (fl4->flowi4_mark &&
2620 	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2621 		goto nla_put_failure;
2622 
2623 	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2624 	    nla_put_u32(skb, RTA_UID,
2625 			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2626 		goto nla_put_failure;
2627 
2628 	error = rt->dst.error;
2629 
2630 	if (rt_is_input_route(rt)) {
2631 #ifdef CONFIG_IP_MROUTE
2632 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2633 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2634 			int err = ipmr_get_route(net, skb,
2635 						 fl4->saddr, fl4->daddr,
2636 						 r, portid);
2637 
2638 			if (err <= 0) {
2639 				if (err == 0)
2640 					return 0;
2641 				goto nla_put_failure;
2642 			}
2643 		} else
2644 #endif
2645 			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2646 				goto nla_put_failure;
2647 	}
2648 
2649 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2650 		goto nla_put_failure;
2651 
2652 	nlmsg_end(skb, nlh);
2653 	return 0;
2654 
2655 nla_put_failure:
2656 	nlmsg_cancel(skb, nlh);
2657 	return -EMSGSIZE;
2658 }
2659 
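/* RTM_GETROUTE handler: build a dummy skb for the requested flow, resolve
 * it with an input lookup when RTA_IIF is given (output lookup otherwise)
 * and unicast the resulting route back to the requester.
 */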
2660 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2661 			     struct netlink_ext_ack *extack)
2662 {
2663 	struct net *net = sock_net(in_skb->sk);
2664 	struct rtmsg *rtm;
2665 	struct nlattr *tb[RTA_MAX+1];
2666 	struct fib_result res = {};
2667 	struct rtable *rt = NULL;
2668 	struct flowi4 fl4;
2669 	__be32 dst = 0;
2670 	__be32 src = 0;
2671 	u32 iif;
2672 	int err;
2673 	int mark;
2674 	struct sk_buff *skb;
2675 	u32 table_id = RT_TABLE_MAIN;
2676 	kuid_t uid;
2677 
2678 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2679 			  extack);
2680 	if (err < 0)
2681 		goto errout;
2682 
2683 	rtm = nlmsg_data(nlh);
2684 
2685 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2686 	if (!skb) {
2687 		err = -ENOBUFS;
2688 		goto errout;
2689 	}
2690 
2691 	/* Reserve room for dummy headers; this skb can pass
2692 	   through a good chunk of the routing engine.
2693 	 */
2694 	skb_reset_mac_header(skb);
2695 	skb_reset_network_header(skb);
2696 
2697 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2698 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2699 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2700 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2701 	if (tb[RTA_UID])
2702 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2703 	else
2704 		uid = (iif ? INVALID_UID : current_uid());
2705 
2706 	/* Bugfix: need to give ip_route_input enough of an IP header to
2707 	 * not gag.
2708 	 */
2709 	ip_hdr(skb)->protocol = IPPROTO_UDP;
2710 	ip_hdr(skb)->saddr = src;
2711 	ip_hdr(skb)->daddr = dst;
2712 
2713 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2714 
2715 	memset(&fl4, 0, sizeof(fl4));
2716 	fl4.daddr = dst;
2717 	fl4.saddr = src;
2718 	fl4.flowi4_tos = rtm->rtm_tos;
2719 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2720 	fl4.flowi4_mark = mark;
2721 	fl4.flowi4_uid = uid;
2722 
2723 	rcu_read_lock();
2724 
2725 	if (iif) {
2726 		struct net_device *dev;
2727 
2728 		dev = dev_get_by_index_rcu(net, iif);
2729 		if (!dev) {
2730 			err = -ENODEV;
2731 			goto errout_free;
2732 		}
2733 
2734 		skb->protocol	= htons(ETH_P_IP);
2735 		skb->dev	= dev;
2736 		skb->mark	= mark;
2737 		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2738 					 dev, &res);
2739 
2740 		rt = skb_rtable(skb);
2741 		if (err == 0 && rt->dst.error)
2742 			err = -rt->dst.error;
2743 	} else {
2744 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
2745 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2746 		err = 0;
2747 		if (IS_ERR(rt))
2748 			err = PTR_ERR(rt);
2749 		else
2750 			skb_dst_set(skb, &rt->dst);
2751 	}
2752 
2753 	if (err)
2754 		goto errout_free;
2755 
2756 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2757 		rt->rt_flags |= RTCF_NOTIFY;
2758 
2759 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2760 		table_id = res.table ? res.table->tb_id : 0;
2761 
2762 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2763 		if (!res.fi) {
2764 			err = fib_props[res.type].error;
2765 			if (!err)
2766 				err = -EHOSTUNREACH;
2767 			goto errout_free;
2768 		}
2769 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2770 				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2771 				    rt->rt_type, res.prefix, res.prefixlen,
2772 				    fl4.flowi4_tos, res.fi, 0);
2773 	} else {
2774 		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2775 				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2776 	}
2777 	if (err < 0)
2778 		goto errout_free;
2779 
2780 	rcu_read_unlock();
2781 
2782 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2783 errout:
2784 	return err;
2785 
2786 errout_free:
2787 	rcu_read_unlock();
2788 	kfree_skb(skb);
2789 	goto errout;
2790 }
2791 
2792 void ip_rt_multicast_event(struct in_device *in_dev)
2793 {
2794 	rt_cache_flush(dev_net(in_dev->dev));
2795 }
2796 
2797 #ifdef CONFIG_SYSCTL
2798 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2799 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2800 static int ip_rt_gc_elasticity __read_mostly	= 8;
2801 
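/* Handler for the write-only "flush" sysctl: any write flushes the
 * per-netns routing cache and bumps the fnhe genid; reads return -EINVAL.
 */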
2802 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2803 					void __user *buffer,
2804 					size_t *lenp, loff_t *ppos)
2805 {
2806 	struct net *net = (struct net *)__ctl->extra1;
2807 
2808 	if (write) {
2809 		rt_cache_flush(net);
2810 		fnhe_genid_bump(net);
2811 		return 0;
2812 	}
2813 
2814 	return -EINVAL;
2815 }
2816 
2817 static struct ctl_table ipv4_route_table[] = {
2818 	{
2819 		.procname	= "gc_thresh",
2820 		.data		= &ipv4_dst_ops.gc_thresh,
2821 		.maxlen		= sizeof(int),
2822 		.mode		= 0644,
2823 		.proc_handler	= proc_dointvec,
2824 	},
2825 	{
2826 		.procname	= "max_size",
2827 		.data		= &ip_rt_max_size,
2828 		.maxlen		= sizeof(int),
2829 		.mode		= 0644,
2830 		.proc_handler	= proc_dointvec,
2831 	},
2832 	{
2833 		/*  Deprecated. Use gc_min_interval_ms */
2834 
2835 		.procname	= "gc_min_interval",
2836 		.data		= &ip_rt_gc_min_interval,
2837 		.maxlen		= sizeof(int),
2838 		.mode		= 0644,
2839 		.proc_handler	= proc_dointvec_jiffies,
2840 	},
2841 	{
2842 		.procname	= "gc_min_interval_ms",
2843 		.data		= &ip_rt_gc_min_interval,
2844 		.maxlen		= sizeof(int),
2845 		.mode		= 0644,
2846 		.proc_handler	= proc_dointvec_ms_jiffies,
2847 	},
2848 	{
2849 		.procname	= "gc_timeout",
2850 		.data		= &ip_rt_gc_timeout,
2851 		.maxlen		= sizeof(int),
2852 		.mode		= 0644,
2853 		.proc_handler	= proc_dointvec_jiffies,
2854 	},
2855 	{
2856 		.procname	= "gc_interval",
2857 		.data		= &ip_rt_gc_interval,
2858 		.maxlen		= sizeof(int),
2859 		.mode		= 0644,
2860 		.proc_handler	= proc_dointvec_jiffies,
2861 	},
2862 	{
2863 		.procname	= "redirect_load",
2864 		.data		= &ip_rt_redirect_load,
2865 		.maxlen		= sizeof(int),
2866 		.mode		= 0644,
2867 		.proc_handler	= proc_dointvec,
2868 	},
2869 	{
2870 		.procname	= "redirect_number",
2871 		.data		= &ip_rt_redirect_number,
2872 		.maxlen		= sizeof(int),
2873 		.mode		= 0644,
2874 		.proc_handler	= proc_dointvec,
2875 	},
2876 	{
2877 		.procname	= "redirect_silence",
2878 		.data		= &ip_rt_redirect_silence,
2879 		.maxlen		= sizeof(int),
2880 		.mode		= 0644,
2881 		.proc_handler	= proc_dointvec,
2882 	},
2883 	{
2884 		.procname	= "error_cost",
2885 		.data		= &ip_rt_error_cost,
2886 		.maxlen		= sizeof(int),
2887 		.mode		= 0644,
2888 		.proc_handler	= proc_dointvec,
2889 	},
2890 	{
2891 		.procname	= "error_burst",
2892 		.data		= &ip_rt_error_burst,
2893 		.maxlen		= sizeof(int),
2894 		.mode		= 0644,
2895 		.proc_handler	= proc_dointvec,
2896 	},
2897 	{
2898 		.procname	= "gc_elasticity",
2899 		.data		= &ip_rt_gc_elasticity,
2900 		.maxlen		= sizeof(int),
2901 		.mode		= 0644,
2902 		.proc_handler	= proc_dointvec,
2903 	},
2904 	{
2905 		.procname	= "mtu_expires",
2906 		.data		= &ip_rt_mtu_expires,
2907 		.maxlen		= sizeof(int),
2908 		.mode		= 0644,
2909 		.proc_handler	= proc_dointvec_jiffies,
2910 	},
2911 	{
2912 		.procname	= "min_pmtu",
2913 		.data		= &ip_rt_min_pmtu,
2914 		.maxlen		= sizeof(int),
2915 		.mode		= 0644,
2916 		.proc_handler	= proc_dointvec,
2917 	},
2918 	{
2919 		.procname	= "min_adv_mss",
2920 		.data		= &ip_rt_min_advmss,
2921 		.maxlen		= sizeof(int),
2922 		.mode		= 0644,
2923 		.proc_handler	= proc_dointvec,
2924 	},
2925 	{ }
2926 };
2927 
2928 static struct ctl_table ipv4_route_flush_table[] = {
2929 	{
2930 		.procname	= "flush",
2931 		.maxlen		= sizeof(int),
2932 		.mode		= 0200,
2933 		.proc_handler	= ipv4_sysctl_rtcache_flush,
2934 	},
2935 	{ },
2936 };
2937 
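/* Register the per-netns "flush" sysctl.  Non-init namespaces get their own
 * copy of the table, and the entry is hidden from unprivileged user
 * namespaces by clearing its procname.
 */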
2938 static __net_init int sysctl_route_net_init(struct net *net)
2939 {
2940 	struct ctl_table *tbl;
2941 
2942 	tbl = ipv4_route_flush_table;
2943 	if (!net_eq(net, &init_net)) {
2944 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2945 		if (!tbl)
2946 			goto err_dup;
2947 
2948 		/* Don't export sysctls to unprivileged users */
2949 		if (net->user_ns != &init_user_ns)
2950 			tbl[0].procname = NULL;
2951 	}
2952 	tbl[0].extra1 = net;
2953 
2954 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2955 	if (!net->ipv4.route_hdr)
2956 		goto err_reg;
2957 	return 0;
2958 
2959 err_reg:
2960 	if (tbl != ipv4_route_flush_table)
2961 		kfree(tbl);
2962 err_dup:
2963 	return -ENOMEM;
2964 }
2965 
2966 static __net_exit void sysctl_route_net_exit(struct net *net)
2967 {
2968 	struct ctl_table *tbl;
2969 
2970 	tbl = net->ipv4.route_hdr->ctl_table_arg;
2971 	unregister_net_sysctl_table(net->ipv4.route_hdr);
2972 	BUG_ON(tbl == ipv4_route_flush_table);
2973 	kfree(tbl);
2974 }
2975 
2976 static __net_initdata struct pernet_operations sysctl_route_ops = {
2977 	.init = sysctl_route_net_init,
2978 	.exit = sysctl_route_net_exit,
2979 	.async = true,
2980 };
2981 #endif
2982 
2983 static __net_init int rt_genid_init(struct net *net)
2984 {
2985 	atomic_set(&net->ipv4.rt_genid, 0);
2986 	atomic_set(&net->fnhe_genid, 0);
2987 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
2988 	return 0;
2989 }
2990 
2991 static __net_initdata struct pernet_operations rt_genid_ops = {
2992 	.init = rt_genid_init,
2993 	.async = true,
2994 };
2995 
2996 static int __net_init ipv4_inetpeer_init(struct net *net)
2997 {
2998 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2999 
3000 	if (!bp)
3001 		return -ENOMEM;
3002 	inet_peer_base_init(bp);
3003 	net->ipv4.peers = bp;
3004 	return 0;
3005 }
3006 
3007 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3008 {
3009 	struct inet_peer_base *bp = net->ipv4.peers;
3010 
3011 	net->ipv4.peers = NULL;
3012 	inetpeer_invalidate_tree(bp);
3013 	kfree(bp);
3014 }
3015 
3016 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3017 	.init	=	ipv4_inetpeer_init,
3018 	.exit	=	ipv4_inetpeer_exit,
3019 	.async	=	true,
3020 };
3021 
3022 #ifdef CONFIG_IP_ROUTE_CLASSID
3023 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3024 #endif /* CONFIG_IP_ROUTE_CLASSID */
3025 
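/* Boot-time initialization: allocate the IP ID and timestamp arrays, the
 * per-cpu uncached lists and the dst caches, then initialize devinet, the
 * FIB, /proc entries, xfrm and the RTM_GETROUTE handler, and finally
 * register the pernet subsystems.
 */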
3026 int __init ip_rt_init(void)
3027 {
3028 	int cpu;
3029 
3030 	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3031 	if (!ip_idents)
3032 		panic("IP: failed to allocate ip_idents\n");
3033 
3034 	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3035 
3036 	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3037 	if (!ip_tstamps)
3038 		panic("IP: failed to allocate ip_tstamps\n");
3039 
3040 	for_each_possible_cpu(cpu) {
3041 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3042 
3043 		INIT_LIST_HEAD(&ul->head);
3044 		spin_lock_init(&ul->lock);
3045 	}
3046 #ifdef CONFIG_IP_ROUTE_CLASSID
3047 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3048 	if (!ip_rt_acct)
3049 		panic("IP: failed to allocate ip_rt_acct\n");
3050 #endif
3051 
3052 	ipv4_dst_ops.kmem_cachep =
3053 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3054 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3055 
3056 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3057 
3058 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3059 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3060 
3061 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3062 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3063 
3064 	ipv4_dst_ops.gc_thresh = ~0;
3065 	ip_rt_max_size = INT_MAX;
3066 
3067 	devinet_init();
3068 	ip_fib_init();
3069 
3070 	if (ip_rt_proc_init())
3071 		pr_err("Unable to create route proc files\n");
3072 #ifdef CONFIG_XFRM
3073 	xfrm_init();
3074 	xfrm4_init();
3075 #endif
3076 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3077 		      RTNL_FLAG_DOIT_UNLOCKED);
3078 
3079 #ifdef CONFIG_SYSCTL
3080 	register_pernet_subsys(&sysctl_route_ops);
3081 #endif
3082 	register_pernet_subsys(&rt_genid_ops);
3083 	register_pernet_subsys(&ipv4_inetpeer_ops);
3084 	return 0;
3085 }
3086 
3087 #ifdef CONFIG_SYSCTL
3088 /*
3089  * We really need to sanitize the damn ipv4 init order, then all
3090  * this nonsense will go away.
3091  */
3092 void __init ip_static_sysctl_init(void)
3093 {
3094 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3095 }
3096 #endif
3097