xref: /linux/net/ipv4/route.c (revision bf8981a2aa082d9d64771b47c8a1c9c388d8cd40)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD;
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/secure_seq.h>
113 #include <net/ip_tunnels.h>
114 #include <net/l3mdev.h>
115 
116 #include "fib_lookup.h"
117 
118 #define RT_FL_TOS(oldflp4) \
119 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
120 
121 #define RT_GC_TIMEOUT (300*HZ)
122 
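/*
 * Compile-time defaults for the routing tunables below.  Most of these are
 * exposed as run-time tunables under the net.ipv4.route.* sysctls.  As an
 * illustration of the values chosen here: ip_rt_min_pmtu is
 * 512 + 20 + 20 = 552 bytes (payload plus IPv4 and TCP headers), and
 * ip_rt_redirect_silence works out to (HZ / 50) << 10 jiffies, i.e.
 * roughly 20 seconds.
 */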
123 static int ip_rt_max_size;
124 static int ip_rt_redirect_number __read_mostly	= 9;
125 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly	= HZ;
128 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
129 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
130 static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly	= 256;
132 
133 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
134 
135 /*
136  *	Interface to generic destination cache.
137  */
138 
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
141 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143 static void		 ipv4_link_failure(struct sk_buff *skb);
144 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
145 					   struct sk_buff *skb, u32 mtu);
146 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147 					struct sk_buff *skb);
148 static void		ipv4_dst_destroy(struct dst_entry *dst);
149 
150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151 {
152 	WARN_ON(1);
153 	return NULL;
154 }
155 
156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 					   struct sk_buff *skb,
158 					   const void *daddr);
159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
160 
161 static struct dst_ops ipv4_dst_ops = {
162 	.family =		AF_INET,
163 	.check =		ipv4_dst_check,
164 	.default_advmss =	ipv4_default_advmss,
165 	.mtu =			ipv4_mtu,
166 	.cow_metrics =		ipv4_cow_metrics,
167 	.destroy =		ipv4_dst_destroy,
168 	.negative_advice =	ipv4_negative_advice,
169 	.link_failure =		ipv4_link_failure,
170 	.update_pmtu =		ip_rt_update_pmtu,
171 	.redirect =		ip_do_redirect,
172 	.local_out =		__ip_local_out,
173 	.neigh_lookup =		ipv4_neigh_lookup,
174 	.confirm_neigh =	ipv4_confirm_neigh,
175 };
176 
177 #define ECN_OR_COST(class)	TC_PRIO_##class
178 
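/*
 * Map the four-bit IPv4 TOS value (see IPTOS_TOS()) to a packet scheduling
 * priority.  Callers typically index this table as
 * ip_tos2prio[IPTOS_TOS(tos) >> 1]; the ECN_OR_COST() entries fill the odd
 * slots, where the lowest TOS bit (historically "minimize cost", nowadays an
 * ECN bit) is set, and simply alias the same TC_PRIO_* class.
 */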
179 const __u8 ip_tos2prio[16] = {
180 	TC_PRIO_BESTEFFORT,
181 	ECN_OR_COST(BESTEFFORT),
182 	TC_PRIO_BESTEFFORT,
183 	ECN_OR_COST(BESTEFFORT),
184 	TC_PRIO_BULK,
185 	ECN_OR_COST(BULK),
186 	TC_PRIO_BULK,
187 	ECN_OR_COST(BULK),
188 	TC_PRIO_INTERACTIVE,
189 	ECN_OR_COST(INTERACTIVE),
190 	TC_PRIO_INTERACTIVE,
191 	ECN_OR_COST(INTERACTIVE),
192 	TC_PRIO_INTERACTIVE_BULK,
193 	ECN_OR_COST(INTERACTIVE_BULK),
194 	TC_PRIO_INTERACTIVE_BULK,
195 	ECN_OR_COST(INTERACTIVE_BULK)
196 };
197 EXPORT_SYMBOL(ip_tos2prio);
198 
199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
201 
202 #ifdef CONFIG_PROC_FS
203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204 {
205 	if (*pos)
206 		return NULL;
207 	return SEQ_START_TOKEN;
208 }
209 
210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211 {
212 	++*pos;
213 	return NULL;
214 }
215 
216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
217 {
218 }
219 
220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
221 {
222 	if (v == SEQ_START_TOKEN)
223 		seq_printf(seq, "%-127s\n",
224 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226 			   "HHUptod\tSpecDst");
227 	return 0;
228 }
229 
230 static const struct seq_operations rt_cache_seq_ops = {
231 	.start  = rt_cache_seq_start,
232 	.next   = rt_cache_seq_next,
233 	.stop   = rt_cache_seq_stop,
234 	.show   = rt_cache_seq_show,
235 };
236 
237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
238 {
239 	return seq_open(file, &rt_cache_seq_ops);
240 }
241 
242 static const struct file_operations rt_cache_seq_fops = {
243 	.open	 = rt_cache_seq_open,
244 	.read	 = seq_read,
245 	.llseek	 = seq_lseek,
246 	.release = seq_release,
247 };
248 
249 
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252 	int cpu;
253 
254 	if (*pos == 0)
255 		return SEQ_START_TOKEN;
256 
257 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258 		if (!cpu_possible(cpu))
259 			continue;
260 		*pos = cpu+1;
261 		return &per_cpu(rt_cache_stat, cpu);
262 	}
263 	return NULL;
264 }
265 
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268 	int cpu;
269 
270 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271 		if (!cpu_possible(cpu))
272 			continue;
273 		*pos = cpu+1;
274 		return &per_cpu(rt_cache_stat, cpu);
275 	}
276 	return NULL;
277 
278 }
279 
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282 
283 }
284 
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287 	struct rt_cache_stat *st = v;
288 
289 	if (v == SEQ_START_TOKEN) {
290 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 		return 0;
292 	}
293 
294 	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
295 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 		   dst_entries_get_slow(&ipv4_dst_ops),
297 		   0, /* st->in_hit */
298 		   st->in_slow_tot,
299 		   st->in_slow_mc,
300 		   st->in_no_route,
301 		   st->in_brd,
302 		   st->in_martian_dst,
303 		   st->in_martian_src,
304 
305 		   0, /* st->out_hit */
306 		   st->out_slow_tot,
307 		   st->out_slow_mc,
308 
309 		   0, /* st->gc_total */
310 		   0, /* st->gc_ignored */
311 		   0, /* st->gc_goal_miss */
312 		   0, /* st->gc_dst_overflow */
313 		   0, /* st->in_hlist_search */
314 		   0  /* st->out_hlist_search */
315 		);
316 	return 0;
317 }
318 
319 static const struct seq_operations rt_cpu_seq_ops = {
320 	.start  = rt_cpu_seq_start,
321 	.next   = rt_cpu_seq_next,
322 	.stop   = rt_cpu_seq_stop,
323 	.show   = rt_cpu_seq_show,
324 };
325 
326 
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329 	return seq_open(file, &rt_cpu_seq_ops);
330 }
331 
332 static const struct file_operations rt_cpu_seq_fops = {
333 	.open	 = rt_cpu_seq_open,
334 	.read	 = seq_read,
335 	.llseek	 = seq_lseek,
336 	.release = seq_release,
337 };
338 
339 #ifdef CONFIG_IP_ROUTE_CLASSID
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 {
342 	struct ip_rt_acct *dst, *src;
343 	unsigned int i, j;
344 
345 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346 	if (!dst)
347 		return -ENOMEM;
348 
349 	for_each_possible_cpu(i) {
350 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351 		for (j = 0; j < 256; j++) {
352 			dst[j].o_bytes   += src[j].o_bytes;
353 			dst[j].o_packets += src[j].o_packets;
354 			dst[j].i_bytes   += src[j].i_bytes;
355 			dst[j].i_packets += src[j].i_packets;
356 		}
357 	}
358 
359 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360 	kfree(dst);
361 	return 0;
362 }
363 #endif
364 
365 static int __net_init ip_rt_do_proc_init(struct net *net)
366 {
367 	struct proc_dir_entry *pde;
368 
369 	pde = proc_create("rt_cache", 0444, net->proc_net,
370 			  &rt_cache_seq_fops);
371 	if (!pde)
372 		goto err1;
373 
374 	pde = proc_create("rt_cache", 0444,
375 			  net->proc_net_stat, &rt_cpu_seq_fops);
376 	if (!pde)
377 		goto err2;
378 
379 #ifdef CONFIG_IP_ROUTE_CLASSID
380 	pde = proc_create_single("rt_acct", 0, net->proc_net,
381 			rt_acct_proc_show);
382 	if (!pde)
383 		goto err3;
384 #endif
385 	return 0;
386 
387 #ifdef CONFIG_IP_ROUTE_CLASSID
388 err3:
389 	remove_proc_entry("rt_cache", net->proc_net_stat);
390 #endif
391 err2:
392 	remove_proc_entry("rt_cache", net->proc_net);
393 err1:
394 	return -ENOMEM;
395 }
396 
397 static void __net_exit ip_rt_do_proc_exit(struct net *net)
398 {
399 	remove_proc_entry("rt_cache", net->proc_net_stat);
400 	remove_proc_entry("rt_cache", net->proc_net);
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402 	remove_proc_entry("rt_acct", net->proc_net);
403 #endif
404 }
405 
406 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
407 	.init = ip_rt_do_proc_init,
408 	.exit = ip_rt_do_proc_exit,
409 };
410 
411 static int __init ip_rt_proc_init(void)
412 {
413 	return register_pernet_subsys(&ip_rt_proc_ops);
414 }
415 
416 #else
417 static inline int ip_rt_proc_init(void)
418 {
419 	return 0;
420 }
421 #endif /* CONFIG_PROC_FS */
422 
423 static inline bool rt_is_expired(const struct rtable *rth)
424 {
425 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
426 }
427 
428 void rt_cache_flush(struct net *net)
429 {
430 	rt_genid_bump_ipv4(net);
431 }
432 
433 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
434 					   struct sk_buff *skb,
435 					   const void *daddr)
436 {
437 	struct net_device *dev = dst->dev;
438 	const __be32 *pkey = daddr;
439 	const struct rtable *rt;
440 	struct neighbour *n;
441 
442 	rt = (const struct rtable *) dst;
443 	if (rt->rt_gateway)
444 		pkey = (const __be32 *) &rt->rt_gateway;
445 	else if (skb)
446 		pkey = &ip_hdr(skb)->daddr;
447 
448 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
449 	if (n)
450 		return n;
451 	return neigh_create(&arp_tbl, pkey, dev);
452 }
453 
454 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
455 {
456 	struct net_device *dev = dst->dev;
457 	const __be32 *pkey = daddr;
458 	const struct rtable *rt;
459 
460 	rt = (const struct rtable *)dst;
461 	if (rt->rt_gateway)
462 		pkey = (const __be32 *)&rt->rt_gateway;
463 	else if (!daddr ||
464 		 (rt->rt_flags &
465 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
466 		return;
467 
468 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
469 }
470 
471 #define IP_IDENTS_SZ 2048u
472 
473 static atomic_t *ip_idents __read_mostly;
474 static u32 *ip_tstamps __read_mostly;
475 
476 /* In order to protect privacy, we add a perturbation to identifiers
477  * if one generator is seldom used. This makes it hard for an attacker
478  * to infer how many packets were sent between two points in time.
479  */
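/*
 * Illustration: every bucket remembers the jiffies value of its last use in
 * ip_tstamps[].  If a bucket that sat idle for, say, 1000 jiffies is used
 * again, a random delta in [0, 1000) is added on top of the @segs IDs being
 * reserved, so the gap between two observed IP IDs no longer reveals the
 * exact number of packets sent in between.
 */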
480 u32 ip_idents_reserve(u32 hash, int segs)
481 {
482 	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
483 	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
484 	u32 old = READ_ONCE(*p_tstamp);
485 	u32 now = (u32)jiffies;
486 	u32 new, delta = 0;
487 
488 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
489 		delta = prandom_u32_max(now - old);
490 
491 	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
492 	do {
493 		old = (u32)atomic_read(p_id);
494 		new = old + delta + segs;
495 	} while (atomic_cmpxchg(p_id, old, new) != old);
496 
497 	return new - segs;
498 }
499 EXPORT_SYMBOL(ip_idents_reserve);
500 
501 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
502 {
503 	u32 hash, id;
504 
505 	/* Note: the following initialization is racy, but that is okay here. */
506 	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
507 		get_random_bytes(&net->ipv4.ip_id_key,
508 				 sizeof(net->ipv4.ip_id_key));
509 
510 	hash = siphash_3u32((__force u32)iph->daddr,
511 			    (__force u32)iph->saddr,
512 			    iph->protocol,
513 			    &net->ipv4.ip_id_key);
514 	id = ip_idents_reserve(hash, segs);
515 	iph->id = htons(id);
516 }
517 EXPORT_SYMBOL(__ip_select_ident);
518 
519 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
520 			     const struct sock *sk,
521 			     const struct iphdr *iph,
522 			     int oif, u8 tos,
523 			     u8 prot, u32 mark, int flow_flags)
524 {
525 	if (sk) {
526 		const struct inet_sock *inet = inet_sk(sk);
527 
528 		oif = sk->sk_bound_dev_if;
529 		mark = sk->sk_mark;
530 		tos = RT_CONN_FLAGS(sk);
531 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
532 	}
533 	flowi4_init_output(fl4, oif, mark, tos,
534 			   RT_SCOPE_UNIVERSE, prot,
535 			   flow_flags,
536 			   iph->daddr, iph->saddr, 0, 0,
537 			   sock_net_uid(net, sk));
538 }
539 
540 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
541 			       const struct sock *sk)
542 {
543 	const struct net *net = dev_net(skb->dev);
544 	const struct iphdr *iph = ip_hdr(skb);
545 	int oif = skb->dev->ifindex;
546 	u8 tos = RT_TOS(iph->tos);
547 	u8 prot = iph->protocol;
548 	u32 mark = skb->mark;
549 
550 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
551 }
552 
553 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
554 {
555 	const struct inet_sock *inet = inet_sk(sk);
556 	const struct ip_options_rcu *inet_opt;
557 	__be32 daddr = inet->inet_daddr;
558 
559 	rcu_read_lock();
560 	inet_opt = rcu_dereference(inet->inet_opt);
561 	if (inet_opt && inet_opt->opt.srr)
562 		daddr = inet_opt->opt.faddr;
563 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
564 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
565 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
566 			   inet_sk_flowi_flags(sk),
567 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
568 	rcu_read_unlock();
569 }
570 
571 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
572 				 const struct sk_buff *skb)
573 {
574 	if (skb)
575 		build_skb_flow_key(fl4, skb, sk);
576 	else
577 		build_sk_flow_key(fl4, sk);
578 }
579 
580 static DEFINE_SPINLOCK(fnhe_lock);
581 
582 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
583 {
584 	struct rtable *rt;
585 
586 	rt = rcu_dereference(fnhe->fnhe_rth_input);
587 	if (rt) {
588 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
589 		dst_dev_put(&rt->dst);
590 		dst_release(&rt->dst);
591 	}
592 	rt = rcu_dereference(fnhe->fnhe_rth_output);
593 	if (rt) {
594 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
595 		dst_dev_put(&rt->dst);
596 		dst_release(&rt->dst);
597 	}
598 }
599 
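/* Pick the exception with the oldest fnhe_stamp in this bucket, flush the
 * routes cached on it and return it for reuse.  Called with fnhe_lock held
 * on a chain that is known to be non-empty.
 */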
600 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
601 {
602 	struct fib_nh_exception *fnhe, *oldest;
603 
604 	oldest = rcu_dereference(hash->chain);
605 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
606 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
607 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
608 			oldest = fnhe;
609 	}
610 	fnhe_flush_routes(oldest);
611 	return oldest;
612 }
613 
614 static inline u32 fnhe_hashfun(__be32 daddr)
615 {
616 	static u32 fnhe_hashrnd __read_mostly;
617 	u32 hval;
618 
619 	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
620 	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
621 	return hash_32(hval, FNHE_HASH_SHIFT);
622 }
623 
624 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
625 {
626 	rt->rt_pmtu = fnhe->fnhe_pmtu;
627 	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
628 	rt->dst.expires = fnhe->fnhe_expires;
629 
630 	if (fnhe->fnhe_gw) {
631 		rt->rt_flags |= RTCF_REDIRECTED;
632 		rt->rt_gateway = fnhe->fnhe_gw;
633 		rt->rt_uses_gateway = 1;
634 	}
635 }
636 
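/* Record or refresh the exception for @daddr on this nexthop, as learned
 * from an ICMP redirect (@gw) or a PMTU update (@pmtu/@lock).  The chain is
 * kept short by recycling the oldest entry once it grows past
 * FNHE_RECLAIM_DEPTH; when a new entry is created, routes already cached on
 * the nexthop are marked DST_OBSOLETE_KILL so they re-check whether the
 * exception applies to them.
 */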
637 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
638 				  u32 pmtu, bool lock, unsigned long expires)
639 {
640 	struct fnhe_hash_bucket *hash;
641 	struct fib_nh_exception *fnhe;
642 	struct rtable *rt;
643 	u32 genid, hval;
644 	unsigned int i;
645 	int depth;
646 
647 	genid = fnhe_genid(dev_net(nh->fib_nh_dev));
648 	hval = fnhe_hashfun(daddr);
649 
650 	spin_lock_bh(&fnhe_lock);
651 
652 	hash = rcu_dereference(nh->nh_exceptions);
653 	if (!hash) {
654 		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
655 		if (!hash)
656 			goto out_unlock;
657 		rcu_assign_pointer(nh->nh_exceptions, hash);
658 	}
659 
660 	hash += hval;
661 
662 	depth = 0;
663 	for (fnhe = rcu_dereference(hash->chain); fnhe;
664 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
665 		if (fnhe->fnhe_daddr == daddr)
666 			break;
667 		depth++;
668 	}
669 
670 	if (fnhe) {
671 		if (fnhe->fnhe_genid != genid)
672 			fnhe->fnhe_genid = genid;
673 		if (gw)
674 			fnhe->fnhe_gw = gw;
675 		if (pmtu) {
676 			fnhe->fnhe_pmtu = pmtu;
677 			fnhe->fnhe_mtu_locked = lock;
678 		}
679 		fnhe->fnhe_expires = max(1UL, expires);
680 		/* Update all cached dsts too */
681 		rt = rcu_dereference(fnhe->fnhe_rth_input);
682 		if (rt)
683 			fill_route_from_fnhe(rt, fnhe);
684 		rt = rcu_dereference(fnhe->fnhe_rth_output);
685 		if (rt)
686 			fill_route_from_fnhe(rt, fnhe);
687 	} else {
688 		if (depth > FNHE_RECLAIM_DEPTH)
689 			fnhe = fnhe_oldest(hash);
690 		else {
691 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
692 			if (!fnhe)
693 				goto out_unlock;
694 
695 			fnhe->fnhe_next = hash->chain;
696 			rcu_assign_pointer(hash->chain, fnhe);
697 		}
698 		fnhe->fnhe_genid = genid;
699 		fnhe->fnhe_daddr = daddr;
700 		fnhe->fnhe_gw = gw;
701 		fnhe->fnhe_pmtu = pmtu;
702 		fnhe->fnhe_mtu_locked = lock;
703 		fnhe->fnhe_expires = max(1UL, expires);
704 
705 		/* Exception created; mark the cached routes for the nexthop
706 		 * stale, so anyone caching it rechecks if this exception
707 		 * applies to them.
708 		 */
709 		rt = rcu_dereference(nh->nh_rth_input);
710 		if (rt)
711 			rt->dst.obsolete = DST_OBSOLETE_KILL;
712 
713 		for_each_possible_cpu(i) {
714 			struct rtable __rcu **prt;
715 			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
716 			rt = rcu_dereference(*prt);
717 			if (rt)
718 				rt->dst.obsolete = DST_OBSOLETE_KILL;
719 		}
720 	}
721 
722 	fnhe->fnhe_stamp = jiffies;
723 
724 out_unlock:
725 	spin_unlock_bh(&fnhe_lock);
726 }
727 
728 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
729 			     bool kill_route)
730 {
731 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
732 	__be32 old_gw = ip_hdr(skb)->saddr;
733 	struct net_device *dev = skb->dev;
734 	struct in_device *in_dev;
735 	struct fib_result res;
736 	struct neighbour *n;
737 	struct net *net;
738 
739 	switch (icmp_hdr(skb)->code & 7) {
740 	case ICMP_REDIR_NET:
741 	case ICMP_REDIR_NETTOS:
742 	case ICMP_REDIR_HOST:
743 	case ICMP_REDIR_HOSTTOS:
744 		break;
745 
746 	default:
747 		return;
748 	}
749 
750 	if (rt->rt_gateway != old_gw)
751 		return;
752 
753 	in_dev = __in_dev_get_rcu(dev);
754 	if (!in_dev)
755 		return;
756 
757 	net = dev_net(dev);
758 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
759 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
760 	    ipv4_is_zeronet(new_gw))
761 		goto reject_redirect;
762 
763 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
764 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
765 			goto reject_redirect;
766 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
767 			goto reject_redirect;
768 	} else {
769 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
770 			goto reject_redirect;
771 	}
772 
773 	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
774 	if (!n)
775 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
776 	if (!IS_ERR(n)) {
777 		if (!(n->nud_state & NUD_VALID)) {
778 			neigh_event_send(n, NULL);
779 		} else {
780 			if (fib_lookup(net, fl4, &res, 0) == 0) {
781 				struct fib_nh_common *nhc = FIB_RES_NHC(res);
782 				struct fib_nh *nh;
783 
784 				nh = container_of(nhc, struct fib_nh, nh_common);
785 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
786 						0, false,
787 						jiffies + ip_rt_gc_timeout);
788 			}
789 			if (kill_route)
790 				rt->dst.obsolete = DST_OBSOLETE_KILL;
791 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
792 		}
793 		neigh_release(n);
794 	}
795 	return;
796 
797 reject_redirect:
798 #ifdef CONFIG_IP_ROUTE_VERBOSE
799 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
800 		const struct iphdr *iph = (const struct iphdr *) skb->data;
801 		__be32 daddr = iph->daddr;
802 		__be32 saddr = iph->saddr;
803 
804 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
805 				     "  Advised path = %pI4 -> %pI4\n",
806 				     &old_gw, dev->name, &new_gw,
807 				     &saddr, &daddr);
808 	}
809 #endif
810 	;
811 }
812 
813 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
814 {
815 	struct rtable *rt;
816 	struct flowi4 fl4;
817 	const struct iphdr *iph = (const struct iphdr *) skb->data;
818 	struct net *net = dev_net(skb->dev);
819 	int oif = skb->dev->ifindex;
820 	u8 tos = RT_TOS(iph->tos);
821 	u8 prot = iph->protocol;
822 	u32 mark = skb->mark;
823 
824 	rt = (struct rtable *) dst;
825 
826 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
827 	__ip_do_redirect(rt, skb, &fl4, true);
828 }
829 
830 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
831 {
832 	struct rtable *rt = (struct rtable *)dst;
833 	struct dst_entry *ret = dst;
834 
835 	if (rt) {
836 		if (dst->obsolete > 0) {
837 			ip_rt_put(rt);
838 			ret = NULL;
839 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
840 			   rt->dst.expires) {
841 			ip_rt_put(rt);
842 			ret = NULL;
843 		}
844 	}
845 	return ret;
846 }
847 
848 /*
849  * Algorithm:
850  *	1. The first ip_rt_redirect_number redirects are sent
851  *	   with exponential backoff, then we stop sending them altogether,
852  *	   assuming that the host ignores our redirects.
853  *	2. If we did not see packets requiring redirects
854  *	   during ip_rt_redirect_silence, we assume that the host has
855  *	   forgotten the redirected route and start sending redirects again.
856  *
857  * This algorithm is much cheaper and more intelligent than dumb load limiting
858  * in icmp.c.
859  *
860  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
861  * and "frag. need" (breaks PMTU discovery) in icmp.c.
862  */
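/*
 * With the defaults above this means: the first redirect to a peer is sent
 * immediately, each further one only after ip_rt_redirect_load << rate_tokens
 * jiffies (20 ms, doubling with every redirect already sent), and after
 * ip_rt_redirect_number (9) ignored redirects we stay silent until roughly
 * 20 seconds (ip_rt_redirect_silence) pass without further redirect-worthy
 * packets from that peer.
 */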
863 
864 void ip_rt_send_redirect(struct sk_buff *skb)
865 {
866 	struct rtable *rt = skb_rtable(skb);
867 	struct in_device *in_dev;
868 	struct inet_peer *peer;
869 	struct net *net;
870 	int log_martians;
871 	int vif;
872 
873 	rcu_read_lock();
874 	in_dev = __in_dev_get_rcu(rt->dst.dev);
875 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
876 		rcu_read_unlock();
877 		return;
878 	}
879 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
880 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
881 	rcu_read_unlock();
882 
883 	net = dev_net(rt->dst.dev);
884 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
885 	if (!peer) {
886 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
887 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
888 		return;
889 	}
890 
891 	/* No redirected packets during ip_rt_redirect_silence;
892 	 * reset the algorithm.
893 	 */
894 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
895 		peer->rate_tokens = 0;
896 		peer->n_redirects = 0;
897 	}
898 
899 	/* Too many ignored redirects; do not send anything.
900 	 * Set peer->rate_last to the time of the last seen redirected packet.
901 	 */
902 	if (peer->n_redirects >= ip_rt_redirect_number) {
903 		peer->rate_last = jiffies;
904 		goto out_put_peer;
905 	}
906 
907 	/* Check for load limit; set rate_last to the latest sent
908 	 * redirect.
909 	 */
910 	if (peer->rate_tokens == 0 ||
911 	    time_after(jiffies,
912 		       (peer->rate_last +
913 			(ip_rt_redirect_load << peer->rate_tokens)))) {
914 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
915 
916 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
917 		peer->rate_last = jiffies;
918 		++peer->rate_tokens;
919 		++peer->n_redirects;
920 #ifdef CONFIG_IP_ROUTE_VERBOSE
921 		if (log_martians &&
922 		    peer->rate_tokens == ip_rt_redirect_number)
923 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
924 					     &ip_hdr(skb)->saddr, inet_iif(skb),
925 					     &ip_hdr(skb)->daddr, &gw);
926 #endif
927 	}
928 out_put_peer:
929 	inet_putpeer(peer);
930 }
931 
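/* Handle a packet that could not be routed (rt->dst.error is set): bump the
 * relevant SNMP counters and, unless the per-source token bucket says
 * otherwise, answer with an ICMP destination-unreachable of the matching
 * code.  The bucket gains one token per jiffy up to ip_rt_error_burst
 * (5 * HZ) and each ICMP error costs ip_rt_error_cost (HZ), i.e. roughly one
 * error per second sustained, with bursts of up to five.
 */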
932 static int ip_error(struct sk_buff *skb)
933 {
934 	struct rtable *rt = skb_rtable(skb);
935 	struct net_device *dev = skb->dev;
936 	struct in_device *in_dev;
937 	struct inet_peer *peer;
938 	unsigned long now;
939 	struct net *net;
940 	bool send;
941 	int code;
942 
943 	if (netif_is_l3_master(skb->dev)) {
944 		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
945 		if (!dev)
946 			goto out;
947 	}
948 
949 	in_dev = __in_dev_get_rcu(dev);
950 
951 	/* IP on this device is disabled. */
952 	if (!in_dev)
953 		goto out;
954 
955 	net = dev_net(rt->dst.dev);
956 	if (!IN_DEV_FORWARD(in_dev)) {
957 		switch (rt->dst.error) {
958 		case EHOSTUNREACH:
959 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
960 			break;
961 
962 		case ENETUNREACH:
963 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
964 			break;
965 		}
966 		goto out;
967 	}
968 
969 	switch (rt->dst.error) {
970 	case EINVAL:
971 	default:
972 		goto out;
973 	case EHOSTUNREACH:
974 		code = ICMP_HOST_UNREACH;
975 		break;
976 	case ENETUNREACH:
977 		code = ICMP_NET_UNREACH;
978 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
979 		break;
980 	case EACCES:
981 		code = ICMP_PKT_FILTERED;
982 		break;
983 	}
984 
985 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
986 			       l3mdev_master_ifindex(skb->dev), 1);
987 
988 	send = true;
989 	if (peer) {
990 		now = jiffies;
991 		peer->rate_tokens += now - peer->rate_last;
992 		if (peer->rate_tokens > ip_rt_error_burst)
993 			peer->rate_tokens = ip_rt_error_burst;
994 		peer->rate_last = now;
995 		if (peer->rate_tokens >= ip_rt_error_cost)
996 			peer->rate_tokens -= ip_rt_error_cost;
997 		else
998 			send = false;
999 		inet_putpeer(peer);
1000 	}
1001 	if (send)
1002 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1003 
1004 out:	kfree_skb(skb);
1005 	return 0;
1006 }
1007 
1008 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1009 {
1010 	struct dst_entry *dst = &rt->dst;
1011 	u32 old_mtu = ipv4_mtu(dst);
1012 	struct fib_result res;
1013 	bool lock = false;
1014 
1015 	if (ip_mtu_locked(dst))
1016 		return;
1017 
1018 	if (old_mtu < mtu)
1019 		return;
1020 
1021 	if (mtu < ip_rt_min_pmtu) {
1022 		lock = true;
1023 		mtu = min(old_mtu, ip_rt_min_pmtu);
1024 	}
1025 
1026 	if (rt->rt_pmtu == mtu && !lock &&
1027 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1028 		return;
1029 
1030 	rcu_read_lock();
1031 	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1032 		struct fib_nh_common *nhc = FIB_RES_NHC(res);
1033 		struct fib_nh *nh;
1034 
1035 		nh = container_of(nhc, struct fib_nh, nh_common);
1036 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1037 				      jiffies + ip_rt_mtu_expires);
1038 	}
1039 	rcu_read_unlock();
1040 }
1041 
1042 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1043 			      struct sk_buff *skb, u32 mtu)
1044 {
1045 	struct rtable *rt = (struct rtable *) dst;
1046 	struct flowi4 fl4;
1047 
1048 	ip_rt_build_flow_key(&fl4, sk, skb);
1049 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1050 }
1051 
1052 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1053 		      int oif, u8 protocol)
1054 {
1055 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1056 	struct flowi4 fl4;
1057 	struct rtable *rt;
1058 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1059 
1060 	__build_flow_key(net, &fl4, NULL, iph, oif,
1061 			 RT_TOS(iph->tos), protocol, mark, 0);
1062 	rt = __ip_route_output_key(net, &fl4);
1063 	if (!IS_ERR(rt)) {
1064 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1065 		ip_rt_put(rt);
1066 	}
1067 }
1068 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1069 
1070 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1071 {
1072 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1073 	struct flowi4 fl4;
1074 	struct rtable *rt;
1075 
1076 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1077 
1078 	if (!fl4.flowi4_mark)
1079 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1080 
1081 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1082 	if (!IS_ERR(rt)) {
1083 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1084 		ip_rt_put(rt);
1085 	}
1086 }
1087 
1088 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1089 {
1090 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1091 	struct flowi4 fl4;
1092 	struct rtable *rt;
1093 	struct dst_entry *odst = NULL;
1094 	bool new = false;
1095 	struct net *net = sock_net(sk);
1096 
1097 	bh_lock_sock(sk);
1098 
1099 	if (!ip_sk_accept_pmtu(sk))
1100 		goto out;
1101 
1102 	odst = sk_dst_get(sk);
1103 
1104 	if (sock_owned_by_user(sk) || !odst) {
1105 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1106 		goto out;
1107 	}
1108 
1109 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1110 
1111 	rt = (struct rtable *)odst;
1112 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1113 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1114 		if (IS_ERR(rt))
1115 			goto out;
1116 
1117 		new = true;
1118 	}
1119 
1120 	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1121 
1122 	if (!dst_check(&rt->dst, 0)) {
1123 		if (new)
1124 			dst_release(&rt->dst);
1125 
1126 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1127 		if (IS_ERR(rt))
1128 			goto out;
1129 
1130 		new = true;
1131 	}
1132 
1133 	if (new)
1134 		sk_dst_set(sk, &rt->dst);
1135 
1136 out:
1137 	bh_unlock_sock(sk);
1138 	dst_release(odst);
1139 }
1140 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1141 
1142 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1143 		   int oif, u8 protocol)
1144 {
1145 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1146 	struct flowi4 fl4;
1147 	struct rtable *rt;
1148 
1149 	__build_flow_key(net, &fl4, NULL, iph, oif,
1150 			 RT_TOS(iph->tos), protocol, 0, 0);
1151 	rt = __ip_route_output_key(net, &fl4);
1152 	if (!IS_ERR(rt)) {
1153 		__ip_do_redirect(rt, skb, &fl4, false);
1154 		ip_rt_put(rt);
1155 	}
1156 }
1157 EXPORT_SYMBOL_GPL(ipv4_redirect);
1158 
1159 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1160 {
1161 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1162 	struct flowi4 fl4;
1163 	struct rtable *rt;
1164 	struct net *net = sock_net(sk);
1165 
1166 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1167 	rt = __ip_route_output_key(net, &fl4);
1168 	if (!IS_ERR(rt)) {
1169 		__ip_do_redirect(rt, skb, &fl4, false);
1170 		ip_rt_put(rt);
1171 	}
1172 }
1173 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1174 
1175 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1176 {
1177 	struct rtable *rt = (struct rtable *) dst;
1178 
1179 	/* All IPV4 dsts are created with ->obsolete set to the value
1180 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1181 	 * into this function always.
1182 	 *
1183 	 * When a PMTU/redirect information update invalidates a route,
1184 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1185 	 * DST_OBSOLETE_DEAD.
1186 	 */
1187 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1188 		return NULL;
1189 	return dst;
1190 }
1191 
1192 static void ipv4_link_failure(struct sk_buff *skb)
1193 {
1194 	struct rtable *rt;
1195 
1196 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1197 
1198 	rt = skb_rtable(skb);
1199 	if (rt)
1200 		dst_set_expires(&rt->dst, 0);
1201 }
1202 
1203 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1204 {
1205 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1206 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1207 		 skb->dev ? skb->dev->name : "?");
1208 	kfree_skb(skb);
1209 	WARN_ON(1);
1210 	return 0;
1211 }
1212 
1213 /*
1214    We do not cache the source address of the outgoing interface,
1215    because it is used only by the IP RR, TS and SRR options,
1216    so it is out of the fast path.
1217 
1218    BTW remember: "addr" is allowed to be unaligned
1219    in IP options!
1220  */
1221 
1222 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1223 {
1224 	__be32 src;
1225 
1226 	if (rt_is_output_route(rt))
1227 		src = ip_hdr(skb)->saddr;
1228 	else {
1229 		struct fib_result res;
1230 		struct iphdr *iph = ip_hdr(skb);
1231 		struct flowi4 fl4 = {
1232 			.daddr = iph->daddr,
1233 			.saddr = iph->saddr,
1234 			.flowi4_tos = RT_TOS(iph->tos),
1235 			.flowi4_oif = rt->dst.dev->ifindex,
1236 			.flowi4_iif = skb->dev->ifindex,
1237 			.flowi4_mark = skb->mark,
1238 		};
1239 
1240 		rcu_read_lock();
1241 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1242 			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1243 		else
1244 			src = inet_select_addr(rt->dst.dev,
1245 					       rt_nexthop(rt, iph->daddr),
1246 					       RT_SCOPE_UNIVERSE);
1247 		rcu_read_unlock();
1248 	}
1249 	memcpy(addr, &src, 4);
1250 }
1251 
1252 #ifdef CONFIG_IP_ROUTE_CLASSID
1253 static void set_class_tag(struct rtable *rt, u32 tag)
1254 {
1255 	if (!(rt->dst.tclassid & 0xFFFF))
1256 		rt->dst.tclassid |= tag & 0xFFFF;
1257 	if (!(rt->dst.tclassid & 0xFFFF0000))
1258 		rt->dst.tclassid |= tag & 0xFFFF0000;
1259 }
1260 #endif
1261 
1262 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1263 {
1264 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1265 	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1266 				    ip_rt_min_advmss);
1267 
1268 	return min(advmss, IPV4_MAX_PMTU - header_size);
1269 }
1270 
1271 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1272 {
1273 	const struct rtable *rt = (const struct rtable *) dst;
1274 	unsigned int mtu = rt->rt_pmtu;
1275 
1276 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1277 		mtu = dst_metric_raw(dst, RTAX_MTU);
1278 
1279 	if (mtu)
1280 		return mtu;
1281 
1282 	mtu = READ_ONCE(dst->dev->mtu);
1283 
1284 	if (unlikely(ip_mtu_locked(dst))) {
1285 		if (rt->rt_uses_gateway && mtu > 576)
1286 			mtu = 576;
1287 	}
1288 
1289 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1290 
1291 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1292 }
1293 
1294 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1295 {
1296 	struct fnhe_hash_bucket *hash;
1297 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1298 	u32 hval = fnhe_hashfun(daddr);
1299 
1300 	spin_lock_bh(&fnhe_lock);
1301 
1302 	hash = rcu_dereference_protected(nh->nh_exceptions,
1303 					 lockdep_is_held(&fnhe_lock));
1304 	hash += hval;
1305 
1306 	fnhe_p = &hash->chain;
1307 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1308 	while (fnhe) {
1309 		if (fnhe->fnhe_daddr == daddr) {
1310 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1311 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1312 			/* set fnhe_daddr to 0 to ensure it won't bind with
1313 			 * new dsts in rt_bind_exception().
1314 			 */
1315 			fnhe->fnhe_daddr = 0;
1316 			fnhe_flush_routes(fnhe);
1317 			kfree_rcu(fnhe, rcu);
1318 			break;
1319 		}
1320 		fnhe_p = &fnhe->fnhe_next;
1321 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1322 						 lockdep_is_held(&fnhe_lock));
1323 	}
1324 
1325 	spin_unlock_bh(&fnhe_lock);
1326 }
1327 
1328 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1329 {
1330 	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1331 	struct fib_nh_exception *fnhe;
1332 	u32 hval;
1333 
1334 	if (!hash)
1335 		return NULL;
1336 
1337 	hval = fnhe_hashfun(daddr);
1338 
1339 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1340 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1341 		if (fnhe->fnhe_daddr == daddr) {
1342 			if (fnhe->fnhe_expires &&
1343 			    time_after(jiffies, fnhe->fnhe_expires)) {
1344 				ip_del_fnhe(nh, daddr);
1345 				break;
1346 			}
1347 			return fnhe;
1348 		}
1349 	}
1350 	return NULL;
1351 }
1352 
1353 /* MTU selection:
1354  * 1. mtu on route is locked - use it
1355  * 2. mtu from nexthop exception
1356  * 3. mtu from egress device
1357  */
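/*
 * For example, with sysctl_ip_fwd_use_pmtu off and no locked MTU metric, a
 * 1280-byte PMTU recorded in an unexpired nexthop exception wins over a
 * 1500-byte device MTU; with no exception the device MTU is used, clamped
 * to IP_MAX_MTU and reduced by any lwtunnel encapsulation headroom.
 */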
1358 
1359 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1360 {
1361 	struct fib_nh_common *nhc = res->nhc;
1362 	struct net_device *dev = nhc->nhc_dev;
1363 	struct fib_info *fi = res->fi;
1364 	u32 mtu = 0;
1365 
1366 	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1367 	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1368 		mtu = fi->fib_mtu;
1369 
1370 	if (likely(!mtu)) {
1371 		struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
1372 		struct fib_nh_exception *fnhe;
1373 
1374 		fnhe = find_exception(nh, daddr);
1375 		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1376 			mtu = fnhe->fnhe_pmtu;
1377 	}
1378 
1379 	if (likely(!mtu))
1380 		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1381 
1382 	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1383 }
1384 
1385 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1386 			      __be32 daddr, const bool do_cache)
1387 {
1388 	bool ret = false;
1389 
1390 	spin_lock_bh(&fnhe_lock);
1391 
1392 	if (daddr == fnhe->fnhe_daddr) {
1393 		struct rtable __rcu **porig;
1394 		struct rtable *orig;
1395 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1396 
1397 		if (rt_is_input_route(rt))
1398 			porig = &fnhe->fnhe_rth_input;
1399 		else
1400 			porig = &fnhe->fnhe_rth_output;
1401 		orig = rcu_dereference(*porig);
1402 
1403 		if (fnhe->fnhe_genid != genid) {
1404 			fnhe->fnhe_genid = genid;
1405 			fnhe->fnhe_gw = 0;
1406 			fnhe->fnhe_pmtu = 0;
1407 			fnhe->fnhe_expires = 0;
1408 			fnhe->fnhe_mtu_locked = false;
1409 			fnhe_flush_routes(fnhe);
1410 			orig = NULL;
1411 		}
1412 		fill_route_from_fnhe(rt, fnhe);
1413 		if (!rt->rt_gateway)
1414 			rt->rt_gateway = daddr;
1415 
1416 		if (do_cache) {
1417 			dst_hold(&rt->dst);
1418 			rcu_assign_pointer(*porig, rt);
1419 			if (orig) {
1420 				dst_dev_put(&orig->dst);
1421 				dst_release(&orig->dst);
1422 			}
1423 			ret = true;
1424 		}
1425 
1426 		fnhe->fnhe_stamp = jiffies;
1427 	}
1428 	spin_unlock_bh(&fnhe_lock);
1429 
1430 	return ret;
1431 }
1432 
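/* Try to park @rt in the nexthop's cache: the shared input slot for input
 * routes, this CPU's output slot otherwise.  A reference is taken before the
 * cmpxchg(); if another CPU wins the race the reference is dropped again and
 * false is returned, so the caller falls back to the uncached list.
 */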
1433 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1434 {
1435 	struct rtable *orig, *prev, **p;
1436 	bool ret = true;
1437 
1438 	if (rt_is_input_route(rt)) {
1439 		p = (struct rtable **)&nh->nh_rth_input;
1440 	} else {
1441 		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1442 	}
1443 	orig = *p;
1444 
1445 	/* hold dst before doing cmpxchg() to avoid race condition
1446 	 * on this dst
1447 	 */
1448 	dst_hold(&rt->dst);
1449 	prev = cmpxchg(p, orig, rt);
1450 	if (prev == orig) {
1451 		if (orig) {
1452 			dst_dev_put(&orig->dst);
1453 			dst_release(&orig->dst);
1454 		}
1455 	} else {
1456 		dst_release(&rt->dst);
1457 		ret = false;
1458 	}
1459 
1460 	return ret;
1461 }
1462 
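/* Routes that could not be cached on a FIB nexthop are strung on a per-CPU
 * list instead, so that rt_flush_dev() can still find them and point them at
 * the loopback device when their original device goes away.
 */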
1463 struct uncached_list {
1464 	spinlock_t		lock;
1465 	struct list_head	head;
1466 };
1467 
1468 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1469 
1470 void rt_add_uncached_list(struct rtable *rt)
1471 {
1472 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1473 
1474 	rt->rt_uncached_list = ul;
1475 
1476 	spin_lock_bh(&ul->lock);
1477 	list_add_tail(&rt->rt_uncached, &ul->head);
1478 	spin_unlock_bh(&ul->lock);
1479 }
1480 
1481 void rt_del_uncached_list(struct rtable *rt)
1482 {
1483 	if (!list_empty(&rt->rt_uncached)) {
1484 		struct uncached_list *ul = rt->rt_uncached_list;
1485 
1486 		spin_lock_bh(&ul->lock);
1487 		list_del(&rt->rt_uncached);
1488 		spin_unlock_bh(&ul->lock);
1489 	}
1490 }
1491 
1492 static void ipv4_dst_destroy(struct dst_entry *dst)
1493 {
1494 	struct rtable *rt = (struct rtable *)dst;
1495 
1496 	ip_dst_metrics_put(dst);
1497 	rt_del_uncached_list(rt);
1498 }
1499 
1500 void rt_flush_dev(struct net_device *dev)
1501 {
1502 	struct net *net = dev_net(dev);
1503 	struct rtable *rt;
1504 	int cpu;
1505 
1506 	for_each_possible_cpu(cpu) {
1507 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1508 
1509 		spin_lock_bh(&ul->lock);
1510 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1511 			if (rt->dst.dev != dev)
1512 				continue;
1513 			rt->dst.dev = net->loopback_dev;
1514 			dev_hold(rt->dst.dev);
1515 			dev_put(dev);
1516 		}
1517 		spin_unlock_bh(&ul->lock);
1518 	}
1519 }
1520 
1521 static bool rt_cache_valid(const struct rtable *rt)
1522 {
1523 	return	rt &&
1524 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1525 		!rt_is_expired(rt);
1526 }
1527 
1528 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1529 			   const struct fib_result *res,
1530 			   struct fib_nh_exception *fnhe,
1531 			   struct fib_info *fi, u16 type, u32 itag,
1532 			   const bool do_cache)
1533 {
1534 	bool cached = false;
1535 
1536 	if (fi) {
1537 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1538 		struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
1539 
1540 		if (nh->fib_nh_gw4 && nh->fib_nh_scope == RT_SCOPE_LINK) {
1541 			rt->rt_gateway = nh->fib_nh_gw4;
1542 			rt->rt_uses_gateway = 1;
1543 		}
1544 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1545 
1546 #ifdef CONFIG_IP_ROUTE_CLASSID
1547 		rt->dst.tclassid = nh->nh_tclassid;
1548 #endif
1549 		rt->dst.lwtstate = lwtstate_get(nh->fib_nh_lws);
1550 		if (unlikely(fnhe))
1551 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1552 		else if (do_cache)
1553 			cached = rt_cache_route(nh, rt);
1554 		if (unlikely(!cached)) {
1555 			/* Routes we intend to cache in the nexthop exception or
1556 			 * FIB nexthop are pinned by those caches.  However, if we
1557 			 * are unsuccessful at storing this route there, fall back to
1558 			 * setting rt_gateway and tracking it on the uncached list.
1559 			 */
1560 			if (!rt->rt_gateway)
1561 				rt->rt_gateway = daddr;
1562 			rt_add_uncached_list(rt);
1563 		}
1564 	} else
1565 		rt_add_uncached_list(rt);
1566 
1567 #ifdef CONFIG_IP_ROUTE_CLASSID
1568 #ifdef CONFIG_IP_MULTIPLE_TABLES
1569 	set_class_tag(rt, res->tclassid);
1570 #endif
1571 	set_class_tag(rt, itag);
1572 #endif
1573 }
1574 
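/* Allocate and minimally initialise an rtable.  Every IPv4 dst is created
 * with ->obsolete set to DST_OBSOLETE_FORCE_CHK so that ipv4_dst_check() is
 * consulted on every use; ->output (and ->input for local routes) is set
 * here, the remaining fields are filled in by the callers.
 */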
1575 struct rtable *rt_dst_alloc(struct net_device *dev,
1576 			    unsigned int flags, u16 type,
1577 			    bool nopolicy, bool noxfrm, bool will_cache)
1578 {
1579 	struct rtable *rt;
1580 
1581 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1582 		       (will_cache ? 0 : DST_HOST) |
1583 		       (nopolicy ? DST_NOPOLICY : 0) |
1584 		       (noxfrm ? DST_NOXFRM : 0));
1585 
1586 	if (rt) {
1587 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1588 		rt->rt_flags = flags;
1589 		rt->rt_type = type;
1590 		rt->rt_is_input = 0;
1591 		rt->rt_iif = 0;
1592 		rt->rt_pmtu = 0;
1593 		rt->rt_mtu_locked = 0;
1594 		rt->rt_gateway = 0;
1595 		rt->rt_uses_gateway = 0;
1596 		INIT_LIST_HEAD(&rt->rt_uncached);
1597 
1598 		rt->dst.output = ip_output;
1599 		if (flags & RTCF_LOCAL)
1600 			rt->dst.input = ip_local_deliver;
1601 	}
1602 
1603 	return rt;
1604 }
1605 EXPORT_SYMBOL(rt_dst_alloc);
1606 
1607 /* called in rcu_read_lock() section */
1608 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1609 			  u8 tos, struct net_device *dev,
1610 			  struct in_device *in_dev, u32 *itag)
1611 {
1612 	int err;
1613 
1614 	/* Primary sanity checks. */
1615 	if (!in_dev)
1616 		return -EINVAL;
1617 
1618 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1619 	    skb->protocol != htons(ETH_P_IP))
1620 		return -EINVAL;
1621 
1622 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1623 		return -EINVAL;
1624 
1625 	if (ipv4_is_zeronet(saddr)) {
1626 		if (!ipv4_is_local_multicast(daddr) &&
1627 		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1628 			return -EINVAL;
1629 	} else {
1630 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1631 					  in_dev, itag);
1632 		if (err < 0)
1633 			return err;
1634 	}
1635 	return 0;
1636 }
1637 
1638 /* called in rcu_read_lock() section */
1639 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1640 			     u8 tos, struct net_device *dev, int our)
1641 {
1642 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1643 	unsigned int flags = RTCF_MULTICAST;
1644 	struct rtable *rth;
1645 	u32 itag = 0;
1646 	int err;
1647 
1648 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1649 	if (err)
1650 		return err;
1651 
1652 	if (our)
1653 		flags |= RTCF_LOCAL;
1654 
1655 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1656 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1657 	if (!rth)
1658 		return -ENOBUFS;
1659 
1660 #ifdef CONFIG_IP_ROUTE_CLASSID
1661 	rth->dst.tclassid = itag;
1662 #endif
1663 	rth->dst.output = ip_rt_bug;
1664 	rth->rt_is_input = 1;
1665 
1666 #ifdef CONFIG_IP_MROUTE
1667 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1668 		rth->dst.input = ip_mr_input;
1669 #endif
1670 	RT_CACHE_STAT_INC(in_slow_mc);
1671 
1672 	skb_dst_set(skb, &rth->dst);
1673 	return 0;
1674 }
1675 
1676 
1677 static void ip_handle_martian_source(struct net_device *dev,
1678 				     struct in_device *in_dev,
1679 				     struct sk_buff *skb,
1680 				     __be32 daddr,
1681 				     __be32 saddr)
1682 {
1683 	RT_CACHE_STAT_INC(in_martian_src);
1684 #ifdef CONFIG_IP_ROUTE_VERBOSE
1685 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1686 		/*
1687 		 *	RFC 1812 recommendation: if the source is martian,
1688 		 *	the only hint is the MAC header.
1689 		 */
1690 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1691 			&daddr, &saddr, dev->name);
1692 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1693 			print_hex_dump(KERN_WARNING, "ll header: ",
1694 				       DUMP_PREFIX_OFFSET, 16, 1,
1695 				       skb_mac_header(skb),
1696 				       dev->hard_header_len, false);
1697 		}
1698 	}
1699 #endif
1700 }
1701 
1702 /* called in rcu_read_lock() section */
1703 static int __mkroute_input(struct sk_buff *skb,
1704 			   const struct fib_result *res,
1705 			   struct in_device *in_dev,
1706 			   __be32 daddr, __be32 saddr, u32 tos)
1707 {
1708 	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1709 	struct net_device *dev = nhc->nhc_dev;
1710 	struct fib_nh_exception *fnhe;
1711 	struct rtable *rth;
1712 	struct fib_nh *nh;
1713 	int err;
1714 	struct in_device *out_dev;
1715 	bool do_cache;
1716 	u32 itag = 0;
1717 
1718 	/* get a working reference to the output device */
1719 	out_dev = __in_dev_get_rcu(dev);
1720 	if (!out_dev) {
1721 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1722 		return -EINVAL;
1723 	}
1724 
1725 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1726 				  in_dev->dev, in_dev, &itag);
1727 	if (err < 0) {
1728 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1729 					 saddr);
1730 
1731 		goto cleanup;
1732 	}
1733 
1734 	do_cache = res->fi && !itag;
1735 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1736 	    skb->protocol == htons(ETH_P_IP)) {
1737 		__be32 gw = nhc->nhc_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1738 
1739 		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1740 		    inet_addr_onlink(out_dev, saddr, gw))
1741 			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1742 	}
1743 
1744 	if (skb->protocol != htons(ETH_P_IP)) {
1745 		/* Not IP (i.e. ARP). Do not create a route if it is
1746 		 * invalid for proxy ARP. DNAT routes are always valid.
1747 		 *
1748 		 * The proxy ARP feature has been extended to allow ARP
1749 		 * replies back out the same interface, to support
1750 		 * private VLAN switch technologies. See arp.c.
1751 		 */
1752 		if (out_dev == in_dev &&
1753 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1754 			err = -EINVAL;
1755 			goto cleanup;
1756 		}
1757 	}
1758 
1759 	nh = container_of(nhc, struct fib_nh, nh_common);
1760 	fnhe = find_exception(nh, daddr);
1761 	if (do_cache) {
1762 		if (fnhe)
1763 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1764 		else
1765 			rth = rcu_dereference(nh->nh_rth_input);
1766 		if (rt_cache_valid(rth)) {
1767 			skb_dst_set_noref(skb, &rth->dst);
1768 			goto out;
1769 		}
1770 	}
1771 
1772 	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1773 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1774 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1775 	if (!rth) {
1776 		err = -ENOBUFS;
1777 		goto cleanup;
1778 	}
1779 
1780 	rth->rt_is_input = 1;
1781 	RT_CACHE_STAT_INC(in_slow_tot);
1782 
1783 	rth->dst.input = ip_forward;
1784 
1785 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1786 		       do_cache);
1787 	lwtunnel_set_redirect(&rth->dst);
1788 	skb_dst_set(skb, &rth->dst);
1789 out:
1790 	err = 0;
1791  cleanup:
1792 	return err;
1793 }
1794 
1795 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1796 /* To make ICMP packets follow the right flow, the multipath hash is
1797  * calculated from the inner IP addresses.
1798  */
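/*
 * e.g. an ICMP_DEST_UNREACH quotes the offending IP header; by hashing on
 * that inner saddr/daddr the error is load-balanced according to the flow it
 * refers to, not according to the ICMP packet's own outer addresses.
 */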
1799 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1800 				 struct flow_keys *hash_keys)
1801 {
1802 	const struct iphdr *outer_iph = ip_hdr(skb);
1803 	const struct iphdr *key_iph = outer_iph;
1804 	const struct iphdr *inner_iph;
1805 	const struct icmphdr *icmph;
1806 	struct iphdr _inner_iph;
1807 	struct icmphdr _icmph;
1808 
1809 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1810 		goto out;
1811 
1812 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1813 		goto out;
1814 
1815 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1816 				   &_icmph);
1817 	if (!icmph)
1818 		goto out;
1819 
1820 	if (icmph->type != ICMP_DEST_UNREACH &&
1821 	    icmph->type != ICMP_REDIRECT &&
1822 	    icmph->type != ICMP_TIME_EXCEEDED &&
1823 	    icmph->type != ICMP_PARAMETERPROB)
1824 		goto out;
1825 
1826 	inner_iph = skb_header_pointer(skb,
1827 				       outer_iph->ihl * 4 + sizeof(_icmph),
1828 				       sizeof(_inner_iph), &_inner_iph);
1829 	if (!inner_iph)
1830 		goto out;
1831 
1832 	key_iph = inner_iph;
1833 out:
1834 	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1835 	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1836 }
1837 
1838 /* If skb is set it will be used; fl4 may then be NULL. */
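/*
 * The hash policy is per netns (sysctl_fib_multipath_hash_policy): policy 0
 * hashes on the layer-3 source and destination addresses only, while policy
 * 1 hashes on the 5-tuple, preferring an skb-provided L4 hash or the
 * pre-dissected @flkeys when available.  fl4->flowi4_multipath_hash, if
 * non-zero, is folded in on top of the result.
 */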
1839 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1840 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1841 {
1842 	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1843 	struct flow_keys hash_keys;
1844 	u32 mhash;
1845 
1846 	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1847 	case 0:
1848 		memset(&hash_keys, 0, sizeof(hash_keys));
1849 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1850 		if (skb) {
1851 			ip_multipath_l3_keys(skb, &hash_keys);
1852 		} else {
1853 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1854 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1855 		}
1856 		break;
1857 	case 1:
1858 		/* skb is currently provided only when forwarding */
1859 		if (skb) {
1860 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1861 			struct flow_keys keys;
1862 
1863 			/* short-circuit if we already have L4 hash present */
1864 			if (skb->l4_hash)
1865 				return skb_get_hash_raw(skb) >> 1;
1866 
1867 			memset(&hash_keys, 0, sizeof(hash_keys));
1868 
1869 			if (!flkeys) {
1870 				skb_flow_dissect_flow_keys(skb, &keys, flag);
1871 				flkeys = &keys;
1872 			}
1873 
1874 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1875 			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1876 			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1877 			hash_keys.ports.src = flkeys->ports.src;
1878 			hash_keys.ports.dst = flkeys->ports.dst;
1879 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1880 		} else {
1881 			memset(&hash_keys, 0, sizeof(hash_keys));
1882 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1883 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1884 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1885 			hash_keys.ports.src = fl4->fl4_sport;
1886 			hash_keys.ports.dst = fl4->fl4_dport;
1887 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1888 		}
1889 		break;
1890 	}
1891 	mhash = flow_hash_from_keys(&hash_keys);
1892 
1893 	if (multipath_hash)
1894 		mhash = jhash_2words(mhash, multipath_hash, 0);
1895 
1896 	return mhash >> 1;
1897 }
1898 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1899 
1900 static int ip_mkroute_input(struct sk_buff *skb,
1901 			    struct fib_result *res,
1902 			    struct in_device *in_dev,
1903 			    __be32 daddr, __be32 saddr, u32 tos,
1904 			    struct flow_keys *hkeys)
1905 {
1906 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1907 	if (res->fi && res->fi->fib_nhs > 1) {
1908 		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1909 
1910 		fib_select_multipath(res, h);
1911 	}
1912 #endif
1913 
1914 	/* create a routing cache entry */
1915 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1916 }
1917 
1918 /*
1919  *	NOTE. We drop all packets that have a local source
1920  *	address, because every properly looped-back packet
1921  *	must already have the correct destination attached by the output routine.
1922  *
1923  *	Such an approach solves two big problems:
1924  *	1. Non-simplex devices are handled properly.
1925  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1926  *	Called with rcu_read_lock().
1927  */
1928 
1929 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1930 			       u8 tos, struct net_device *dev,
1931 			       struct fib_result *res)
1932 {
1933 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1934 	struct flow_keys *flkeys = NULL, _flkeys;
1935 	struct net    *net = dev_net(dev);
1936 	struct ip_tunnel_info *tun_info;
1937 	int		err = -EINVAL;
1938 	unsigned int	flags = 0;
1939 	u32		itag = 0;
1940 	struct rtable	*rth;
1941 	struct flowi4	fl4;
1942 	bool do_cache;
1943 
1944 	/* IP on this device is disabled. */
1945 
1946 	if (!in_dev)
1947 		goto out;
1948 
1949 	/* Check for the most weird martians, which may not be detected
1950 	   by fib_lookup.
1951 	 */
1952 
1953 	tun_info = skb_tunnel_info(skb);
1954 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1955 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1956 	else
1957 		fl4.flowi4_tun_key.tun_id = 0;
1958 	skb_dst_drop(skb);
1959 
1960 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1961 		goto martian_source;
1962 
1963 	res->fi = NULL;
1964 	res->table = NULL;
1965 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1966 		goto brd_input;
1967 
1968 	/* Accept zero addresses only for limited broadcast;
1969 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1970 	 */
1971 	if (ipv4_is_zeronet(saddr))
1972 		goto martian_source;
1973 
1974 	if (ipv4_is_zeronet(daddr))
1975 		goto martian_destination;
1976 
1977 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1978 	 * more than once, calling it only when daddr and/or saddr is a loopback address.
1979 	 */
1980 	if (ipv4_is_loopback(daddr)) {
1981 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1982 			goto martian_destination;
1983 	} else if (ipv4_is_loopback(saddr)) {
1984 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1985 			goto martian_source;
1986 	}
1987 
1988 	/*
1989 	 *	Now we are ready to route packet.
1990 	 */
1991 	fl4.flowi4_oif = 0;
1992 	fl4.flowi4_iif = dev->ifindex;
1993 	fl4.flowi4_mark = skb->mark;
1994 	fl4.flowi4_tos = tos;
1995 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1996 	fl4.flowi4_flags = 0;
1997 	fl4.daddr = daddr;
1998 	fl4.saddr = saddr;
1999 	fl4.flowi4_uid = sock_net_uid(net, NULL);
2000 
2001 	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2002 		flkeys = &_flkeys;
2003 	} else {
2004 		fl4.flowi4_proto = 0;
2005 		fl4.fl4_sport = 0;
2006 		fl4.fl4_dport = 0;
2007 	}
2008 
2009 	err = fib_lookup(net, &fl4, res, 0);
2010 	if (err != 0) {
2011 		if (!IN_DEV_FORWARD(in_dev))
2012 			err = -EHOSTUNREACH;
2013 		goto no_route;
2014 	}
2015 
2016 	if (res->type == RTN_BROADCAST) {
2017 		if (IN_DEV_BFORWARD(in_dev))
2018 			goto make_route;
2019 		goto brd_input;
2020 	}
2021 
2022 	if (res->type == RTN_LOCAL) {
2023 		err = fib_validate_source(skb, saddr, daddr, tos,
2024 					  0, dev, in_dev, &itag);
2025 		if (err < 0)
2026 			goto martian_source;
2027 		goto local_input;
2028 	}
2029 
2030 	if (!IN_DEV_FORWARD(in_dev)) {
2031 		err = -EHOSTUNREACH;
2032 		goto no_route;
2033 	}
2034 	if (res->type != RTN_UNICAST)
2035 		goto martian_destination;
2036 
2037 make_route:
2038 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2039 out:	return err;
2040 
2041 brd_input:
2042 	if (skb->protocol != htons(ETH_P_IP))
2043 		goto e_inval;
2044 
2045 	if (!ipv4_is_zeronet(saddr)) {
2046 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2047 					  in_dev, &itag);
2048 		if (err < 0)
2049 			goto martian_source;
2050 	}
2051 	flags |= RTCF_BROADCAST;
2052 	res->type = RTN_BROADCAST;
2053 	RT_CACHE_STAT_INC(in_brd);
2054 
2055 local_input:
2056 	do_cache = false;
2057 	if (res->fi) {
2058 		if (!itag) {
2059 			struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2060 			struct fib_nh *nh;
2061 
2062 			nh = container_of(nhc, struct fib_nh, nh_common);
2063 			rth = rcu_dereference(nh->nh_rth_input);
2064 			if (rt_cache_valid(rth)) {
2065 				skb_dst_set_noref(skb, &rth->dst);
2066 				err = 0;
2067 				goto out;
2068 			}
2069 			do_cache = true;
2070 		}
2071 	}
2072 
2073 	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2074 			   flags | RTCF_LOCAL, res->type,
2075 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2076 	if (!rth)
2077 		goto e_nobufs;
2078 
2079 	rth->dst.output = ip_rt_bug;
2080 #ifdef CONFIG_IP_ROUTE_CLASSID
2081 	rth->dst.tclassid = itag;
2082 #endif
2083 	rth->rt_is_input = 1;
2084 
2085 	RT_CACHE_STAT_INC(in_slow_tot);
2086 	if (res->type == RTN_UNREACHABLE) {
2087 		rth->dst.input = ip_error;
2088 		rth->dst.error = -err;
2089 		rth->rt_flags &= ~RTCF_LOCAL;
2090 	}
2091 
2092 	if (do_cache) {
2093 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2094 		struct fib_nh *nh;
2095 
2096 		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2097 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2098 			WARN_ON(rth->dst.input == lwtunnel_input);
2099 			rth->dst.lwtstate->orig_input = rth->dst.input;
2100 			rth->dst.input = lwtunnel_input;
2101 		}
2102 
2103 		nh = container_of(nhc, struct fib_nh, nh_common);
2104 		if (unlikely(!rt_cache_route(nh, rth)))
2105 			rt_add_uncached_list(rth);
2106 	}
2107 	skb_dst_set(skb, &rth->dst);
2108 	err = 0;
2109 	goto out;
2110 
2111 no_route:
2112 	RT_CACHE_STAT_INC(in_no_route);
2113 	res->type = RTN_UNREACHABLE;
2114 	res->fi = NULL;
2115 	res->table = NULL;
2116 	goto local_input;
2117 
2118 	/*
2119 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2120 	 */
2121 martian_destination:
2122 	RT_CACHE_STAT_INC(in_martian_dst);
2123 #ifdef CONFIG_IP_ROUTE_VERBOSE
2124 	if (IN_DEV_LOG_MARTIANS(in_dev))
2125 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2126 				     &daddr, &saddr, dev->name);
2127 #endif
2128 
2129 e_inval:
2130 	err = -EINVAL;
2131 	goto out;
2132 
2133 e_nobufs:
2134 	err = -ENOBUFS;
2135 	goto out;
2136 
2137 martian_source:
2138 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2139 	goto out;
2140 }
2141 
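/* Input route lookup used on the receive path: resolve the route for @skb
 * and attach it as the skb's dst (cached entries are attached without
 * taking a reference, see skb_dst_set_noref() above).  A caller does
 * roughly:
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *
 * (sketch only; see e.g. ip_input.c for the real call sites)
 */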
2142 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2143 			 u8 tos, struct net_device *dev)
2144 {
2145 	struct fib_result res;
2146 	int err;
2147 
2148 	tos &= IPTOS_RT_MASK;
2149 	rcu_read_lock();
2150 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2151 	rcu_read_unlock();
2152 
2153 	return err;
2154 }
2155 EXPORT_SYMBOL(ip_route_input_noref);
2156 
2157 /* called with rcu_read_lock held */
2158 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2159 		       u8 tos, struct net_device *dev, struct fib_result *res)
2160 {
2161 	/* Multicast recognition logic was moved from the route cache to here.
2162 	   The problem was that too many Ethernet cards have broken/missing
2163 	   hardware multicast filters :-( As a result, a host on a multicast
2164 	   network acquires a lot of useless route cache entries, e.g. from
2165 	   SDR messages from all over the world. Now we try to get rid of them.
2166 	   Really, provided the software IP multicast filter is organized
2167 	   reasonably (at least, hashed), this does not result in a slowdown
2168 	   compared with route cache reject entries.
2169 	   Note that multicast routers are not affected, because a
2170 	   route cache entry is created for them eventually.
2171 	 */
2172 	if (ipv4_is_multicast(daddr)) {
2173 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2174 		int our = 0;
2175 		int err = -EINVAL;
2176 
2177 		if (!in_dev)
2178 			return err;
2179 		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2180 				      ip_hdr(skb)->protocol);
2181 
2182 		/* check l3 master if no match yet */
2183 		if (!our && netif_is_l3_slave(dev)) {
2184 			struct in_device *l3_in_dev;
2185 
2186 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2187 			if (l3_in_dev)
2188 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2189 						      ip_hdr(skb)->protocol);
2190 		}
2191 
2192 		if (our
2193 #ifdef CONFIG_IP_MROUTE
2194 			||
2195 		    (!ipv4_is_local_multicast(daddr) &&
2196 		     IN_DEV_MFORWARD(in_dev))
2197 #endif
2198 		   ) {
2199 			err = ip_route_input_mc(skb, daddr, saddr,
2200 						tos, dev, our);
2201 		}
2202 		return err;
2203 	}
2204 
2205 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2206 }
2207 
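/* Build (or fetch from the per-nexthop / exception cache) an output rtable
 * for the flow in @fl4 once the FIB lookup result is in @res.  Broadcast,
 * multicast and zeronet destinations are reclassified here, and caching is
 * skipped in the cases noted in the comments below.
 */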
2208 /* called with rcu_read_lock() */
2209 static struct rtable *__mkroute_output(const struct fib_result *res,
2210 				       const struct flowi4 *fl4, int orig_oif,
2211 				       struct net_device *dev_out,
2212 				       unsigned int flags)
2213 {
2214 	struct fib_info *fi = res->fi;
2215 	struct fib_nh_exception *fnhe;
2216 	struct in_device *in_dev;
2217 	u16 type = res->type;
2218 	struct rtable *rth;
2219 	bool do_cache;
2220 
2221 	in_dev = __in_dev_get_rcu(dev_out);
2222 	if (!in_dev)
2223 		return ERR_PTR(-EINVAL);
2224 
2225 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2226 		if (ipv4_is_loopback(fl4->saddr) &&
2227 		    !(dev_out->flags & IFF_LOOPBACK) &&
2228 		    !netif_is_l3_master(dev_out))
2229 			return ERR_PTR(-EINVAL);
2230 
2231 	if (ipv4_is_lbcast(fl4->daddr))
2232 		type = RTN_BROADCAST;
2233 	else if (ipv4_is_multicast(fl4->daddr))
2234 		type = RTN_MULTICAST;
2235 	else if (ipv4_is_zeronet(fl4->daddr))
2236 		return ERR_PTR(-EINVAL);
2237 
2238 	if (dev_out->flags & IFF_LOOPBACK)
2239 		flags |= RTCF_LOCAL;
2240 
2241 	do_cache = true;
2242 	if (type == RTN_BROADCAST) {
2243 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2244 		fi = NULL;
2245 	} else if (type == RTN_MULTICAST) {
2246 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2247 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2248 				     fl4->flowi4_proto))
2249 			flags &= ~RTCF_LOCAL;
2250 		else
2251 			do_cache = false;
2252 		/* If a multicast route does not exist, use the
2253 		 * default one, but do not use a gateway in this case.
2254 		 * Yes, it is a hack.
2255 		 */
2256 		if (fi && res->prefixlen < 4)
2257 			fi = NULL;
2258 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2259 		   (orig_oif != dev_out->ifindex)) {
2260 		/* For local routes that require a particular output interface
2261 		 * we do not want to cache the result.  Caching the result
2262 		 * causes incorrect behaviour when there are multiple source
2263 		 * addresses on the interface, the end result being that if the
2264 		 * intended recipient is waiting on that interface for the
2265 		 * packet, it won't be received because it will be delivered on
2266 		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2267 		 * be set to the loopback interface as well.
2268 		 */
2269 		do_cache = false;
2270 	}
2271 
2272 	fnhe = NULL;
2273 	do_cache &= fi != NULL;
2274 	if (fi) {
2275 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2276 		struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
2277 		struct rtable __rcu **prth;
2278 
2279 		fnhe = find_exception(nh, fl4->daddr);
2280 		if (!do_cache)
2281 			goto add;
2282 		if (fnhe) {
2283 			prth = &fnhe->fnhe_rth_output;
2284 		} else {
2285 			if (unlikely(fl4->flowi4_flags &
2286 				     FLOWI_FLAG_KNOWN_NH &&
2287 				     !(nhc->nhc_has_gw &&
2288 				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2289 				do_cache = false;
2290 				goto add;
2291 			}
2292 			prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2293 		}
2294 		rth = rcu_dereference(*prth);
2295 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2296 			return rth;
2297 	}
2298 
2299 add:
2300 	rth = rt_dst_alloc(dev_out, flags, type,
2301 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2302 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2303 			   do_cache);
2304 	if (!rth)
2305 		return ERR_PTR(-ENOBUFS);
2306 
2307 	rth->rt_iif = orig_oif;
2308 
2309 	RT_CACHE_STAT_INC(out_slow_tot);
2310 
2311 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2312 		if (flags & RTCF_LOCAL &&
2313 		    !(dev_out->flags & IFF_LOOPBACK)) {
2314 			rth->dst.output = ip_mc_output;
2315 			RT_CACHE_STAT_INC(out_slow_mc);
2316 		}
2317 #ifdef CONFIG_IP_MROUTE
2318 		if (type == RTN_MULTICAST) {
2319 			if (IN_DEV_MFORWARD(in_dev) &&
2320 			    !ipv4_is_local_multicast(fl4->daddr)) {
2321 				rth->dst.input = ip_mr_input;
2322 				rth->dst.output = ip_mc_output;
2323 			}
2324 		}
2325 #endif
2326 	}
2327 
2328 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2329 	lwtunnel_set_redirect(&rth->dst);
2330 
2331 	return rth;
2332 }
2333 
2334 /*
2335  * Major route resolver routine.
2336  */
2337 
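/* Wrapper around ip_route_output_key_hash_rcu(): normalize the tos bits,
 * map RTO_ONLINK to link scope, force the iif to loopback and run the
 * lookup under rcu_read_lock().
 */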
2338 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2339 					const struct sk_buff *skb)
2340 {
2341 	__u8 tos = RT_FL_TOS(fl4);
2342 	struct fib_result res = {
2343 		.type		= RTN_UNSPEC,
2344 		.fi		= NULL,
2345 		.table		= NULL,
2346 		.tclassid	= 0,
2347 	};
2348 	struct rtable *rth;
2349 
2350 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2351 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2352 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2353 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2354 
2355 	rcu_read_lock();
2356 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2357 	rcu_read_unlock();
2358 
2359 	return rth;
2360 }
2361 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2362 
2363 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2364 					    struct fib_result *res,
2365 					    const struct sk_buff *skb)
2366 {
2367 	struct net_device *dev_out = NULL;
2368 	int orig_oif = fl4->flowi4_oif;
2369 	unsigned int flags = 0;
2370 	struct rtable *rth;
2371 	int err = -ENETUNREACH;
2372 
2373 	if (fl4->saddr) {
2374 		rth = ERR_PTR(-EINVAL);
2375 		if (ipv4_is_multicast(fl4->saddr) ||
2376 		    ipv4_is_lbcast(fl4->saddr) ||
2377 		    ipv4_is_zeronet(fl4->saddr))
2378 			goto out;
2379 
2380 		/* I removed check for oif == dev_out->oif here.
2381 		   It was wrong for two reasons:
2382 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2383 		      is assigned to multiple interfaces.
2384 		   2. Moreover, we are allowed to send packets with saddr
2385 		      of another iface. --ANK
2386 		 */
2387 
2388 		if (fl4->flowi4_oif == 0 &&
2389 		    (ipv4_is_multicast(fl4->daddr) ||
2390 		     ipv4_is_lbcast(fl4->daddr))) {
2391 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2392 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2393 			if (!dev_out)
2394 				goto out;
2395 
2396 			/* Special hack: the user can direct multicasts
2397 			   and limited broadcasts via the necessary interface
2398 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2399 			   This hack is not just for fun, it allows
2400 			   vic, vat and friends to work.
2401 			   They bind the socket to loopback, set the ttl to zero
2402 			   and expect that it will work.
2403 			   From the viewpoint of the routing cache they are broken,
2404 			   because we are not allowed to build a multicast path
2405 			   with a loopback source addr (the routing cache
2406 			   cannot know that the ttl is zero, so the packet
2407 			   will not leave this host and the route is valid).
2408 			   Luckily, this hack is a good workaround.
2409 			 */
2410 
2411 			fl4->flowi4_oif = dev_out->ifindex;
2412 			goto make_route;
2413 		}
2414 
2415 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2416 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2417 			if (!__ip_dev_find(net, fl4->saddr, false))
2418 				goto out;
2419 		}
2420 	}
2421 
2422 
2423 	if (fl4->flowi4_oif) {
2424 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2425 		rth = ERR_PTR(-ENODEV);
2426 		if (!dev_out)
2427 			goto out;
2428 
2429 		/* RACE: Check return value of inet_select_addr instead. */
2430 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2431 			rth = ERR_PTR(-ENETUNREACH);
2432 			goto out;
2433 		}
2434 		if (ipv4_is_local_multicast(fl4->daddr) ||
2435 		    ipv4_is_lbcast(fl4->daddr) ||
2436 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2437 			if (!fl4->saddr)
2438 				fl4->saddr = inet_select_addr(dev_out, 0,
2439 							      RT_SCOPE_LINK);
2440 			goto make_route;
2441 		}
2442 		if (!fl4->saddr) {
2443 			if (ipv4_is_multicast(fl4->daddr))
2444 				fl4->saddr = inet_select_addr(dev_out, 0,
2445 							      fl4->flowi4_scope);
2446 			else if (!fl4->daddr)
2447 				fl4->saddr = inet_select_addr(dev_out, 0,
2448 							      RT_SCOPE_HOST);
2449 		}
2450 	}
2451 
2452 	if (!fl4->daddr) {
2453 		fl4->daddr = fl4->saddr;
2454 		if (!fl4->daddr)
2455 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2456 		dev_out = net->loopback_dev;
2457 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2458 		res->type = RTN_LOCAL;
2459 		flags |= RTCF_LOCAL;
2460 		goto make_route;
2461 	}
2462 
2463 	err = fib_lookup(net, fl4, res, 0);
2464 	if (err) {
2465 		res->fi = NULL;
2466 		res->table = NULL;
2467 		if (fl4->flowi4_oif &&
2468 		    (ipv4_is_multicast(fl4->daddr) ||
2469 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2470 			/* Apparently, the routing tables are wrong. Assume
2471 			   that the destination is on-link.
2472 
2473 			   WHY? DW.
2474 			   Because we are allowed to send to an iface
2475 			   even if it has NO routes and NO assigned
2476 			   addresses. When oif is specified, the routing
2477 			   tables are looked up with only one purpose:
2478 			   to catch whether the destination is gatewayed rather
2479 			   than direct. Moreover, if MSG_DONTROUTE is set,
2480 			   we send the packet, ignoring both routing tables
2481 			   and ifaddr state. --ANK
2482 
2483 
2484 			   We could do this even if oif is unknown,
2485 			   likely as IPv6 does, but we do not.
2486 			 */
2487 
2488 			if (fl4->saddr == 0)
2489 				fl4->saddr = inet_select_addr(dev_out, 0,
2490 							      RT_SCOPE_LINK);
2491 			res->type = RTN_UNICAST;
2492 			goto make_route;
2493 		}
2494 		rth = ERR_PTR(err);
2495 		goto out;
2496 	}
2497 
2498 	if (res->type == RTN_LOCAL) {
2499 		if (!fl4->saddr) {
2500 			if (res->fi->fib_prefsrc)
2501 				fl4->saddr = res->fi->fib_prefsrc;
2502 			else
2503 				fl4->saddr = fl4->daddr;
2504 		}
2505 
2506 		/* L3 master device is the loopback for that domain */
2507 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2508 			net->loopback_dev;
2509 
2510 		/* make sure orig_oif points to fib result device even
2511 		 * though packet rx/tx happens over loopback or l3mdev
2512 		 */
2513 		orig_oif = FIB_RES_OIF(*res);
2514 
2515 		fl4->flowi4_oif = dev_out->ifindex;
2516 		flags |= RTCF_LOCAL;
2517 		goto make_route;
2518 	}
2519 
2520 	fib_select_path(net, res, fl4, skb);
2521 
2522 	dev_out = FIB_RES_DEV(*res);
2523 	fl4->flowi4_oif = dev_out->ifindex;
2524 
2525 
2526 make_route:
2527 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2528 
2529 out:
2530 	return rth;
2531 }
2532 
2533 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2534 {
2535 	return NULL;
2536 }
2537 
2538 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2539 {
2540 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2541 
2542 	return mtu ? : dst->dev->mtu;
2543 }
2544 
2545 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2546 					  struct sk_buff *skb, u32 mtu)
2547 {
2548 }
2549 
2550 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2551 				       struct sk_buff *skb)
2552 {
2553 }
2554 
2555 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2556 					  unsigned long old)
2557 {
2558 	return NULL;
2559 }
2560 
2561 static struct dst_ops ipv4_dst_blackhole_ops = {
2562 	.family			=	AF_INET,
2563 	.check			=	ipv4_blackhole_dst_check,
2564 	.mtu			=	ipv4_blackhole_mtu,
2565 	.default_advmss		=	ipv4_default_advmss,
2566 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2567 	.redirect		=	ipv4_rt_blackhole_redirect,
2568 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2569 	.neigh_lookup		=	ipv4_neigh_lookup,
2570 };
2571 
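/* Copy @dst_orig into a route backed by ipv4_dst_blackhole_ops: input and
 * output discard packets, and PMTU/redirect updates are no-ops (see the
 * empty handlers above).
 */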
2572 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2573 {
2574 	struct rtable *ort = (struct rtable *) dst_orig;
2575 	struct rtable *rt;
2576 
2577 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2578 	if (rt) {
2579 		struct dst_entry *new = &rt->dst;
2580 
2581 		new->__use = 1;
2582 		new->input = dst_discard;
2583 		new->output = dst_discard_out;
2584 
2585 		new->dev = net->loopback_dev;
2586 		if (new->dev)
2587 			dev_hold(new->dev);
2588 
2589 		rt->rt_is_input = ort->rt_is_input;
2590 		rt->rt_iif = ort->rt_iif;
2591 		rt->rt_pmtu = ort->rt_pmtu;
2592 		rt->rt_mtu_locked = ort->rt_mtu_locked;
2593 
2594 		rt->rt_genid = rt_genid_ipv4(net);
2595 		rt->rt_flags = ort->rt_flags;
2596 		rt->rt_type = ort->rt_type;
2597 		rt->rt_gateway = ort->rt_gateway;
2598 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2599 
2600 		INIT_LIST_HEAD(&rt->rt_uncached);
2601 	}
2602 
2603 	dst_release(dst_orig);
2604 
2605 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2606 }
2607 
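/* Output route lookup for a flow, passing the result through
 * xfrm_lookup_route() when a transport protocol is set.  A caller might do
 * roughly:
 *
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *
 * (sketch only; connect()-style paths are typical users)
 */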
2608 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2609 				    const struct sock *sk)
2610 {
2611 	struct rtable *rt = __ip_route_output_key(net, flp4);
2612 
2613 	if (IS_ERR(rt))
2614 		return rt;
2615 
2616 	if (flp4->flowi4_proto)
2617 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2618 							flowi4_to_flowi(flp4),
2619 							sk, 0);
2620 
2621 	return rt;
2622 }
2623 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2624 
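/* Fill an RTM_NEWROUTE netlink message describing @rt for an RTM_GETROUTE
 * reply: route type and flags, addresses, iif/oif, gateway, metrics
 * (including a cloned PMTU) and cache info.
 */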
2625 /* called with rcu_read_lock held */
2626 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2627 			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2628 			struct sk_buff *skb, u32 portid, u32 seq)
2629 {
2630 	struct rtmsg *r;
2631 	struct nlmsghdr *nlh;
2632 	unsigned long expires = 0;
2633 	u32 error;
2634 	u32 metrics[RTAX_MAX];
2635 
2636 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2637 	if (!nlh)
2638 		return -EMSGSIZE;
2639 
2640 	r = nlmsg_data(nlh);
2641 	r->rtm_family	 = AF_INET;
2642 	r->rtm_dst_len	= 32;
2643 	r->rtm_src_len	= 0;
2644 	r->rtm_tos	= fl4->flowi4_tos;
2645 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2646 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2647 		goto nla_put_failure;
2648 	r->rtm_type	= rt->rt_type;
2649 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2650 	r->rtm_protocol = RTPROT_UNSPEC;
2651 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2652 	if (rt->rt_flags & RTCF_NOTIFY)
2653 		r->rtm_flags |= RTM_F_NOTIFY;
2654 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2655 		r->rtm_flags |= RTCF_DOREDIRECT;
2656 
2657 	if (nla_put_in_addr(skb, RTA_DST, dst))
2658 		goto nla_put_failure;
2659 	if (src) {
2660 		r->rtm_src_len = 32;
2661 		if (nla_put_in_addr(skb, RTA_SRC, src))
2662 			goto nla_put_failure;
2663 	}
2664 	if (rt->dst.dev &&
2665 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2666 		goto nla_put_failure;
2667 #ifdef CONFIG_IP_ROUTE_CLASSID
2668 	if (rt->dst.tclassid &&
2669 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2670 		goto nla_put_failure;
2671 #endif
2672 	if (!rt_is_input_route(rt) &&
2673 	    fl4->saddr != src) {
2674 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2675 			goto nla_put_failure;
2676 	}
2677 	if (rt->rt_uses_gateway &&
2678 	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2679 		goto nla_put_failure;
2680 
2681 	expires = rt->dst.expires;
2682 	if (expires) {
2683 		unsigned long now = jiffies;
2684 
2685 		if (time_before(now, expires))
2686 			expires -= now;
2687 		else
2688 			expires = 0;
2689 	}
2690 
2691 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2692 	if (rt->rt_pmtu && expires)
2693 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2694 	if (rt->rt_mtu_locked && expires)
2695 		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2696 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2697 		goto nla_put_failure;
2698 
2699 	if (fl4->flowi4_mark &&
2700 	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2701 		goto nla_put_failure;
2702 
2703 	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2704 	    nla_put_u32(skb, RTA_UID,
2705 			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2706 		goto nla_put_failure;
2707 
2708 	error = rt->dst.error;
2709 
2710 	if (rt_is_input_route(rt)) {
2711 #ifdef CONFIG_IP_MROUTE
2712 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2713 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2714 			int err = ipmr_get_route(net, skb,
2715 						 fl4->saddr, fl4->daddr,
2716 						 r, portid);
2717 
2718 			if (err <= 0) {
2719 				if (err == 0)
2720 					return 0;
2721 				goto nla_put_failure;
2722 			}
2723 		} else
2724 #endif
2725 			if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2726 				goto nla_put_failure;
2727 	}
2728 
2729 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2730 		goto nla_put_failure;
2731 
2732 	nlmsg_end(skb, nlh);
2733 	return 0;
2734 
2735 nla_put_failure:
2736 	nlmsg_cancel(skb, nlh);
2737 	return -EMSGSIZE;
2738 }
2739 
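/* Build a minimal dummy skb for an RTM_GETROUTE lookup: an IPv4 header plus
 * a bare UDP, TCP or ICMP echo header, so that flow dissection and fib
 * rules see roughly the same keys a real packet would provide.
 */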
2740 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2741 						   u8 ip_proto, __be16 sport,
2742 						   __be16 dport)
2743 {
2744 	struct sk_buff *skb;
2745 	struct iphdr *iph;
2746 
2747 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2748 	if (!skb)
2749 		return NULL;
2750 
2751 	/* Reserve room for dummy headers; this skb can pass
2752 	 * through a good chunk of the routing engine.
2753 	 */
2754 	skb_reset_mac_header(skb);
2755 	skb_reset_network_header(skb);
2756 	skb->protocol = htons(ETH_P_IP);
2757 	iph = skb_put(skb, sizeof(struct iphdr));
2758 	iph->protocol = ip_proto;
2759 	iph->saddr = src;
2760 	iph->daddr = dst;
2761 	iph->version = 0x4;
2762 	iph->frag_off = 0;
2763 	iph->ihl = 0x5;
2764 	skb_set_transport_header(skb, skb->len);
2765 
2766 	switch (iph->protocol) {
2767 	case IPPROTO_UDP: {
2768 		struct udphdr *udph;
2769 
2770 		udph = skb_put_zero(skb, sizeof(struct udphdr));
2771 		udph->source = sport;
2772 		udph->dest = dport;
2773 		udph->len = sizeof(struct udphdr);
2774 		udph->check = 0;
2775 		break;
2776 	}
2777 	case IPPROTO_TCP: {
2778 		struct tcphdr *tcph;
2779 
2780 		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2781 		tcph->source	= sport;
2782 		tcph->dest	= dport;
2783 		tcph->doff	= sizeof(struct tcphdr) / 4;
2784 		tcph->rst = 1;
2785 		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2786 					    src, dst, 0);
2787 		break;
2788 	}
2789 	case IPPROTO_ICMP: {
2790 		struct icmphdr *icmph;
2791 
2792 		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2793 		icmph->type = ICMP_ECHO;
2794 		icmph->code = 0;
2795 	}
2796 	}
2797 
2798 	return skb;
2799 }
2800 
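/* Validate an RTM_GETROUTE request: under strict checking only a
 * well-formed rtmsg header, a known subset of rtm_flags and the attributes
 * listed below are accepted; otherwise fall back to a plain
 * rtm_ipv4_policy parse.
 */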
2801 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
2802 				       const struct nlmsghdr *nlh,
2803 				       struct nlattr **tb,
2804 				       struct netlink_ext_ack *extack)
2805 {
2806 	struct rtmsg *rtm;
2807 	int i, err;
2808 
2809 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
2810 		NL_SET_ERR_MSG(extack,
2811 			       "ipv4: Invalid header for route get request");
2812 		return -EINVAL;
2813 	}
2814 
2815 	if (!netlink_strict_get_check(skb))
2816 		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
2817 				   rtm_ipv4_policy, extack);
2818 
2819 	rtm = nlmsg_data(nlh);
2820 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
2821 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
2822 	    rtm->rtm_table || rtm->rtm_protocol ||
2823 	    rtm->rtm_scope || rtm->rtm_type) {
2824 		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
2825 		return -EINVAL;
2826 	}
2827 
2828 	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
2829 			       RTM_F_LOOKUP_TABLE |
2830 			       RTM_F_FIB_MATCH)) {
2831 		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
2832 		return -EINVAL;
2833 	}
2834 
2835 	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
2836 				 rtm_ipv4_policy, extack);
2837 	if (err)
2838 		return err;
2839 
2840 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
2841 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
2842 		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
2843 		return -EINVAL;
2844 	}
2845 
2846 	for (i = 0; i <= RTA_MAX; i++) {
2847 		if (!tb[i])
2848 			continue;
2849 
2850 		switch (i) {
2851 		case RTA_IIF:
2852 		case RTA_OIF:
2853 		case RTA_SRC:
2854 		case RTA_DST:
2855 		case RTA_IP_PROTO:
2856 		case RTA_SPORT:
2857 		case RTA_DPORT:
2858 		case RTA_MARK:
2859 		case RTA_UID:
2860 			break;
2861 		default:
2862 			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
2863 			return -EINVAL;
2864 		}
2865 	}
2866 
2867 	return 0;
2868 }
2869 
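/* RTM_GETROUTE handler: parse the request, build a dummy skb, perform an
 * input route lookup when RTA_IIF is given or an output lookup otherwise,
 * then reply with either the matching FIB entry (RTM_F_FIB_MATCH) or the
 * resolved route itself.
 */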
2870 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2871 			     struct netlink_ext_ack *extack)
2872 {
2873 	struct net *net = sock_net(in_skb->sk);
2874 	struct nlattr *tb[RTA_MAX+1];
2875 	u32 table_id = RT_TABLE_MAIN;
2876 	__be16 sport = 0, dport = 0;
2877 	struct fib_result res = {};
2878 	u8 ip_proto = IPPROTO_UDP;
2879 	struct rtable *rt = NULL;
2880 	struct sk_buff *skb;
2881 	struct rtmsg *rtm;
2882 	struct flowi4 fl4 = {};
2883 	__be32 dst = 0;
2884 	__be32 src = 0;
2885 	kuid_t uid;
2886 	u32 iif;
2887 	int err;
2888 	int mark;
2889 
2890 	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
2891 	if (err < 0)
2892 		return err;
2893 
2894 	rtm = nlmsg_data(nlh);
2895 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2896 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2897 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2898 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2899 	if (tb[RTA_UID])
2900 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2901 	else
2902 		uid = (iif ? INVALID_UID : current_uid());
2903 
2904 	if (tb[RTA_IP_PROTO]) {
2905 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
2906 						  &ip_proto, AF_INET, extack);
2907 		if (err)
2908 			return err;
2909 	}
2910 
2911 	if (tb[RTA_SPORT])
2912 		sport = nla_get_be16(tb[RTA_SPORT]);
2913 
2914 	if (tb[RTA_DPORT])
2915 		dport = nla_get_be16(tb[RTA_DPORT]);
2916 
2917 	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
2918 	if (!skb)
2919 		return -ENOBUFS;
2920 
2921 	fl4.daddr = dst;
2922 	fl4.saddr = src;
2923 	fl4.flowi4_tos = rtm->rtm_tos;
2924 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2925 	fl4.flowi4_mark = mark;
2926 	fl4.flowi4_uid = uid;
2927 	if (sport)
2928 		fl4.fl4_sport = sport;
2929 	if (dport)
2930 		fl4.fl4_dport = dport;
2931 	fl4.flowi4_proto = ip_proto;
2932 
2933 	rcu_read_lock();
2934 
2935 	if (iif) {
2936 		struct net_device *dev;
2937 
2938 		dev = dev_get_by_index_rcu(net, iif);
2939 		if (!dev) {
2940 			err = -ENODEV;
2941 			goto errout_rcu;
2942 		}
2943 
2944 		fl4.flowi4_iif = iif; /* for rt_fill_info */
2945 		skb->dev	= dev;
2946 		skb->mark	= mark;
2947 		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2948 					 dev, &res);
2949 
2950 		rt = skb_rtable(skb);
2951 		if (err == 0 && rt->dst.error)
2952 			err = -rt->dst.error;
2953 	} else {
2954 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
2955 		skb->dev = net->loopback_dev;
2956 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2957 		err = 0;
2958 		if (IS_ERR(rt))
2959 			err = PTR_ERR(rt);
2960 		else
2961 			skb_dst_set(skb, &rt->dst);
2962 	}
2963 
2964 	if (err)
2965 		goto errout_rcu;
2966 
2967 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2968 		rt->rt_flags |= RTCF_NOTIFY;
2969 
2970 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2971 		table_id = res.table ? res.table->tb_id : 0;
2972 
2973 	/* reset skb for netlink reply msg */
2974 	skb_trim(skb, 0);
2975 	skb_reset_network_header(skb);
2976 	skb_reset_transport_header(skb);
2977 	skb_reset_mac_header(skb);
2978 
2979 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2980 		if (!res.fi) {
2981 			err = fib_props[res.type].error;
2982 			if (!err)
2983 				err = -EHOSTUNREACH;
2984 			goto errout_rcu;
2985 		}
2986 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2987 				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2988 				    rt->rt_type, res.prefix, res.prefixlen,
2989 				    fl4.flowi4_tos, res.fi, 0);
2990 	} else {
2991 		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
2992 				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2993 	}
2994 	if (err < 0)
2995 		goto errout_rcu;
2996 
2997 	rcu_read_unlock();
2998 
2999 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3000 
3001 errout_free:
3002 	return err;
3003 errout_rcu:
3004 	rcu_read_unlock();
3005 	kfree_skb(skb);
3006 	goto errout_free;
3007 }
3008 
3009 void ip_rt_multicast_event(struct in_device *in_dev)
3010 {
3011 	rt_cache_flush(dev_net(in_dev->dev));
3012 }
3013 
3014 #ifdef CONFIG_SYSCTL
3015 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3016 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3017 static int ip_rt_gc_elasticity __read_mostly	= 8;
3018 static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3019 
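/* Handler for the write-only "flush" sysctl: any write flushes the
 * per-netns routing cache and bumps the fnhe genid so that cached
 * exceptions are invalidated; reads return -EINVAL.
 */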
3020 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3021 					void __user *buffer,
3022 					size_t *lenp, loff_t *ppos)
3023 {
3024 	struct net *net = (struct net *)__ctl->extra1;
3025 
3026 	if (write) {
3027 		rt_cache_flush(net);
3028 		fnhe_genid_bump(net);
3029 		return 0;
3030 	}
3031 
3032 	return -EINVAL;
3033 }
3034 
3035 static struct ctl_table ipv4_route_table[] = {
3036 	{
3037 		.procname	= "gc_thresh",
3038 		.data		= &ipv4_dst_ops.gc_thresh,
3039 		.maxlen		= sizeof(int),
3040 		.mode		= 0644,
3041 		.proc_handler	= proc_dointvec,
3042 	},
3043 	{
3044 		.procname	= "max_size",
3045 		.data		= &ip_rt_max_size,
3046 		.maxlen		= sizeof(int),
3047 		.mode		= 0644,
3048 		.proc_handler	= proc_dointvec,
3049 	},
3050 	{
3051 		/*  Deprecated. Use gc_min_interval_ms */
3052 
3053 		.procname	= "gc_min_interval",
3054 		.data		= &ip_rt_gc_min_interval,
3055 		.maxlen		= sizeof(int),
3056 		.mode		= 0644,
3057 		.proc_handler	= proc_dointvec_jiffies,
3058 	},
3059 	{
3060 		.procname	= "gc_min_interval_ms",
3061 		.data		= &ip_rt_gc_min_interval,
3062 		.maxlen		= sizeof(int),
3063 		.mode		= 0644,
3064 		.proc_handler	= proc_dointvec_ms_jiffies,
3065 	},
3066 	{
3067 		.procname	= "gc_timeout",
3068 		.data		= &ip_rt_gc_timeout,
3069 		.maxlen		= sizeof(int),
3070 		.mode		= 0644,
3071 		.proc_handler	= proc_dointvec_jiffies,
3072 	},
3073 	{
3074 		.procname	= "gc_interval",
3075 		.data		= &ip_rt_gc_interval,
3076 		.maxlen		= sizeof(int),
3077 		.mode		= 0644,
3078 		.proc_handler	= proc_dointvec_jiffies,
3079 	},
3080 	{
3081 		.procname	= "redirect_load",
3082 		.data		= &ip_rt_redirect_load,
3083 		.maxlen		= sizeof(int),
3084 		.mode		= 0644,
3085 		.proc_handler	= proc_dointvec,
3086 	},
3087 	{
3088 		.procname	= "redirect_number",
3089 		.data		= &ip_rt_redirect_number,
3090 		.maxlen		= sizeof(int),
3091 		.mode		= 0644,
3092 		.proc_handler	= proc_dointvec,
3093 	},
3094 	{
3095 		.procname	= "redirect_silence",
3096 		.data		= &ip_rt_redirect_silence,
3097 		.maxlen		= sizeof(int),
3098 		.mode		= 0644,
3099 		.proc_handler	= proc_dointvec,
3100 	},
3101 	{
3102 		.procname	= "error_cost",
3103 		.data		= &ip_rt_error_cost,
3104 		.maxlen		= sizeof(int),
3105 		.mode		= 0644,
3106 		.proc_handler	= proc_dointvec,
3107 	},
3108 	{
3109 		.procname	= "error_burst",
3110 		.data		= &ip_rt_error_burst,
3111 		.maxlen		= sizeof(int),
3112 		.mode		= 0644,
3113 		.proc_handler	= proc_dointvec,
3114 	},
3115 	{
3116 		.procname	= "gc_elasticity",
3117 		.data		= &ip_rt_gc_elasticity,
3118 		.maxlen		= sizeof(int),
3119 		.mode		= 0644,
3120 		.proc_handler	= proc_dointvec,
3121 	},
3122 	{
3123 		.procname	= "mtu_expires",
3124 		.data		= &ip_rt_mtu_expires,
3125 		.maxlen		= sizeof(int),
3126 		.mode		= 0644,
3127 		.proc_handler	= proc_dointvec_jiffies,
3128 	},
3129 	{
3130 		.procname	= "min_pmtu",
3131 		.data		= &ip_rt_min_pmtu,
3132 		.maxlen		= sizeof(int),
3133 		.mode		= 0644,
3134 		.proc_handler	= proc_dointvec_minmax,
3135 		.extra1		= &ip_min_valid_pmtu,
3136 	},
3137 	{
3138 		.procname	= "min_adv_mss",
3139 		.data		= &ip_rt_min_advmss,
3140 		.maxlen		= sizeof(int),
3141 		.mode		= 0644,
3142 		.proc_handler	= proc_dointvec,
3143 	},
3144 	{ }
3145 };
3146 
3147 static struct ctl_table ipv4_route_flush_table[] = {
3148 	{
3149 		.procname	= "flush",
3150 		.maxlen		= sizeof(int),
3151 		.mode		= 0200,
3152 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3153 	},
3154 	{ },
3155 };
3156 
3157 static __net_init int sysctl_route_net_init(struct net *net)
3158 {
3159 	struct ctl_table *tbl;
3160 
3161 	tbl = ipv4_route_flush_table;
3162 	if (!net_eq(net, &init_net)) {
3163 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3164 		if (!tbl)
3165 			goto err_dup;
3166 
3167 		/* Don't export sysctls to unprivileged users */
3168 		if (net->user_ns != &init_user_ns)
3169 			tbl[0].procname = NULL;
3170 	}
3171 	tbl[0].extra1 = net;
3172 
3173 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3174 	if (!net->ipv4.route_hdr)
3175 		goto err_reg;
3176 	return 0;
3177 
3178 err_reg:
3179 	if (tbl != ipv4_route_flush_table)
3180 		kfree(tbl);
3181 err_dup:
3182 	return -ENOMEM;
3183 }
3184 
3185 static __net_exit void sysctl_route_net_exit(struct net *net)
3186 {
3187 	struct ctl_table *tbl;
3188 
3189 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3190 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3191 	BUG_ON(tbl == ipv4_route_flush_table);
3192 	kfree(tbl);
3193 }
3194 
3195 static __net_initdata struct pernet_operations sysctl_route_ops = {
3196 	.init = sysctl_route_net_init,
3197 	.exit = sysctl_route_net_exit,
3198 };
3199 #endif
3200 
3201 static __net_init int rt_genid_init(struct net *net)
3202 {
3203 	atomic_set(&net->ipv4.rt_genid, 0);
3204 	atomic_set(&net->fnhe_genid, 0);
3205 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3206 	return 0;
3207 }
3208 
3209 static __net_initdata struct pernet_operations rt_genid_ops = {
3210 	.init = rt_genid_init,
3211 };
3212 
3213 static int __net_init ipv4_inetpeer_init(struct net *net)
3214 {
3215 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3216 
3217 	if (!bp)
3218 		return -ENOMEM;
3219 	inet_peer_base_init(bp);
3220 	net->ipv4.peers = bp;
3221 	return 0;
3222 }
3223 
3224 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3225 {
3226 	struct inet_peer_base *bp = net->ipv4.peers;
3227 
3228 	net->ipv4.peers = NULL;
3229 	inetpeer_invalidate_tree(bp);
3230 	kfree(bp);
3231 }
3232 
3233 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3234 	.init	=	ipv4_inetpeer_init,
3235 	.exit	=	ipv4_inetpeer_exit,
3236 };
3237 
3238 #ifdef CONFIG_IP_ROUTE_CLASSID
3239 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3240 #endif /* CONFIG_IP_ROUTE_CLASSID */
3241 
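/* Boot-time initialization: allocate the IP ID arrays, the per-cpu
 * uncached-route lists and the dst slab cache, register the /proc entries,
 * the RTM_GETROUTE handler and the per-netns sysctl/genid/inetpeer ops.
 */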
3242 int __init ip_rt_init(void)
3243 {
3244 	int cpu;
3245 
3246 	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3247 				  GFP_KERNEL);
3248 	if (!ip_idents)
3249 		panic("IP: failed to allocate ip_idents\n");
3250 
3251 	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3252 
3253 	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3254 	if (!ip_tstamps)
3255 		panic("IP: failed to allocate ip_tstamps\n");
3256 
3257 	for_each_possible_cpu(cpu) {
3258 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3259 
3260 		INIT_LIST_HEAD(&ul->head);
3261 		spin_lock_init(&ul->lock);
3262 	}
3263 #ifdef CONFIG_IP_ROUTE_CLASSID
3264 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3265 	if (!ip_rt_acct)
3266 		panic("IP: failed to allocate ip_rt_acct\n");
3267 #endif
3268 
3269 	ipv4_dst_ops.kmem_cachep =
3270 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3271 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3272 
3273 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3274 
3275 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3276 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3277 
3278 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3279 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3280 
3281 	ipv4_dst_ops.gc_thresh = ~0;
3282 	ip_rt_max_size = INT_MAX;
3283 
3284 	devinet_init();
3285 	ip_fib_init();
3286 
3287 	if (ip_rt_proc_init())
3288 		pr_err("Unable to create route proc files\n");
3289 #ifdef CONFIG_XFRM
3290 	xfrm_init();
3291 	xfrm4_init();
3292 #endif
3293 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3294 		      RTNL_FLAG_DOIT_UNLOCKED);
3295 
3296 #ifdef CONFIG_SYSCTL
3297 	register_pernet_subsys(&sysctl_route_ops);
3298 #endif
3299 	register_pernet_subsys(&rt_genid_ops);
3300 	register_pernet_subsys(&ipv4_inetpeer_ops);
3301 	return 0;
3302 }
3303 
3304 #ifdef CONFIG_SYSCTL
3305 /*
3306  * We really need to sanitize the damn ipv4 init order, then all
3307  * this nonsense will go away.
3308  */
3309 void __init ip_static_sysctl_init(void)
3310 {
3311 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3312 }
3313 #endif
3314