xref: /linux/net/ipv4/route.c (revision e3b9626f09d429788d929c9b9000a069fcfc056e)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		ROUTE - implementation of the IP router.
8  *
9  * Authors:	Ross Biro
10  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
12  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14  *
15  * Fixes:
16  *		Alan Cox	:	Verify area fixes.
17  *		Alan Cox	:	cli() protects routing changes
18  *		Rui Oliveira	:	ICMP routing table updates
19  *		(rco@di.uminho.pt)	Routing table insertion and update
20  *		Linus Torvalds	:	Rewrote bits to be sensible
21  *		Alan Cox	:	Added BSD route gw semantics
22  *		Alan Cox	:	Super /proc >4K
23  *		Alan Cox	:	MTU in route table
24  *		Alan Cox	: 	MSS actually. Also added the window
25  *					clamper.
26  *		Sam Lantinga	:	Fixed route matching in rt_del()
27  *		Alan Cox	:	Routing cache support.
28  *		Alan Cox	:	Removed compatibility cruft.
29  *		Alan Cox	:	RTF_REJECT support.
30  *		Alan Cox	:	TCP irtt support.
31  *		Jonathan Naylor	:	Added Metric support.
32  *	Miquel van Smoorenburg	:	BSD API fixes.
33  *	Miquel van Smoorenburg	:	Metrics.
34  *		Alan Cox	:	Use __u32 properly
35  *		Alan Cox	:	Aligned routing errors more closely with BSD;
36  *					our system is still very different.
37  *		Alan Cox	:	Faster /proc handling
38  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
39  *					routing caches and better behaviour.
40  *
41  *		Olaf Erb	:	irtt wasn't being copied right.
42  *		Bjorn Ekwall	:	Kerneld route support.
43  *		Alan Cox	:	Multicast fixed (I hope)
44  * 		Pavel Krauz	:	Limited broadcast fixed
45  *		Mike McLagan	:	Routing by source
46  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
47  *					route.c and rewritten from scratch.
48  *		Andi Kleen	:	Load-limit warning messages.
49  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
50  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
51  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
52  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
53  *		Marc Boucher	:	routing by fwmark
54  *	Robert Olsson		:	Added rt_cache statistics
55  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
56  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
57  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
58  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
59  */
60 
61 #define pr_fmt(fmt) "IPv4: " fmt
62 
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/string.h>
70 #include <linux/socket.h>
71 #include <linux/sockios.h>
72 #include <linux/errno.h>
73 #include <linux/in.h>
74 #include <linux/inet.h>
75 #include <linux/netdevice.h>
76 #include <linux/proc_fs.h>
77 #include <linux/init.h>
78 #include <linux/skbuff.h>
79 #include <linux/inetdevice.h>
80 #include <linux/igmp.h>
81 #include <linux/pkt_sched.h>
82 #include <linux/mroute.h>
83 #include <linux/netfilter_ipv4.h>
84 #include <linux/random.h>
85 #include <linux/rcupdate.h>
86 #include <linux/times.h>
87 #include <linux/slab.h>
88 #include <linux/jhash.h>
89 #include <net/dst.h>
90 #include <net/dst_metadata.h>
91 #include <net/net_namespace.h>
92 #include <net/protocol.h>
93 #include <net/ip.h>
94 #include <net/route.h>
95 #include <net/inetpeer.h>
96 #include <net/sock.h>
97 #include <net/ip_fib.h>
98 #include <net/nexthop.h>
99 #include <net/arp.h>
100 #include <net/tcp.h>
101 #include <net/icmp.h>
102 #include <net/xfrm.h>
103 #include <net/lwtunnel.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
109 #include <net/secure_seq.h>
110 #include <net/ip_tunnels.h>
111 #include <net/l3mdev.h>
112 
113 #include "fib_lookup.h"
114 
115 #define RT_FL_TOS(oldflp4) \
116 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
117 
118 #define RT_GC_TIMEOUT (300*HZ)
119 
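/*
 * Default values for the tunables below; most are exported as sysctls
 * under net.ipv4.route.* and can be changed at runtime.
 */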
120 static int ip_rt_max_size;
121 static int ip_rt_redirect_number __read_mostly	= 9;
122 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
123 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
124 static int ip_rt_error_cost __read_mostly	= HZ;
125 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
126 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
127 static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
128 static int ip_rt_min_advmss __read_mostly	= 256;
129 
130 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
131 
132 /*
133  *	Interface to generic destination cache.
134  */
135 
136 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
137 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
138 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
139 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
140 static void		 ipv4_link_failure(struct sk_buff *skb);
141 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
142 					   struct sk_buff *skb, u32 mtu,
143 					   bool confirm_neigh);
144 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145 					struct sk_buff *skb);
146 static void		ipv4_dst_destroy(struct dst_entry *dst);
147 
148 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
149 {
150 	WARN_ON(1);
151 	return NULL;
152 }
153 
154 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
155 					   struct sk_buff *skb,
156 					   const void *daddr);
157 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
158 
159 static struct dst_ops ipv4_dst_ops = {
160 	.family =		AF_INET,
161 	.check =		ipv4_dst_check,
162 	.default_advmss =	ipv4_default_advmss,
163 	.mtu =			ipv4_mtu,
164 	.cow_metrics =		ipv4_cow_metrics,
165 	.destroy =		ipv4_dst_destroy,
166 	.negative_advice =	ipv4_negative_advice,
167 	.link_failure =		ipv4_link_failure,
168 	.update_pmtu =		ip_rt_update_pmtu,
169 	.redirect =		ip_do_redirect,
170 	.local_out =		__ip_local_out,
171 	.neigh_lookup =		ipv4_neigh_lookup,
172 	.confirm_neigh =	ipv4_confirm_neigh,
173 };
174 
175 #define ECN_OR_COST(class)	TC_PRIO_##class
176 
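/*
 * Map the four RFC 1349 TOS bits to a packet-scheduler priority band;
 * the table is indexed with IPTOS_TOS(tos) >> 1 (see rt_tos2priority()).
 * With ECN_OR_COST() as defined above, odd entries simply share the band
 * of the preceding entry.
 */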
177 const __u8 ip_tos2prio[16] = {
178 	TC_PRIO_BESTEFFORT,
179 	ECN_OR_COST(BESTEFFORT),
180 	TC_PRIO_BESTEFFORT,
181 	ECN_OR_COST(BESTEFFORT),
182 	TC_PRIO_BULK,
183 	ECN_OR_COST(BULK),
184 	TC_PRIO_BULK,
185 	ECN_OR_COST(BULK),
186 	TC_PRIO_INTERACTIVE,
187 	ECN_OR_COST(INTERACTIVE),
188 	TC_PRIO_INTERACTIVE,
189 	ECN_OR_COST(INTERACTIVE),
190 	TC_PRIO_INTERACTIVE_BULK,
191 	ECN_OR_COST(INTERACTIVE_BULK),
192 	TC_PRIO_INTERACTIVE_BULK,
193 	ECN_OR_COST(INTERACTIVE_BULK)
194 };
195 EXPORT_SYMBOL(ip_tos2prio);
196 
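/* Per-cpu counters exported via /proc/net/stat/rt_cache (rt_cpu_seq_show()). */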
197 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
198 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
199 
200 #ifdef CONFIG_PROC_FS
201 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
202 {
203 	if (*pos)
204 		return NULL;
205 	return SEQ_START_TOKEN;
206 }
207 
208 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
209 {
210 	++*pos;
211 	return NULL;
212 }
213 
214 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
215 {
216 }
217 
218 static int rt_cache_seq_show(struct seq_file *seq, void *v)
219 {
220 	if (v == SEQ_START_TOKEN)
221 		seq_printf(seq, "%-127s\n",
222 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
223 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
224 			   "HHUptod\tSpecDst");
225 	return 0;
226 }
227 
228 static const struct seq_operations rt_cache_seq_ops = {
229 	.start  = rt_cache_seq_start,
230 	.next   = rt_cache_seq_next,
231 	.stop   = rt_cache_seq_stop,
232 	.show   = rt_cache_seq_show,
233 };
234 
235 static int rt_cache_seq_open(struct inode *inode, struct file *file)
236 {
237 	return seq_open(file, &rt_cache_seq_ops);
238 }
239 
240 static const struct proc_ops rt_cache_proc_ops = {
241 	.proc_open	= rt_cache_seq_open,
242 	.proc_read	= seq_read,
243 	.proc_lseek	= seq_lseek,
244 	.proc_release	= seq_release,
245 };
246 
247 
248 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
249 {
250 	int cpu;
251 
252 	if (*pos == 0)
253 		return SEQ_START_TOKEN;
254 
255 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
256 		if (!cpu_possible(cpu))
257 			continue;
258 		*pos = cpu+1;
259 		return &per_cpu(rt_cache_stat, cpu);
260 	}
261 	return NULL;
262 }
263 
264 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
265 {
266 	int cpu;
267 
268 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
269 		if (!cpu_possible(cpu))
270 			continue;
271 		*pos = cpu+1;
272 		return &per_cpu(rt_cache_stat, cpu);
273 	}
274 	(*pos)++;
275 	return NULL;
276 
277 }
278 
279 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
280 {
281 
282 }
283 
284 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
285 {
286 	struct rt_cache_stat *st = v;
287 
288 	if (v == SEQ_START_TOKEN) {
289 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
290 		return 0;
291 	}
292 
293 	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
294 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
295 		   dst_entries_get_slow(&ipv4_dst_ops),
296 		   0, /* st->in_hit */
297 		   st->in_slow_tot,
298 		   st->in_slow_mc,
299 		   st->in_no_route,
300 		   st->in_brd,
301 		   st->in_martian_dst,
302 		   st->in_martian_src,
303 
304 		   0, /* st->out_hit */
305 		   st->out_slow_tot,
306 		   st->out_slow_mc,
307 
308 		   0, /* st->gc_total */
309 		   0, /* st->gc_ignored */
310 		   0, /* st->gc_goal_miss */
311 		   0, /* st->gc_dst_overflow */
312 		   0, /* st->in_hlist_search */
313 		   0  /* st->out_hlist_search */
314 		);
315 	return 0;
316 }
317 
318 static const struct seq_operations rt_cpu_seq_ops = {
319 	.start  = rt_cpu_seq_start,
320 	.next   = rt_cpu_seq_next,
321 	.stop   = rt_cpu_seq_stop,
322 	.show   = rt_cpu_seq_show,
323 };
324 
325 
326 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
327 {
328 	return seq_open(file, &rt_cpu_seq_ops);
329 }
330 
331 static const struct proc_ops rt_cpu_proc_ops = {
332 	.proc_open	= rt_cpu_seq_open,
333 	.proc_read	= seq_read,
334 	.proc_lseek	= seq_lseek,
335 	.proc_release	= seq_release,
336 };
337 
338 #ifdef CONFIG_IP_ROUTE_CLASSID
339 static int rt_acct_proc_show(struct seq_file *m, void *v)
340 {
341 	struct ip_rt_acct *dst, *src;
342 	unsigned int i, j;
343 
344 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
345 	if (!dst)
346 		return -ENOMEM;
347 
348 	for_each_possible_cpu(i) {
349 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
350 		for (j = 0; j < 256; j++) {
351 			dst[j].o_bytes   += src[j].o_bytes;
352 			dst[j].o_packets += src[j].o_packets;
353 			dst[j].i_bytes   += src[j].i_bytes;
354 			dst[j].i_packets += src[j].i_packets;
355 		}
356 	}
357 
358 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
359 	kfree(dst);
360 	return 0;
361 }
362 #endif
363 
364 static int __net_init ip_rt_do_proc_init(struct net *net)
365 {
366 	struct proc_dir_entry *pde;
367 
368 	pde = proc_create("rt_cache", 0444, net->proc_net,
369 			  &rt_cache_proc_ops);
370 	if (!pde)
371 		goto err1;
372 
373 	pde = proc_create("rt_cache", 0444,
374 			  net->proc_net_stat, &rt_cpu_proc_ops);
375 	if (!pde)
376 		goto err2;
377 
378 #ifdef CONFIG_IP_ROUTE_CLASSID
379 	pde = proc_create_single("rt_acct", 0, net->proc_net,
380 			rt_acct_proc_show);
381 	if (!pde)
382 		goto err3;
383 #endif
384 	return 0;
385 
386 #ifdef CONFIG_IP_ROUTE_CLASSID
387 err3:
388 	remove_proc_entry("rt_cache", net->proc_net_stat);
389 #endif
390 err2:
391 	remove_proc_entry("rt_cache", net->proc_net);
392 err1:
393 	return -ENOMEM;
394 }
395 
396 static void __net_exit ip_rt_do_proc_exit(struct net *net)
397 {
398 	remove_proc_entry("rt_cache", net->proc_net_stat);
399 	remove_proc_entry("rt_cache", net->proc_net);
400 #ifdef CONFIG_IP_ROUTE_CLASSID
401 	remove_proc_entry("rt_acct", net->proc_net);
402 #endif
403 }
404 
405 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
406 	.init = ip_rt_do_proc_init,
407 	.exit = ip_rt_do_proc_exit,
408 };
409 
410 static int __init ip_rt_proc_init(void)
411 {
412 	return register_pernet_subsys(&ip_rt_proc_ops);
413 }
414 
415 #else
416 static inline int ip_rt_proc_init(void)
417 {
418 	return 0;
419 }
420 #endif /* CONFIG_PROC_FS */
421 
422 static inline bool rt_is_expired(const struct rtable *rth)
423 {
424 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
425 }
426 
427 void rt_cache_flush(struct net *net)
428 {
429 	rt_genid_bump_ipv4(net);
430 }
431 
432 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
433 					   struct sk_buff *skb,
434 					   const void *daddr)
435 {
436 	const struct rtable *rt = container_of(dst, struct rtable, dst);
437 	struct net_device *dev = dst->dev;
438 	struct neighbour *n;
439 
440 	rcu_read_lock_bh();
441 
442 	if (likely(rt->rt_gw_family == AF_INET)) {
443 		n = ip_neigh_gw4(dev, rt->rt_gw4);
444 	} else if (rt->rt_gw_family == AF_INET6) {
445 		n = ip_neigh_gw6(dev, &rt->rt_gw6);
446 	} else {
447 		__be32 pkey;
448 
449 		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
450 		n = ip_neigh_gw4(dev, pkey);
451 	}
452 
453 	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
454 		n = NULL;
455 
456 	rcu_read_unlock_bh();
457 
458 	return n;
459 }
460 
461 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
462 {
463 	const struct rtable *rt = container_of(dst, struct rtable, dst);
464 	struct net_device *dev = dst->dev;
465 	const __be32 *pkey = daddr;
466 
467 	if (rt->rt_gw_family == AF_INET) {
468 		pkey = (const __be32 *)&rt->rt_gw4;
469 	} else if (rt->rt_gw_family == AF_INET6) {
470 		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
471 	} else if (!daddr ||
472 		 (rt->rt_flags &
473 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
474 		return;
475 	}
476 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
477 }
478 
479 #define IP_IDENTS_SZ 2048u
480 
481 static atomic_t *ip_idents __read_mostly;
482 static u32 *ip_tstamps __read_mostly;
483 
484 /* In order to protect privacy, we add a perturbation to identifiers
485  * if one generator is seldom used. This makes it hard for an attacker
486  * to infer how many packets were sent between two points in time.
487  */
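/*
 * Each of the IP_IDENTS_SZ buckets below holds a last-use timestamp and an
 * atomic ID counter.  When a bucket is used again after being idle for D
 * jiffies, a random offset in [0, D) is added to its counter, so the IDs
 * seen by a peer no longer reveal how many segments were sent in between.
 */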
488 u32 ip_idents_reserve(u32 hash, int segs)
489 {
490 	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
491 	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
492 	u32 old = READ_ONCE(*p_tstamp);
493 	u32 now = (u32)jiffies;
494 	u32 delta = 0;
495 
496 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
497 		delta = prandom_u32_max(now - old);
498 
499 	/* If UBSAN reports an error here, please make sure your compiler
500 	 * supports -fno-strict-overflow before reporting it: that was a bug
501 	 * in UBSAN, and it has been fixed in GCC-8.
502 	 */
503 	return atomic_add_return(segs + delta, p_id) - segs;
504 }
505 EXPORT_SYMBOL(ip_idents_reserve);
506 
507 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
508 {
509 	u32 hash, id;
510 
511 	/* Note the following code is not safe, but this is okay. */
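	/* (Seeding the key below is racy: several CPUs may write it
	 * concurrently, or a reader may see it mid-update.  The worst
	 * outcome is a differently mixed ID for a few packets, which is
	 * harmless.)
	 */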
512 	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
513 		get_random_bytes(&net->ipv4.ip_id_key,
514 				 sizeof(net->ipv4.ip_id_key));
515 
516 	hash = siphash_3u32((__force u32)iph->daddr,
517 			    (__force u32)iph->saddr,
518 			    iph->protocol,
519 			    &net->ipv4.ip_id_key);
520 	id = ip_idents_reserve(hash, segs);
521 	iph->id = htons(id);
522 }
523 EXPORT_SYMBOL(__ip_select_ident);
524 
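/* Build an output flow key from an IP header.  When a socket is supplied,
 * its bound device, mark, connection TOS and protocol override the values
 * passed by the caller.
 */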
525 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
526 			     const struct sock *sk,
527 			     const struct iphdr *iph,
528 			     int oif, u8 tos,
529 			     u8 prot, u32 mark, int flow_flags)
530 {
531 	if (sk) {
532 		const struct inet_sock *inet = inet_sk(sk);
533 
534 		oif = sk->sk_bound_dev_if;
535 		mark = sk->sk_mark;
536 		tos = RT_CONN_FLAGS(sk);
537 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
538 	}
539 	flowi4_init_output(fl4, oif, mark, tos,
540 			   RT_SCOPE_UNIVERSE, prot,
541 			   flow_flags,
542 			   iph->daddr, iph->saddr, 0, 0,
543 			   sock_net_uid(net, sk));
544 }
545 
546 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
547 			       const struct sock *sk)
548 {
549 	const struct net *net = dev_net(skb->dev);
550 	const struct iphdr *iph = ip_hdr(skb);
551 	int oif = skb->dev->ifindex;
552 	u8 tos = RT_TOS(iph->tos);
553 	u8 prot = iph->protocol;
554 	u32 mark = skb->mark;
555 
556 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
557 }
558 
559 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
560 {
561 	const struct inet_sock *inet = inet_sk(sk);
562 	const struct ip_options_rcu *inet_opt;
563 	__be32 daddr = inet->inet_daddr;
564 
565 	rcu_read_lock();
566 	inet_opt = rcu_dereference(inet->inet_opt);
567 	if (inet_opt && inet_opt->opt.srr)
568 		daddr = inet_opt->opt.faddr;
569 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
570 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
571 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
572 			   inet_sk_flowi_flags(sk),
573 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
574 	rcu_read_unlock();
575 }
576 
577 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
578 				 const struct sk_buff *skb)
579 {
580 	if (skb)
581 		build_skb_flow_key(fl4, skb, sk);
582 	else
583 		build_sk_flow_key(fl4, sk);
584 }
585 
586 static DEFINE_SPINLOCK(fnhe_lock);
587 
588 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
589 {
590 	struct rtable *rt;
591 
592 	rt = rcu_dereference(fnhe->fnhe_rth_input);
593 	if (rt) {
594 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
595 		dst_dev_put(&rt->dst);
596 		dst_release(&rt->dst);
597 	}
598 	rt = rcu_dereference(fnhe->fnhe_rth_output);
599 	if (rt) {
600 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
601 		dst_dev_put(&rt->dst);
602 		dst_release(&rt->dst);
603 	}
604 }
605 
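/* Return the least recently updated exception in this bucket, flushing its
 * cached routes so that the slot can be reused by the caller.
 */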
606 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
607 {
608 	struct fib_nh_exception *fnhe, *oldest;
609 
610 	oldest = rcu_dereference(hash->chain);
611 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
612 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
613 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
614 			oldest = fnhe;
615 	}
616 	fnhe_flush_routes(oldest);
617 	return oldest;
618 }
619 
620 static inline u32 fnhe_hashfun(__be32 daddr)
621 {
622 	static u32 fnhe_hashrnd __read_mostly;
623 	u32 hval;
624 
625 	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
626 	hval = jhash_1word((__force u32)daddr, fnhe_hashrnd);
627 	return hash_32(hval, FNHE_HASH_SHIFT);
628 }
629 
630 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
631 {
632 	rt->rt_pmtu = fnhe->fnhe_pmtu;
633 	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
634 	rt->dst.expires = fnhe->fnhe_expires;
635 
636 	if (fnhe->fnhe_gw) {
637 		rt->rt_flags |= RTCF_REDIRECTED;
638 		rt->rt_uses_gateway = 1;
639 		rt->rt_gw_family = AF_INET;
640 		rt->rt_gw4 = fnhe->fnhe_gw;
641 	}
642 }
643 
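/* Record (or refresh) a next-hop exception for @daddr: a learned redirect
 * gateway and/or path MTU, with its expiry.  When a new exception is created,
 * routes already cached on the next hop are marked DST_OBSOLETE_KILL so their
 * users re-check whether the exception applies to them.
 */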
644 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
645 				  __be32 gw, u32 pmtu, bool lock,
646 				  unsigned long expires)
647 {
648 	struct fnhe_hash_bucket *hash;
649 	struct fib_nh_exception *fnhe;
650 	struct rtable *rt;
651 	u32 genid, hval;
652 	unsigned int i;
653 	int depth;
654 
655 	genid = fnhe_genid(dev_net(nhc->nhc_dev));
656 	hval = fnhe_hashfun(daddr);
657 
658 	spin_lock_bh(&fnhe_lock);
659 
660 	hash = rcu_dereference(nhc->nhc_exceptions);
661 	if (!hash) {
662 		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
663 		if (!hash)
664 			goto out_unlock;
665 		rcu_assign_pointer(nhc->nhc_exceptions, hash);
666 	}
667 
668 	hash += hval;
669 
670 	depth = 0;
671 	for (fnhe = rcu_dereference(hash->chain); fnhe;
672 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
673 		if (fnhe->fnhe_daddr == daddr)
674 			break;
675 		depth++;
676 	}
677 
678 	if (fnhe) {
679 		if (fnhe->fnhe_genid != genid)
680 			fnhe->fnhe_genid = genid;
681 		if (gw)
682 			fnhe->fnhe_gw = gw;
683 		if (pmtu) {
684 			fnhe->fnhe_pmtu = pmtu;
685 			fnhe->fnhe_mtu_locked = lock;
686 		}
687 		fnhe->fnhe_expires = max(1UL, expires);
688 		/* Update all cached dsts too */
689 		rt = rcu_dereference(fnhe->fnhe_rth_input);
690 		if (rt)
691 			fill_route_from_fnhe(rt, fnhe);
692 		rt = rcu_dereference(fnhe->fnhe_rth_output);
693 		if (rt)
694 			fill_route_from_fnhe(rt, fnhe);
695 	} else {
696 		if (depth > FNHE_RECLAIM_DEPTH)
697 			fnhe = fnhe_oldest(hash);
698 		else {
699 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
700 			if (!fnhe)
701 				goto out_unlock;
702 
703 			fnhe->fnhe_next = hash->chain;
704 			rcu_assign_pointer(hash->chain, fnhe);
705 		}
706 		fnhe->fnhe_genid = genid;
707 		fnhe->fnhe_daddr = daddr;
708 		fnhe->fnhe_gw = gw;
709 		fnhe->fnhe_pmtu = pmtu;
710 		fnhe->fnhe_mtu_locked = lock;
711 		fnhe->fnhe_expires = max(1UL, expires);
712 
713 		/* Exception created; mark the cached routes for the nexthop
714 		 * stale, so anyone caching it rechecks if this exception
715 		 * applies to them.
716 		 */
717 		rt = rcu_dereference(nhc->nhc_rth_input);
718 		if (rt)
719 			rt->dst.obsolete = DST_OBSOLETE_KILL;
720 
721 		for_each_possible_cpu(i) {
722 			struct rtable __rcu **prt;
723 			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
724 			rt = rcu_dereference(*prt);
725 			if (rt)
726 				rt->dst.obsolete = DST_OBSOLETE_KILL;
727 		}
728 	}
729 
730 	fnhe->fnhe_stamp = jiffies;
731 
732 out_unlock:
733 	spin_unlock_bh(&fnhe_lock);
734 }
735 
736 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
737 			     bool kill_route)
738 {
739 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
740 	__be32 old_gw = ip_hdr(skb)->saddr;
741 	struct net_device *dev = skb->dev;
742 	struct in_device *in_dev;
743 	struct fib_result res;
744 	struct neighbour *n;
745 	struct net *net;
746 
747 	switch (icmp_hdr(skb)->code & 7) {
748 	case ICMP_REDIR_NET:
749 	case ICMP_REDIR_NETTOS:
750 	case ICMP_REDIR_HOST:
751 	case ICMP_REDIR_HOSTTOS:
752 		break;
753 
754 	default:
755 		return;
756 	}
757 
758 	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
759 		return;
760 
761 	in_dev = __in_dev_get_rcu(dev);
762 	if (!in_dev)
763 		return;
764 
765 	net = dev_net(dev);
766 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
767 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
768 	    ipv4_is_zeronet(new_gw))
769 		goto reject_redirect;
770 
771 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
772 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
773 			goto reject_redirect;
774 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
775 			goto reject_redirect;
776 	} else {
777 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
778 			goto reject_redirect;
779 	}
780 
781 	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
782 	if (!n)
783 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
784 	if (!IS_ERR(n)) {
785 		if (!(n->nud_state & NUD_VALID)) {
786 			neigh_event_send(n, NULL);
787 		} else {
788 			if (fib_lookup(net, fl4, &res, 0) == 0) {
789 				struct fib_nh_common *nhc = FIB_RES_NHC(res);
790 
791 				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
792 						0, false,
793 						jiffies + ip_rt_gc_timeout);
794 			}
795 			if (kill_route)
796 				rt->dst.obsolete = DST_OBSOLETE_KILL;
797 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
798 		}
799 		neigh_release(n);
800 	}
801 	return;
802 
803 reject_redirect:
804 #ifdef CONFIG_IP_ROUTE_VERBOSE
805 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
806 		const struct iphdr *iph = (const struct iphdr *) skb->data;
807 		__be32 daddr = iph->daddr;
808 		__be32 saddr = iph->saddr;
809 
810 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
811 				     "  Advised path = %pI4 -> %pI4\n",
812 				     &old_gw, dev->name, &new_gw,
813 				     &saddr, &daddr);
814 	}
815 #endif
816 	;
817 }
818 
819 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
820 {
821 	struct rtable *rt;
822 	struct flowi4 fl4;
823 	const struct iphdr *iph = (const struct iphdr *) skb->data;
824 	struct net *net = dev_net(skb->dev);
825 	int oif = skb->dev->ifindex;
826 	u8 tos = RT_TOS(iph->tos);
827 	u8 prot = iph->protocol;
828 	u32 mark = skb->mark;
829 
830 	rt = (struct rtable *) dst;
831 
832 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
833 	__ip_do_redirect(rt, skb, &fl4, true);
834 }
835 
836 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
837 {
838 	struct rtable *rt = (struct rtable *)dst;
839 	struct dst_entry *ret = dst;
840 
841 	if (rt) {
842 		if (dst->obsolete > 0) {
843 			ip_rt_put(rt);
844 			ret = NULL;
845 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
846 			   rt->dst.expires) {
847 			ip_rt_put(rt);
848 			ret = NULL;
849 		}
850 	}
851 	return ret;
852 }
853 
854 /*
855  * Algorithm:
856  *	1. The first ip_rt_redirect_number redirects are sent
857  *	   with exponential backoff, then we stop sending them at all,
858  *	   assuming that the host ignores our redirects.
859  *	2. If we did not see packets requiring redirects
860  *	   during ip_rt_redirect_silence, we assume that the host
861  *	   forgot the redirected route, and we start sending redirects again.
862  *
863  * This algorithm is much cheaper and more intelligent than dumb load limiting
864  * in icmp.c.
865  *
866  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
867  * and "frag. need" (breaks PMTU discovery) in icmp.c.
868  */
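/*
 * With the default tunables this means: the first redirect to a peer is sent
 * immediately, each following one only after an exponentially growing delay
 * (ip_rt_redirect_load << n_redirects jiffies since the previous one).  Once
 * ip_rt_redirect_number (9) redirects have been ignored we go silent, and
 * only start over after ip_rt_redirect_silence (about 20 seconds) without
 * seeing redirect-worthy traffic from that peer.
 */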
869 
870 void ip_rt_send_redirect(struct sk_buff *skb)
871 {
872 	struct rtable *rt = skb_rtable(skb);
873 	struct in_device *in_dev;
874 	struct inet_peer *peer;
875 	struct net *net;
876 	int log_martians;
877 	int vif;
878 
879 	rcu_read_lock();
880 	in_dev = __in_dev_get_rcu(rt->dst.dev);
881 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
882 		rcu_read_unlock();
883 		return;
884 	}
885 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
886 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
887 	rcu_read_unlock();
888 
889 	net = dev_net(rt->dst.dev);
890 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
891 	if (!peer) {
892 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
893 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
894 		return;
895 	}
896 
897 	/* No redirected packets during ip_rt_redirect_silence;
898 	 * reset the algorithm.
899 	 */
900 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
901 		peer->rate_tokens = 0;
902 		peer->n_redirects = 0;
903 	}
904 
905 	/* Too many ignored redirects; do not send anything and
906 	 * set peer->rate_last to the last seen redirect-worthy packet.
907 	 */
908 	if (peer->n_redirects >= ip_rt_redirect_number) {
909 		peer->rate_last = jiffies;
910 		goto out_put_peer;
911 	}
912 
913 	/* Check for load limit; set rate_last to the latest sent
914 	 * redirect.
915 	 */
916 	if (peer->n_redirects == 0 ||
917 	    time_after(jiffies,
918 		       (peer->rate_last +
919 			(ip_rt_redirect_load << peer->n_redirects)))) {
920 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
921 
922 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
923 		peer->rate_last = jiffies;
924 		++peer->n_redirects;
925 #ifdef CONFIG_IP_ROUTE_VERBOSE
926 		if (log_martians &&
927 		    peer->n_redirects == ip_rt_redirect_number)
928 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
929 					     &ip_hdr(skb)->saddr, inet_iif(skb),
930 					     &ip_hdr(skb)->daddr, &gw);
931 #endif
932 	}
933 out_put_peer:
934 	inet_putpeer(peer);
935 }
936 
937 static int ip_error(struct sk_buff *skb)
938 {
939 	struct rtable *rt = skb_rtable(skb);
940 	struct net_device *dev = skb->dev;
941 	struct in_device *in_dev;
942 	struct inet_peer *peer;
943 	unsigned long now;
944 	struct net *net;
945 	bool send;
946 	int code;
947 
948 	if (netif_is_l3_master(skb->dev)) {
949 		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
950 		if (!dev)
951 			goto out;
952 	}
953 
954 	in_dev = __in_dev_get_rcu(dev);
955 
956 	/* IP on this device is disabled. */
957 	if (!in_dev)
958 		goto out;
959 
960 	net = dev_net(rt->dst.dev);
961 	if (!IN_DEV_FORWARD(in_dev)) {
962 		switch (rt->dst.error) {
963 		case EHOSTUNREACH:
964 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
965 			break;
966 
967 		case ENETUNREACH:
968 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
969 			break;
970 		}
971 		goto out;
972 	}
973 
974 	switch (rt->dst.error) {
975 	case EINVAL:
976 	default:
977 		goto out;
978 	case EHOSTUNREACH:
979 		code = ICMP_HOST_UNREACH;
980 		break;
981 	case ENETUNREACH:
982 		code = ICMP_NET_UNREACH;
983 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
984 		break;
985 	case EACCES:
986 		code = ICMP_PKT_FILTERED;
987 		break;
988 	}
989 
990 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
991 			       l3mdev_master_ifindex(skb->dev), 1);
992 
993 	send = true;
994 	if (peer) {
995 		now = jiffies;
996 		peer->rate_tokens += now - peer->rate_last;
997 		if (peer->rate_tokens > ip_rt_error_burst)
998 			peer->rate_tokens = ip_rt_error_burst;
999 		peer->rate_last = now;
1000 		if (peer->rate_tokens >= ip_rt_error_cost)
1001 			peer->rate_tokens -= ip_rt_error_cost;
1002 		else
1003 			send = false;
1004 		inet_putpeer(peer);
1005 	}
1006 	if (send)
1007 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1008 
1009 out:	kfree_skb(skb);
1010 	return 0;
1011 }
1012 
1013 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1014 {
1015 	struct dst_entry *dst = &rt->dst;
1016 	struct fib_result res;
1017 	bool lock = false;
1018 	u32 old_mtu;
1019 
1020 	if (ip_mtu_locked(dst))
1021 		return;
1022 
1023 	old_mtu = ipv4_mtu(dst);
1024 	if (old_mtu < mtu)
1025 		return;
1026 
1027 	if (mtu < ip_rt_min_pmtu) {
1028 		lock = true;
1029 		mtu = min(old_mtu, ip_rt_min_pmtu);
1030 	}
1031 
1032 	if (rt->rt_pmtu == mtu && !lock &&
1033 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1034 		return;
1035 
1036 	rcu_read_lock();
1037 	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1038 		struct fib_nh_common *nhc = FIB_RES_NHC(res);
1039 
1040 		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1041 				      jiffies + ip_rt_mtu_expires);
1042 	}
1043 	rcu_read_unlock();
1044 }
1045 
1046 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1047 			      struct sk_buff *skb, u32 mtu,
1048 			      bool confirm_neigh)
1049 {
1050 	struct rtable *rt = (struct rtable *) dst;
1051 	struct flowi4 fl4;
1052 
1053 	ip_rt_build_flow_key(&fl4, sk, skb);
1054 
1055 	/* Don't make lookup fail for bridged encapsulations */
1056 	if (skb && netif_is_any_bridge_port(skb->dev))
1057 		fl4.flowi4_oif = 0;
1058 
1059 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1060 }
1061 
1062 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1063 		      int oif, u8 protocol)
1064 {
1065 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1066 	struct flowi4 fl4;
1067 	struct rtable *rt;
1068 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1069 
1070 	__build_flow_key(net, &fl4, NULL, iph, oif,
1071 			 RT_TOS(iph->tos), protocol, mark, 0);
1072 	rt = __ip_route_output_key(net, &fl4);
1073 	if (!IS_ERR(rt)) {
1074 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1075 		ip_rt_put(rt);
1076 	}
1077 }
1078 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1079 
1080 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1081 {
1082 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1083 	struct flowi4 fl4;
1084 	struct rtable *rt;
1085 
1086 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1087 
1088 	if (!fl4.flowi4_mark)
1089 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1090 
1091 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1092 	if (!IS_ERR(rt)) {
1093 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1094 		ip_rt_put(rt);
1095 	}
1096 }
1097 
1098 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1099 {
1100 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1101 	struct flowi4 fl4;
1102 	struct rtable *rt;
1103 	struct dst_entry *odst = NULL;
1104 	bool new = false;
1105 	struct net *net = sock_net(sk);
1106 
1107 	bh_lock_sock(sk);
1108 
1109 	if (!ip_sk_accept_pmtu(sk))
1110 		goto out;
1111 
1112 	odst = sk_dst_get(sk);
1113 
1114 	if (sock_owned_by_user(sk) || !odst) {
1115 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1116 		goto out;
1117 	}
1118 
1119 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1120 
1121 	rt = (struct rtable *)odst;
1122 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1123 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1124 		if (IS_ERR(rt))
1125 			goto out;
1126 
1127 		new = true;
1128 	}
1129 
1130 	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1131 
1132 	if (!dst_check(&rt->dst, 0)) {
1133 		if (new)
1134 			dst_release(&rt->dst);
1135 
1136 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1137 		if (IS_ERR(rt))
1138 			goto out;
1139 
1140 		new = true;
1141 	}
1142 
1143 	if (new)
1144 		sk_dst_set(sk, &rt->dst);
1145 
1146 out:
1147 	bh_unlock_sock(sk);
1148 	dst_release(odst);
1149 }
1150 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1151 
1152 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1153 		   int oif, u8 protocol)
1154 {
1155 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1156 	struct flowi4 fl4;
1157 	struct rtable *rt;
1158 
1159 	__build_flow_key(net, &fl4, NULL, iph, oif,
1160 			 RT_TOS(iph->tos), protocol, 0, 0);
1161 	rt = __ip_route_output_key(net, &fl4);
1162 	if (!IS_ERR(rt)) {
1163 		__ip_do_redirect(rt, skb, &fl4, false);
1164 		ip_rt_put(rt);
1165 	}
1166 }
1167 EXPORT_SYMBOL_GPL(ipv4_redirect);
1168 
1169 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1170 {
1171 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1172 	struct flowi4 fl4;
1173 	struct rtable *rt;
1174 	struct net *net = sock_net(sk);
1175 
1176 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1177 	rt = __ip_route_output_key(net, &fl4);
1178 	if (!IS_ERR(rt)) {
1179 		__ip_do_redirect(rt, skb, &fl4, false);
1180 		ip_rt_put(rt);
1181 	}
1182 }
1183 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1184 
1185 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1186 {
1187 	struct rtable *rt = (struct rtable *) dst;
1188 
1189 	/* All IPv4 dsts are created with ->obsolete set to the value
1190 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1191 	 * into this function always.
1192 	 *
1193 	 * When a PMTU/redirect information update invalidates a route,
1194 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1195 	 * DST_OBSOLETE_DEAD.
1196 	 */
1197 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1198 		return NULL;
1199 	return dst;
1200 }
1201 
1202 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1203 {
1204 	struct ip_options opt;
1205 	int res;
1206 
1207 	/* Recompile ip options since IPCB may not be valid anymore.
1208 	 * Also check we have a reasonable ipv4 header.
1209 	 */
1210 	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1211 	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1212 		return;
1213 
1214 	memset(&opt, 0, sizeof(opt));
1215 	if (ip_hdr(skb)->ihl > 5) {
1216 		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1217 			return;
1218 		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1219 
1220 		rcu_read_lock();
1221 		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1222 		rcu_read_unlock();
1223 
1224 		if (res)
1225 			return;
1226 	}
1227 	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1228 }
1229 
1230 static void ipv4_link_failure(struct sk_buff *skb)
1231 {
1232 	struct rtable *rt;
1233 
1234 	ipv4_send_dest_unreach(skb);
1235 
1236 	rt = skb_rtable(skb);
1237 	if (rt)
1238 		dst_set_expires(&rt->dst, 0);
1239 }
1240 
1241 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1242 {
1243 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1244 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1245 		 skb->dev ? skb->dev->name : "?");
1246 	kfree_skb(skb);
1247 	WARN_ON(1);
1248 	return 0;
1249 }
1250 
1251 /*
1252    We do not cache the source address of the outgoing interface,
1253    because it is used only by the IP RR, TS and SRR options,
1254    so it is out of the fast path.
1255 
1256    BTW remember: "addr" is allowed to be unaligned
1257    in IP options!
1258  */
1259 
1260 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1261 {
1262 	__be32 src;
1263 
1264 	if (rt_is_output_route(rt))
1265 		src = ip_hdr(skb)->saddr;
1266 	else {
1267 		struct fib_result res;
1268 		struct iphdr *iph = ip_hdr(skb);
1269 		struct flowi4 fl4 = {
1270 			.daddr = iph->daddr,
1271 			.saddr = iph->saddr,
1272 			.flowi4_tos = RT_TOS(iph->tos),
1273 			.flowi4_oif = rt->dst.dev->ifindex,
1274 			.flowi4_iif = skb->dev->ifindex,
1275 			.flowi4_mark = skb->mark,
1276 		};
1277 
1278 		rcu_read_lock();
1279 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1280 			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1281 		else
1282 			src = inet_select_addr(rt->dst.dev,
1283 					       rt_nexthop(rt, iph->daddr),
1284 					       RT_SCOPE_UNIVERSE);
1285 		rcu_read_unlock();
1286 	}
1287 	memcpy(addr, &src, 4);
1288 }
1289 
1290 #ifdef CONFIG_IP_ROUTE_CLASSID
1291 static void set_class_tag(struct rtable *rt, u32 tag)
1292 {
1293 	if (!(rt->dst.tclassid & 0xFFFF))
1294 		rt->dst.tclassid |= tag & 0xFFFF;
1295 	if (!(rt->dst.tclassid & 0xFFFF0000))
1296 		rt->dst.tclassid |= tag & 0xFFFF0000;
1297 }
1298 #endif
1299 
1300 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1301 {
1302 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1303 	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1304 				    ip_rt_min_advmss);
1305 
1306 	return min(advmss, IPV4_MAX_PMTU - header_size);
1307 }
1308 
1309 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1310 {
1311 	const struct rtable *rt = (const struct rtable *)dst;
1312 	unsigned int mtu = rt->rt_pmtu;
1313 
1314 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1315 		mtu = dst_metric_raw(dst, RTAX_MTU);
1316 
1317 	if (mtu)
1318 		return mtu;
1319 
1320 	mtu = READ_ONCE(dst->dev->mtu);
1321 
1322 	if (unlikely(ip_mtu_locked(dst))) {
1323 		if (rt->rt_uses_gateway && mtu > 576)
1324 			mtu = 576;
1325 	}
1326 
1327 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1328 
1329 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1330 }
1331 
1332 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1333 {
1334 	struct fnhe_hash_bucket *hash;
1335 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1336 	u32 hval = fnhe_hashfun(daddr);
1337 
1338 	spin_lock_bh(&fnhe_lock);
1339 
1340 	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1341 					 lockdep_is_held(&fnhe_lock));
1342 	hash += hval;
1343 
1344 	fnhe_p = &hash->chain;
1345 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1346 	while (fnhe) {
1347 		if (fnhe->fnhe_daddr == daddr) {
1348 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1349 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1350 			/* set fnhe_daddr to 0 to ensure it won't bind with
1351 			 * new dsts in rt_bind_exception().
1352 			 */
1353 			fnhe->fnhe_daddr = 0;
1354 			fnhe_flush_routes(fnhe);
1355 			kfree_rcu(fnhe, rcu);
1356 			break;
1357 		}
1358 		fnhe_p = &fnhe->fnhe_next;
1359 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1360 						 lockdep_is_held(&fnhe_lock));
1361 	}
1362 
1363 	spin_unlock_bh(&fnhe_lock);
1364 }
1365 
1366 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1367 					       __be32 daddr)
1368 {
1369 	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1370 	struct fib_nh_exception *fnhe;
1371 	u32 hval;
1372 
1373 	if (!hash)
1374 		return NULL;
1375 
1376 	hval = fnhe_hashfun(daddr);
1377 
1378 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1379 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1380 		if (fnhe->fnhe_daddr == daddr) {
1381 			if (fnhe->fnhe_expires &&
1382 			    time_after(jiffies, fnhe->fnhe_expires)) {
1383 				ip_del_fnhe(nhc, daddr);
1384 				break;
1385 			}
1386 			return fnhe;
1387 		}
1388 	}
1389 	return NULL;
1390 }
1391 
1392 /* MTU selection:
1393  * 1. mtu on route is locked - use it
1394  * 2. mtu from nexthop exception
1395  * 3. mtu from egress device
1396  */
1397 
1398 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1399 {
1400 	struct fib_nh_common *nhc = res->nhc;
1401 	struct net_device *dev = nhc->nhc_dev;
1402 	struct fib_info *fi = res->fi;
1403 	u32 mtu = 0;
1404 
1405 	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1406 	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1407 		mtu = fi->fib_mtu;
1408 
1409 	if (likely(!mtu)) {
1410 		struct fib_nh_exception *fnhe;
1411 
1412 		fnhe = find_exception(nhc, daddr);
1413 		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1414 			mtu = fnhe->fnhe_pmtu;
1415 	}
1416 
1417 	if (likely(!mtu))
1418 		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1419 
1420 	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1421 }
1422 
1423 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1424 			      __be32 daddr, const bool do_cache)
1425 {
1426 	bool ret = false;
1427 
1428 	spin_lock_bh(&fnhe_lock);
1429 
1430 	if (daddr == fnhe->fnhe_daddr) {
1431 		struct rtable __rcu **porig;
1432 		struct rtable *orig;
1433 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1434 
1435 		if (rt_is_input_route(rt))
1436 			porig = &fnhe->fnhe_rth_input;
1437 		else
1438 			porig = &fnhe->fnhe_rth_output;
1439 		orig = rcu_dereference(*porig);
1440 
1441 		if (fnhe->fnhe_genid != genid) {
1442 			fnhe->fnhe_genid = genid;
1443 			fnhe->fnhe_gw = 0;
1444 			fnhe->fnhe_pmtu = 0;
1445 			fnhe->fnhe_expires = 0;
1446 			fnhe->fnhe_mtu_locked = false;
1447 			fnhe_flush_routes(fnhe);
1448 			orig = NULL;
1449 		}
1450 		fill_route_from_fnhe(rt, fnhe);
1451 		if (!rt->rt_gw4) {
1452 			rt->rt_gw4 = daddr;
1453 			rt->rt_gw_family = AF_INET;
1454 		}
1455 
1456 		if (do_cache) {
1457 			dst_hold(&rt->dst);
1458 			rcu_assign_pointer(*porig, rt);
1459 			if (orig) {
1460 				dst_dev_put(&orig->dst);
1461 				dst_release(&orig->dst);
1462 			}
1463 			ret = true;
1464 		}
1465 
1466 		fnhe->fnhe_stamp = jiffies;
1467 	}
1468 	spin_unlock_bh(&fnhe_lock);
1469 
1470 	return ret;
1471 }
1472 
1473 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1474 {
1475 	struct rtable *orig, *prev, **p;
1476 	bool ret = true;
1477 
1478 	if (rt_is_input_route(rt)) {
1479 		p = (struct rtable **)&nhc->nhc_rth_input;
1480 	} else {
1481 		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1482 	}
1483 	orig = *p;
1484 
1485 	/* hold dst before doing cmpxchg() to avoid race condition
1486 	 * on this dst
1487 	 */
1488 	dst_hold(&rt->dst);
1489 	prev = cmpxchg(p, orig, rt);
1490 	if (prev == orig) {
1491 		if (orig) {
1492 			rt_add_uncached_list(orig);
1493 			dst_release(&orig->dst);
1494 		}
1495 	} else {
1496 		dst_release(&rt->dst);
1497 		ret = false;
1498 	}
1499 
1500 	return ret;
1501 }
1502 
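/* Per-cpu lists of rtables that are not stored in a next-hop cache (e.g. when
 * caching failed or was not desired).  They are tracked here so that
 * rt_flush_dev() can still find them and re-point them at blackhole_netdev
 * when their device goes away.
 */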
1503 struct uncached_list {
1504 	spinlock_t		lock;
1505 	struct list_head	head;
1506 };
1507 
1508 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1509 
1510 void rt_add_uncached_list(struct rtable *rt)
1511 {
1512 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1513 
1514 	rt->rt_uncached_list = ul;
1515 
1516 	spin_lock_bh(&ul->lock);
1517 	list_add_tail(&rt->rt_uncached, &ul->head);
1518 	spin_unlock_bh(&ul->lock);
1519 }
1520 
1521 void rt_del_uncached_list(struct rtable *rt)
1522 {
1523 	if (!list_empty(&rt->rt_uncached)) {
1524 		struct uncached_list *ul = rt->rt_uncached_list;
1525 
1526 		spin_lock_bh(&ul->lock);
1527 		list_del(&rt->rt_uncached);
1528 		spin_unlock_bh(&ul->lock);
1529 	}
1530 }
1531 
1532 static void ipv4_dst_destroy(struct dst_entry *dst)
1533 {
1534 	struct rtable *rt = (struct rtable *)dst;
1535 
1536 	ip_dst_metrics_put(dst);
1537 	rt_del_uncached_list(rt);
1538 }
1539 
1540 void rt_flush_dev(struct net_device *dev)
1541 {
1542 	struct rtable *rt;
1543 	int cpu;
1544 
1545 	for_each_possible_cpu(cpu) {
1546 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1547 
1548 		spin_lock_bh(&ul->lock);
1549 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1550 			if (rt->dst.dev != dev)
1551 				continue;
1552 			rt->dst.dev = blackhole_netdev;
1553 			dev_hold(rt->dst.dev);
1554 			dev_put(dev);
1555 		}
1556 		spin_unlock_bh(&ul->lock);
1557 	}
1558 }
1559 
1560 static bool rt_cache_valid(const struct rtable *rt)
1561 {
1562 	return	rt &&
1563 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1564 		!rt_is_expired(rt);
1565 }
1566 
1567 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1568 			   const struct fib_result *res,
1569 			   struct fib_nh_exception *fnhe,
1570 			   struct fib_info *fi, u16 type, u32 itag,
1571 			   const bool do_cache)
1572 {
1573 	bool cached = false;
1574 
1575 	if (fi) {
1576 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1577 
1578 		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1579 			rt->rt_uses_gateway = 1;
1580 			rt->rt_gw_family = nhc->nhc_gw_family;
1581 			/* only INET and INET6 are supported */
1582 			if (likely(nhc->nhc_gw_family == AF_INET))
1583 				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1584 			else
1585 				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1586 		}
1587 
1588 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1589 
1590 #ifdef CONFIG_IP_ROUTE_CLASSID
1591 		if (nhc->nhc_family == AF_INET) {
1592 			struct fib_nh *nh;
1593 
1594 			nh = container_of(nhc, struct fib_nh, nh_common);
1595 			rt->dst.tclassid = nh->nh_tclassid;
1596 		}
1597 #endif
1598 		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1599 		if (unlikely(fnhe))
1600 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1601 		else if (do_cache)
1602 			cached = rt_cache_route(nhc, rt);
1603 		if (unlikely(!cached)) {
1604 			/* Routes we intend to cache in nexthop exception or
1605 			 * FIB nexthop have the DST_NOCACHE bit clear.
1606 			 * However, if we are unsuccessful at storing this
1607 			 * route into the cache we really need to set it.
1608 			 */
1609 			if (!rt->rt_gw4) {
1610 				rt->rt_gw_family = AF_INET;
1611 				rt->rt_gw4 = daddr;
1612 			}
1613 			rt_add_uncached_list(rt);
1614 		}
1615 	} else
1616 		rt_add_uncached_list(rt);
1617 
1618 #ifdef CONFIG_IP_ROUTE_CLASSID
1619 #ifdef CONFIG_IP_MULTIPLE_TABLES
1620 	set_class_tag(rt, res->tclassid);
1621 #endif
1622 	set_class_tag(rt, itag);
1623 #endif
1624 }
1625 
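/* Allocate and minimally initialise an rtable.  The output handler defaults
 * to ip_output(), and RTCF_LOCAL routes get ip_local_deliver() as their input
 * handler; callers fill in the rest (gateway, forwarding input handler, ...).
 */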
1626 struct rtable *rt_dst_alloc(struct net_device *dev,
1627 			    unsigned int flags, u16 type,
1628 			    bool nopolicy, bool noxfrm)
1629 {
1630 	struct rtable *rt;
1631 
1632 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1633 		       (nopolicy ? DST_NOPOLICY : 0) |
1634 		       (noxfrm ? DST_NOXFRM : 0));
1635 
1636 	if (rt) {
1637 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1638 		rt->rt_flags = flags;
1639 		rt->rt_type = type;
1640 		rt->rt_is_input = 0;
1641 		rt->rt_iif = 0;
1642 		rt->rt_pmtu = 0;
1643 		rt->rt_mtu_locked = 0;
1644 		rt->rt_uses_gateway = 0;
1645 		rt->rt_gw_family = 0;
1646 		rt->rt_gw4 = 0;
1647 		INIT_LIST_HEAD(&rt->rt_uncached);
1648 
1649 		rt->dst.output = ip_output;
1650 		if (flags & RTCF_LOCAL)
1651 			rt->dst.input = ip_local_deliver;
1652 	}
1653 
1654 	return rt;
1655 }
1656 EXPORT_SYMBOL(rt_dst_alloc);
1657 
1658 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1659 {
1660 	struct rtable *new_rt;
1661 
1662 	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1663 			   rt->dst.flags);
1664 
1665 	if (new_rt) {
1666 		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1667 		new_rt->rt_flags = rt->rt_flags;
1668 		new_rt->rt_type = rt->rt_type;
1669 		new_rt->rt_is_input = rt->rt_is_input;
1670 		new_rt->rt_iif = rt->rt_iif;
1671 		new_rt->rt_pmtu = rt->rt_pmtu;
1672 		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1673 		new_rt->rt_gw_family = rt->rt_gw_family;
1674 		if (rt->rt_gw_family == AF_INET)
1675 			new_rt->rt_gw4 = rt->rt_gw4;
1676 		else if (rt->rt_gw_family == AF_INET6)
1677 			new_rt->rt_gw6 = rt->rt_gw6;
1678 		INIT_LIST_HEAD(&new_rt->rt_uncached);
1679 
1680 		new_rt->dst.input = rt->dst.input;
1681 		new_rt->dst.output = rt->dst.output;
1682 		new_rt->dst.error = rt->dst.error;
1683 		new_rt->dst.lastuse = jiffies;
1684 		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1685 	}
1686 	return new_rt;
1687 }
1688 EXPORT_SYMBOL(rt_dst_clone);
1689 
1690 /* called in rcu_read_lock() section */
1691 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1692 			  u8 tos, struct net_device *dev,
1693 			  struct in_device *in_dev, u32 *itag)
1694 {
1695 	int err;
1696 
1697 	/* Primary sanity checks. */
1698 	if (!in_dev)
1699 		return -EINVAL;
1700 
1701 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1702 	    skb->protocol != htons(ETH_P_IP))
1703 		return -EINVAL;
1704 
1705 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1706 		return -EINVAL;
1707 
1708 	if (ipv4_is_zeronet(saddr)) {
1709 		if (!ipv4_is_local_multicast(daddr) &&
1710 		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1711 			return -EINVAL;
1712 	} else {
1713 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1714 					  in_dev, itag);
1715 		if (err < 0)
1716 			return err;
1717 	}
1718 	return 0;
1719 }
1720 
1721 /* called in rcu_read_lock() section */
1722 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1723 			     u8 tos, struct net_device *dev, int our)
1724 {
1725 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1726 	unsigned int flags = RTCF_MULTICAST;
1727 	struct rtable *rth;
1728 	u32 itag = 0;
1729 	int err;
1730 
1731 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1732 	if (err)
1733 		return err;
1734 
1735 	if (our)
1736 		flags |= RTCF_LOCAL;
1737 
1738 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1739 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1740 	if (!rth)
1741 		return -ENOBUFS;
1742 
1743 #ifdef CONFIG_IP_ROUTE_CLASSID
1744 	rth->dst.tclassid = itag;
1745 #endif
1746 	rth->dst.output = ip_rt_bug;
1747 	rth->rt_is_input = 1;
1748 
1749 #ifdef CONFIG_IP_MROUTE
1750 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1751 		rth->dst.input = ip_mr_input;
1752 #endif
1753 	RT_CACHE_STAT_INC(in_slow_mc);
1754 
1755 	skb_dst_set(skb, &rth->dst);
1756 	return 0;
1757 }
1758 
1759 
1760 static void ip_handle_martian_source(struct net_device *dev,
1761 				     struct in_device *in_dev,
1762 				     struct sk_buff *skb,
1763 				     __be32 daddr,
1764 				     __be32 saddr)
1765 {
1766 	RT_CACHE_STAT_INC(in_martian_src);
1767 #ifdef CONFIG_IP_ROUTE_VERBOSE
1768 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1769 		/*
1770 		 *	RFC1812 recommendation: if the source is martian,
1771 		 *	the only hint is the MAC header.
1772 		 */
1773 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1774 			&daddr, &saddr, dev->name);
1775 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1776 			print_hex_dump(KERN_WARNING, "ll header: ",
1777 				       DUMP_PREFIX_OFFSET, 16, 1,
1778 				       skb_mac_header(skb),
1779 				       dev->hard_header_len, false);
1780 		}
1781 	}
1782 #endif
1783 }
1784 
1785 /* called in rcu_read_lock() section */
1786 static int __mkroute_input(struct sk_buff *skb,
1787 			   const struct fib_result *res,
1788 			   struct in_device *in_dev,
1789 			   __be32 daddr, __be32 saddr, u32 tos)
1790 {
1791 	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1792 	struct net_device *dev = nhc->nhc_dev;
1793 	struct fib_nh_exception *fnhe;
1794 	struct rtable *rth;
1795 	int err;
1796 	struct in_device *out_dev;
1797 	bool do_cache;
1798 	u32 itag = 0;
1799 
1800 	/* get a working reference to the output device */
1801 	out_dev = __in_dev_get_rcu(dev);
1802 	if (!out_dev) {
1803 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1804 		return -EINVAL;
1805 	}
1806 
1807 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1808 				  in_dev->dev, in_dev, &itag);
1809 	if (err < 0) {
1810 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1811 					 saddr);
1812 
1813 		goto cleanup;
1814 	}
1815 
1816 	do_cache = res->fi && !itag;
1817 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1818 	    skb->protocol == htons(ETH_P_IP)) {
1819 		__be32 gw;
1820 
1821 		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1822 		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1823 		    inet_addr_onlink(out_dev, saddr, gw))
1824 			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1825 	}
1826 
1827 	if (skb->protocol != htons(ETH_P_IP)) {
1828 		/* Not IP (i.e. ARP). Do not create a route if it is
1829 		 * invalid for proxy arp. DNAT routes are always valid.
1830 		 *
1831 		 * The proxy arp feature has been extended to allow ARP
1832 		 * replies back out the same interface, to support
1833 		 * Private VLAN switch technologies. See arp.c.
1834 		 */
1835 		if (out_dev == in_dev &&
1836 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1837 			err = -EINVAL;
1838 			goto cleanup;
1839 		}
1840 	}
1841 
1842 	fnhe = find_exception(nhc, daddr);
1843 	if (do_cache) {
1844 		if (fnhe)
1845 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1846 		else
1847 			rth = rcu_dereference(nhc->nhc_rth_input);
1848 		if (rt_cache_valid(rth)) {
1849 			skb_dst_set_noref(skb, &rth->dst);
1850 			goto out;
1851 		}
1852 	}
1853 
1854 	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1855 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1856 			   IN_DEV_CONF_GET(out_dev, NOXFRM));
1857 	if (!rth) {
1858 		err = -ENOBUFS;
1859 		goto cleanup;
1860 	}
1861 
1862 	rth->rt_is_input = 1;
1863 	RT_CACHE_STAT_INC(in_slow_tot);
1864 
1865 	rth->dst.input = ip_forward;
1866 
1867 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1868 		       do_cache);
1869 	lwtunnel_set_redirect(&rth->dst);
1870 	skb_dst_set(skb, &rth->dst);
1871 out:
1872 	err = 0;
1873  cleanup:
1874 	return err;
1875 }
1876 
1877 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1878 /* To make ICMP error packets follow the same path as the flow that triggered
1879  * them, the multipath hash is calculated from the inner (embedded) IP addresses.
1880  */
1881 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1882 				 struct flow_keys *hash_keys)
1883 {
1884 	const struct iphdr *outer_iph = ip_hdr(skb);
1885 	const struct iphdr *key_iph = outer_iph;
1886 	const struct iphdr *inner_iph;
1887 	const struct icmphdr *icmph;
1888 	struct iphdr _inner_iph;
1889 	struct icmphdr _icmph;
1890 
1891 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1892 		goto out;
1893 
1894 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1895 		goto out;
1896 
1897 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1898 				   &_icmph);
1899 	if (!icmph)
1900 		goto out;
1901 
1902 	if (!icmp_is_err(icmph->type))
1903 		goto out;
1904 
1905 	inner_iph = skb_header_pointer(skb,
1906 				       outer_iph->ihl * 4 + sizeof(_icmph),
1907 				       sizeof(_inner_iph), &_inner_iph);
1908 	if (!inner_iph)
1909 		goto out;
1910 
1911 	key_iph = inner_iph;
1912 out:
1913 	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1914 	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1915 }
1916 
1917 /* if skb is set it will be used and fl4 can be NULL */
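/* net.ipv4.fib_multipath_hash_policy selects what is hashed:
 *   0 - layer 3 (source and destination addresses)
 *   1 - layer 4 (5-tuple, taken from the flow dissector when forwarding)
 *   2 - layer 3, or the inner headers for encapsulated traffic
 * The result is additionally mixed with flowi4_multipath_hash when that is set.
 */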
1918 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1919 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1920 {
1921 	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1922 	struct flow_keys hash_keys;
1923 	u32 mhash;
1924 
1925 	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1926 	case 0:
1927 		memset(&hash_keys, 0, sizeof(hash_keys));
1928 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1929 		if (skb) {
1930 			ip_multipath_l3_keys(skb, &hash_keys);
1931 		} else {
1932 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1933 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1934 		}
1935 		break;
1936 	case 1:
1937 		/* skb is currently provided only when forwarding */
1938 		if (skb) {
1939 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1940 			struct flow_keys keys;
1941 
1942 			/* short-circuit if we already have L4 hash present */
1943 			if (skb->l4_hash)
1944 				return skb_get_hash_raw(skb) >> 1;
1945 
1946 			memset(&hash_keys, 0, sizeof(hash_keys));
1947 
1948 			if (!flkeys) {
1949 				skb_flow_dissect_flow_keys(skb, &keys, flag);
1950 				flkeys = &keys;
1951 			}
1952 
1953 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1954 			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1955 			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1956 			hash_keys.ports.src = flkeys->ports.src;
1957 			hash_keys.ports.dst = flkeys->ports.dst;
1958 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1959 		} else {
1960 			memset(&hash_keys, 0, sizeof(hash_keys));
1961 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1962 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1963 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1964 			hash_keys.ports.src = fl4->fl4_sport;
1965 			hash_keys.ports.dst = fl4->fl4_dport;
1966 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1967 		}
1968 		break;
1969 	case 2:
1970 		memset(&hash_keys, 0, sizeof(hash_keys));
1971 		/* skb is currently provided only when forwarding */
1972 		if (skb) {
1973 			struct flow_keys keys;
1974 
1975 			skb_flow_dissect_flow_keys(skb, &keys, 0);
1976 			/* Inner can be v4 or v6 */
1977 			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1978 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1979 				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1980 				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1981 			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1982 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1983 				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1984 				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1985 				hash_keys.tags.flow_label = keys.tags.flow_label;
1986 				hash_keys.basic.ip_proto = keys.basic.ip_proto;
1987 			} else {
1988 				/* Same as case 0 */
1989 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1990 				ip_multipath_l3_keys(skb, &hash_keys);
1991 			}
1992 		} else {
1993 			/* Same as case 0 */
1994 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1995 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1996 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1997 		}
1998 		break;
1999 	}
2000 	mhash = flow_hash_from_keys(&hash_keys);
2001 
2002 	if (multipath_hash)
2003 		mhash = jhash_2words(mhash, multipath_hash, 0);
2004 
2005 	return mhash >> 1;
2006 }
2007 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
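
/* Summary of the multipath hash policies selected above via the per-netns
 * sysctl net.ipv4.fib_multipath_hash_policy:
 *
 *	0 - L3: source and destination addresses (inner ones for ICMP errors)
 *	1 - L4: addresses, ports and protocol from flow dissection
 *	2 - L3 of the inner packet for encapsulated traffic, outer L3 otherwise
 *
 * e.g. "sysctl -w net.ipv4.fib_multipath_hash_policy=1" enables L4 hashing.
 */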
2008 
2009 static int ip_mkroute_input(struct sk_buff *skb,
2010 			    struct fib_result *res,
2011 			    struct in_device *in_dev,
2012 			    __be32 daddr, __be32 saddr, u32 tos,
2013 			    struct flow_keys *hkeys)
2014 {
2015 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2016 	if (res->fi && fib_info_num_path(res->fi) > 1) {
2017 		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2018 
2019 		fib_select_multipath(res, h);
2020 	}
2021 #endif
2022 
2023 	/* create a routing cache entry */
2024 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2025 }
2026 
2027 /* Implements the same saddr-related checks as ip_route_input_slow(),
2028  * assuming daddr is valid and the destination is not a local broadcast one.
2029  * Uses the provided hint instead of performing a route lookup.
2030  */
2031 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2032 		      u8 tos, struct net_device *dev,
2033 		      const struct sk_buff *hint)
2034 {
2035 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2036 	struct rtable *rt = skb_rtable(hint);
2037 	struct net *net = dev_net(dev);
2038 	int err = -EINVAL;
2039 	u32 tag = 0;
2040 
2041 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2042 		goto martian_source;
2043 
2044 	if (ipv4_is_zeronet(saddr))
2045 		goto martian_source;
2046 
2047 	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2048 		goto martian_source;
2049 
2050 	if (rt->rt_type != RTN_LOCAL)
2051 		goto skip_validate_source;
2052 
2053 	tos &= IPTOS_RT_MASK;
2054 	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2055 	if (err < 0)
2056 		goto martian_source;
2057 
2058 skip_validate_source:
2059 	skb_dst_copy(skb, hint);
2060 	return 0;
2061 
2062 martian_source:
2063 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2064 	return err;
2065 }
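
/* The hint is an earlier skb whose dst has already been resolved; the IPv4
 * list-receive path, for instance, reuses the previous packet of a batch as
 * a hint when daddr and tos match, so only the source checks above need to
 * be repeated.
 */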
2066 
2067 /*
2068  *	NOTE. We drop all packets that have local source
2069  *	addresses, because every properly looped-back packet
2070  *	must already have the correct destination attached by the output routine.
2071  *	Changes in the enforced policies must also be applied to
2072  *	ip_route_use_hint().
2073  *
2074  *	This approach solves two big problems:
2075  *	1. Non-simplex devices are handled properly.
2076  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2077  *	called with rcu_read_lock()
2078  */
2079 
2080 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2081 			       u8 tos, struct net_device *dev,
2082 			       struct fib_result *res)
2083 {
2084 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2085 	struct flow_keys *flkeys = NULL, _flkeys;
2086 	struct net    *net = dev_net(dev);
2087 	struct ip_tunnel_info *tun_info;
2088 	int		err = -EINVAL;
2089 	unsigned int	flags = 0;
2090 	u32		itag = 0;
2091 	struct rtable	*rth;
2092 	struct flowi4	fl4;
2093 	bool do_cache = true;
2094 
2095 	/* IP on this device is disabled. */
2096 
2097 	if (!in_dev)
2098 		goto out;
2099 
2100 	/* Check for the weirdest martians, which cannot be detected
2101 	   by fib_lookup.
2102 	 */
2103 
2104 	tun_info = skb_tunnel_info(skb);
2105 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2106 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2107 	else
2108 		fl4.flowi4_tun_key.tun_id = 0;
2109 	skb_dst_drop(skb);
2110 
2111 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2112 		goto martian_source;
2113 
2114 	res->fi = NULL;
2115 	res->table = NULL;
2116 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2117 		goto brd_input;
2118 
2119 	/* Accept zero addresses only to limited broadcast;
2120 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2121 	 */
2122 	if (ipv4_is_zeronet(saddr))
2123 		goto martian_source;
2124 
2125 	if (ipv4_is_zeronet(daddr))
2126 		goto martian_destination;
2127 
2128 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2129 	 * and calls it at most once if daddr and/or saddr are loopback addresses.
2130 	 */
2131 	if (ipv4_is_loopback(daddr)) {
2132 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2133 			goto martian_destination;
2134 	} else if (ipv4_is_loopback(saddr)) {
2135 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2136 			goto martian_source;
2137 	}
2138 
2139 	/*
2140 	 *	Now we are ready to route the packet.
2141 	 */
2142 	fl4.flowi4_oif = 0;
2143 	fl4.flowi4_iif = dev->ifindex;
2144 	fl4.flowi4_mark = skb->mark;
2145 	fl4.flowi4_tos = tos;
2146 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2147 	fl4.flowi4_flags = 0;
2148 	fl4.daddr = daddr;
2149 	fl4.saddr = saddr;
2150 	fl4.flowi4_uid = sock_net_uid(net, NULL);
2151 
2152 	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2153 		flkeys = &_flkeys;
2154 	} else {
2155 		fl4.flowi4_proto = 0;
2156 		fl4.fl4_sport = 0;
2157 		fl4.fl4_dport = 0;
2158 	}
2159 
2160 	err = fib_lookup(net, &fl4, res, 0);
2161 	if (err != 0) {
2162 		if (!IN_DEV_FORWARD(in_dev))
2163 			err = -EHOSTUNREACH;
2164 		goto no_route;
2165 	}
2166 
2167 	if (res->type == RTN_BROADCAST) {
2168 		if (IN_DEV_BFORWARD(in_dev))
2169 			goto make_route;
2170 		/* do not cache if bc_forwarding is enabled */
2171 		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2172 			do_cache = false;
2173 		goto brd_input;
2174 	}
2175 
2176 	if (res->type == RTN_LOCAL) {
2177 		err = fib_validate_source(skb, saddr, daddr, tos,
2178 					  0, dev, in_dev, &itag);
2179 		if (err < 0)
2180 			goto martian_source;
2181 		goto local_input;
2182 	}
2183 
2184 	if (!IN_DEV_FORWARD(in_dev)) {
2185 		err = -EHOSTUNREACH;
2186 		goto no_route;
2187 	}
2188 	if (res->type != RTN_UNICAST)
2189 		goto martian_destination;
2190 
2191 make_route:
2192 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2193 out:	return err;
2194 
2195 brd_input:
2196 	if (skb->protocol != htons(ETH_P_IP))
2197 		goto e_inval;
2198 
2199 	if (!ipv4_is_zeronet(saddr)) {
2200 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2201 					  in_dev, &itag);
2202 		if (err < 0)
2203 			goto martian_source;
2204 	}
2205 	flags |= RTCF_BROADCAST;
2206 	res->type = RTN_BROADCAST;
2207 	RT_CACHE_STAT_INC(in_brd);
2208 
2209 local_input:
2210 	do_cache &= res->fi && !itag;
2211 	if (do_cache) {
2212 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2213 
2214 		rth = rcu_dereference(nhc->nhc_rth_input);
2215 		if (rt_cache_valid(rth)) {
2216 			skb_dst_set_noref(skb, &rth->dst);
2217 			err = 0;
2218 			goto out;
2219 		}
2220 	}
2221 
2222 	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2223 			   flags | RTCF_LOCAL, res->type,
2224 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2225 	if (!rth)
2226 		goto e_nobufs;
2227 
2228 	rth->dst.output = ip_rt_bug;
2229 #ifdef CONFIG_IP_ROUTE_CLASSID
2230 	rth->dst.tclassid = itag;
2231 #endif
2232 	rth->rt_is_input = 1;
2233 
2234 	RT_CACHE_STAT_INC(in_slow_tot);
2235 	if (res->type == RTN_UNREACHABLE) {
2236 		rth->dst.input = ip_error;
2237 		rth->dst.error = -err;
2238 		rth->rt_flags &= ~RTCF_LOCAL;
2239 	}
2240 
2241 	if (do_cache) {
2242 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2243 
2244 		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2245 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2246 			WARN_ON(rth->dst.input == lwtunnel_input);
2247 			rth->dst.lwtstate->orig_input = rth->dst.input;
2248 			rth->dst.input = lwtunnel_input;
2249 		}
2250 
2251 		if (unlikely(!rt_cache_route(nhc, rth)))
2252 			rt_add_uncached_list(rth);
2253 	}
2254 	skb_dst_set(skb, &rth->dst);
2255 	err = 0;
2256 	goto out;
2257 
2258 no_route:
2259 	RT_CACHE_STAT_INC(in_no_route);
2260 	res->type = RTN_UNREACHABLE;
2261 	res->fi = NULL;
2262 	res->table = NULL;
2263 	goto local_input;
2264 
2265 	/*
2266 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2267 	 */
2268 martian_destination:
2269 	RT_CACHE_STAT_INC(in_martian_dst);
2270 #ifdef CONFIG_IP_ROUTE_VERBOSE
2271 	if (IN_DEV_LOG_MARTIANS(in_dev))
2272 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2273 				     &daddr, &saddr, dev->name);
2274 #endif
2275 
2276 e_inval:
2277 	err = -EINVAL;
2278 	goto out;
2279 
2280 e_nobufs:
2281 	err = -ENOBUFS;
2282 	goto out;
2283 
2284 martian_source:
2285 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2286 	goto out;
2287 }
2288 
2289 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2290 			 u8 tos, struct net_device *dev)
2291 {
2292 	struct fib_result res;
2293 	int err;
2294 
2295 	tos &= IPTOS_RT_MASK;
2296 	rcu_read_lock();
2297 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2298 	rcu_read_unlock();
2299 
2300 	return err;
2301 }
2302 EXPORT_SYMBOL(ip_route_input_noref);
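
/* Minimal usage sketch (hypothetical caller, error handling elided): route a
 * freshly received IPv4 packet and hand it to the resulting dst, which ends
 * up in local delivery or forwarding depending on the lookup result.
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *
 *	if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				 iph->tos, skb->dev))
 *		goto drop;
 *	return dst_input(skb);
 */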
2303 
2304 /* called with rcu_read_lock held */
2305 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2306 		       u8 tos, struct net_device *dev, struct fib_result *res)
2307 {
2308 	/* Multicast recognition logic is moved from the route cache to here.
2309 	   The problem was that too many Ethernet cards have broken/missing
2310 	   hardware multicast filters :-( As a result, a host on a multicast
2311 	   network acquires a lot of useless route cache entries, e.g. for
2312 	   SDR messages from all over the world. Now we try to get rid of them.
2313 	   Really, provided the software IP multicast filter is organized
2314 	   reasonably (at least, hashed), it does not result in a slowdown
2315 	   compared with route cache reject entries.
2316 	   Note that multicast routers are not affected, because a
2317 	   route cache entry is created eventually.
2318 	 */
2319 	if (ipv4_is_multicast(daddr)) {
2320 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2321 		int our = 0;
2322 		int err = -EINVAL;
2323 
2324 		if (!in_dev)
2325 			return err;
2326 		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2327 				      ip_hdr(skb)->protocol);
2328 
2329 		/* check l3 master if no match yet */
2330 		if (!our && netif_is_l3_slave(dev)) {
2331 			struct in_device *l3_in_dev;
2332 
2333 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2334 			if (l3_in_dev)
2335 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2336 						      ip_hdr(skb)->protocol);
2337 		}
2338 
2339 		if (our
2340 #ifdef CONFIG_IP_MROUTE
2341 			||
2342 		    (!ipv4_is_local_multicast(daddr) &&
2343 		     IN_DEV_MFORWARD(in_dev))
2344 #endif
2345 		   ) {
2346 			err = ip_route_input_mc(skb, daddr, saddr,
2347 						tos, dev, our);
2348 		}
2349 		return err;
2350 	}
2351 
2352 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2353 }
2354 
2355 /* called with rcu_read_lock() */
2356 static struct rtable *__mkroute_output(const struct fib_result *res,
2357 				       const struct flowi4 *fl4, int orig_oif,
2358 				       struct net_device *dev_out,
2359 				       unsigned int flags)
2360 {
2361 	struct fib_info *fi = res->fi;
2362 	struct fib_nh_exception *fnhe;
2363 	struct in_device *in_dev;
2364 	u16 type = res->type;
2365 	struct rtable *rth;
2366 	bool do_cache;
2367 
2368 	in_dev = __in_dev_get_rcu(dev_out);
2369 	if (!in_dev)
2370 		return ERR_PTR(-EINVAL);
2371 
2372 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2373 		if (ipv4_is_loopback(fl4->saddr) &&
2374 		    !(dev_out->flags & IFF_LOOPBACK) &&
2375 		    !netif_is_l3_master(dev_out))
2376 			return ERR_PTR(-EINVAL);
2377 
2378 	if (ipv4_is_lbcast(fl4->daddr))
2379 		type = RTN_BROADCAST;
2380 	else if (ipv4_is_multicast(fl4->daddr))
2381 		type = RTN_MULTICAST;
2382 	else if (ipv4_is_zeronet(fl4->daddr))
2383 		return ERR_PTR(-EINVAL);
2384 
2385 	if (dev_out->flags & IFF_LOOPBACK)
2386 		flags |= RTCF_LOCAL;
2387 
2388 	do_cache = true;
2389 	if (type == RTN_BROADCAST) {
2390 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2391 		fi = NULL;
2392 	} else if (type == RTN_MULTICAST) {
2393 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2394 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2395 				     fl4->flowi4_proto))
2396 			flags &= ~RTCF_LOCAL;
2397 		else
2398 			do_cache = false;
2399 		/* If a multicast route does not exist, use the
2400 		 * default one, but do not gateway in this case.
2401 		 * Yes, it is a hack.
2402 		 */
2403 		if (fi && res->prefixlen < 4)
2404 			fi = NULL;
2405 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2406 		   (orig_oif != dev_out->ifindex)) {
2407 		/* For local routes that require a particular output interface
2408 		 * we do not want to cache the result.  Caching the result
2409 		 * causes incorrect behaviour when there are multiple source
2410 		 * addresses on the interface: if the intended recipient is
2411 		 * waiting on that interface for the packet, it won't be
2412 		 * received, because it will be delivered on the loopback
2413 		 * interface and the IP_PKTINFO ipi_ifindex will be set to
2414 		 * the loopback interface as well.
2415 		 */
2416 		do_cache = false;
2417 	}
2418 
2419 	fnhe = NULL;
2420 	do_cache &= fi != NULL;
2421 	if (fi) {
2422 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2423 		struct rtable __rcu **prth;
2424 
2425 		fnhe = find_exception(nhc, fl4->daddr);
2426 		if (!do_cache)
2427 			goto add;
2428 		if (fnhe) {
2429 			prth = &fnhe->fnhe_rth_output;
2430 		} else {
2431 			if (unlikely(fl4->flowi4_flags &
2432 				     FLOWI_FLAG_KNOWN_NH &&
2433 				     !(nhc->nhc_gw_family &&
2434 				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2435 				do_cache = false;
2436 				goto add;
2437 			}
2438 			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2439 		}
2440 		rth = rcu_dereference(*prth);
2441 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2442 			return rth;
2443 	}
2444 
2445 add:
2446 	rth = rt_dst_alloc(dev_out, flags, type,
2447 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2448 			   IN_DEV_CONF_GET(in_dev, NOXFRM));
2449 	if (!rth)
2450 		return ERR_PTR(-ENOBUFS);
2451 
2452 	rth->rt_iif = orig_oif;
2453 
2454 	RT_CACHE_STAT_INC(out_slow_tot);
2455 
2456 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2457 		if (flags & RTCF_LOCAL &&
2458 		    !(dev_out->flags & IFF_LOOPBACK)) {
2459 			rth->dst.output = ip_mc_output;
2460 			RT_CACHE_STAT_INC(out_slow_mc);
2461 		}
2462 #ifdef CONFIG_IP_MROUTE
2463 		if (type == RTN_MULTICAST) {
2464 			if (IN_DEV_MFORWARD(in_dev) &&
2465 			    !ipv4_is_local_multicast(fl4->daddr)) {
2466 				rth->dst.input = ip_mr_input;
2467 				rth->dst.output = ip_mc_output;
2468 			}
2469 		}
2470 #endif
2471 	}
2472 
2473 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2474 	lwtunnel_set_redirect(&rth->dst);
2475 
2476 	return rth;
2477 }
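
/* The dst built here is cached, when allowed, either in the nexthop's
 * per-cpu output cache (nhc_pcpu_rth_output) or in the matching exception
 * entry (fnhe_rth_output) via rt_set_nexthop(), so the next lookup for the
 * same destination can take the rt_cache_valid() fast path above.
 */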
2478 
2479 /*
2480  * Major route resolver routine.
2481  */
2482 
2483 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2484 					const struct sk_buff *skb)
2485 {
2486 	__u8 tos = RT_FL_TOS(fl4);
2487 	struct fib_result res = {
2488 		.type		= RTN_UNSPEC,
2489 		.fi		= NULL,
2490 		.table		= NULL,
2491 		.tclassid	= 0,
2492 	};
2493 	struct rtable *rth;
2494 
2495 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2496 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2497 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2498 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2499 
2500 	rcu_read_lock();
2501 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2502 	rcu_read_unlock();
2503 
2504 	return rth;
2505 }
2506 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2507 
2508 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2509 					    struct fib_result *res,
2510 					    const struct sk_buff *skb)
2511 {
2512 	struct net_device *dev_out = NULL;
2513 	int orig_oif = fl4->flowi4_oif;
2514 	unsigned int flags = 0;
2515 	struct rtable *rth;
2516 	int err;
2517 
2518 	if (fl4->saddr) {
2519 		if (ipv4_is_multicast(fl4->saddr) ||
2520 		    ipv4_is_lbcast(fl4->saddr) ||
2521 		    ipv4_is_zeronet(fl4->saddr)) {
2522 			rth = ERR_PTR(-EINVAL);
2523 			goto out;
2524 		}
2525 
2526 		rth = ERR_PTR(-ENETUNREACH);
2527 
2528 		/* I removed the check for oif == dev_out->oif here.
2529 		   It was wrong for two reasons:
2530 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2531 		      is assigned to multiple interfaces.
2532 		   2. Moreover, we are allowed to send packets with the saddr
2533 		      of another iface. --ANK
2534 		 */
2535 
2536 		if (fl4->flowi4_oif == 0 &&
2537 		    (ipv4_is_multicast(fl4->daddr) ||
2538 		     ipv4_is_lbcast(fl4->daddr))) {
2539 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2540 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2541 			if (!dev_out)
2542 				goto out;
2543 
2544 			/* Special hack: the user can direct multicasts
2545 			   and limited broadcast via the desired interface
2546 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2547 			   This hack is not just for fun, it allows
2548 			   vic, vat and friends to work.
2549 			   They bind the socket to loopback, set the ttl to zero
2550 			   and expect that it will work.
2551 			   From the viewpoint of the routing cache they are broken,
2552 			   because we are not allowed to build a multicast path
2553 			   with a loopback source addr (the routing cache
2554 			   cannot know that the ttl is zero, so the packet
2555 			   will not leave this host and the route is valid).
2556 			   Luckily, this hack is a good workaround.
2557 			 */
2558 
2559 			fl4->flowi4_oif = dev_out->ifindex;
2560 			goto make_route;
2561 		}
2562 
2563 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2564 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2565 			if (!__ip_dev_find(net, fl4->saddr, false))
2566 				goto out;
2567 		}
2568 	}
2569 
2570 
2571 	if (fl4->flowi4_oif) {
2572 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2573 		rth = ERR_PTR(-ENODEV);
2574 		if (!dev_out)
2575 			goto out;
2576 
2577 		/* RACE: Check return value of inet_select_addr instead. */
2578 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2579 			rth = ERR_PTR(-ENETUNREACH);
2580 			goto out;
2581 		}
2582 		if (ipv4_is_local_multicast(fl4->daddr) ||
2583 		    ipv4_is_lbcast(fl4->daddr) ||
2584 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2585 			if (!fl4->saddr)
2586 				fl4->saddr = inet_select_addr(dev_out, 0,
2587 							      RT_SCOPE_LINK);
2588 			goto make_route;
2589 		}
2590 		if (!fl4->saddr) {
2591 			if (ipv4_is_multicast(fl4->daddr))
2592 				fl4->saddr = inet_select_addr(dev_out, 0,
2593 							      fl4->flowi4_scope);
2594 			else if (!fl4->daddr)
2595 				fl4->saddr = inet_select_addr(dev_out, 0,
2596 							      RT_SCOPE_HOST);
2597 		}
2598 	}
2599 
2600 	if (!fl4->daddr) {
2601 		fl4->daddr = fl4->saddr;
2602 		if (!fl4->daddr)
2603 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2604 		dev_out = net->loopback_dev;
2605 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2606 		res->type = RTN_LOCAL;
2607 		flags |= RTCF_LOCAL;
2608 		goto make_route;
2609 	}
2610 
2611 	err = fib_lookup(net, fl4, res, 0);
2612 	if (err) {
2613 		res->fi = NULL;
2614 		res->table = NULL;
2615 		if (fl4->flowi4_oif &&
2616 		    (ipv4_is_multicast(fl4->daddr) ||
2617 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2618 			/* Apparently, the routing tables are wrong. Assume
2619 			   that the destination is on-link.
2620 
2621 			   WHY? DW.
2622 			   Because we are allowed to send to an iface
2623 			   even if it has NO routes and NO assigned
2624 			   addresses. When oif is specified, routing
2625 			   tables are looked up with only one purpose:
2626 			   to check whether the destination is gatewayed,
2627 			   rather than direct. Moreover, if MSG_DONTROUTE is
2628 			   set, we send the packet, ignoring both routing
2629 			   tables and ifaddr state. --ANK
2630 
2631 
2632 			   We could do this even if oif is unknown
2633 			   (IPv6 likely does), but we do not.
2634 			 */
2635 
2636 			if (fl4->saddr == 0)
2637 				fl4->saddr = inet_select_addr(dev_out, 0,
2638 							      RT_SCOPE_LINK);
2639 			res->type = RTN_UNICAST;
2640 			goto make_route;
2641 		}
2642 		rth = ERR_PTR(err);
2643 		goto out;
2644 	}
2645 
2646 	if (res->type == RTN_LOCAL) {
2647 		if (!fl4->saddr) {
2648 			if (res->fi->fib_prefsrc)
2649 				fl4->saddr = res->fi->fib_prefsrc;
2650 			else
2651 				fl4->saddr = fl4->daddr;
2652 		}
2653 
2654 		/* L3 master device is the loopback for that domain */
2655 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2656 			net->loopback_dev;
2657 
2658 		/* make sure orig_oif points to fib result device even
2659 		 * though packet rx/tx happens over loopback or l3mdev
2660 		 */
2661 		orig_oif = FIB_RES_OIF(*res);
2662 
2663 		fl4->flowi4_oif = dev_out->ifindex;
2664 		flags |= RTCF_LOCAL;
2665 		goto make_route;
2666 	}
2667 
2668 	fib_select_path(net, res, fl4, skb);
2669 
2670 	dev_out = FIB_RES_DEV(*res);
2671 	fl4->flowi4_oif = dev_out->ifindex;
2672 
2673 
2674 make_route:
2675 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2676 
2677 out:
2678 	return rth;
2679 }
2680 
2681 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2682 {
2683 	return NULL;
2684 }
2685 
2686 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2687 {
2688 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2689 
2690 	return mtu ? : dst->dev->mtu;
2691 }
2692 
2693 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2694 					  struct sk_buff *skb, u32 mtu,
2695 					  bool confirm_neigh)
2696 {
2697 }
2698 
2699 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2700 				       struct sk_buff *skb)
2701 {
2702 }
2703 
2704 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2705 					  unsigned long old)
2706 {
2707 	return NULL;
2708 }
2709 
2710 static struct dst_ops ipv4_dst_blackhole_ops = {
2711 	.family			=	AF_INET,
2712 	.check			=	ipv4_blackhole_dst_check,
2713 	.mtu			=	ipv4_blackhole_mtu,
2714 	.default_advmss		=	ipv4_default_advmss,
2715 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2716 	.redirect		=	ipv4_rt_blackhole_redirect,
2717 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2718 	.neigh_lookup		=	ipv4_neigh_lookup,
2719 };
2720 
2721 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2722 {
2723 	struct rtable *ort = (struct rtable *) dst_orig;
2724 	struct rtable *rt;
2725 
2726 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2727 	if (rt) {
2728 		struct dst_entry *new = &rt->dst;
2729 
2730 		new->__use = 1;
2731 		new->input = dst_discard;
2732 		new->output = dst_discard_out;
2733 
2734 		new->dev = net->loopback_dev;
2735 		if (new->dev)
2736 			dev_hold(new->dev);
2737 
2738 		rt->rt_is_input = ort->rt_is_input;
2739 		rt->rt_iif = ort->rt_iif;
2740 		rt->rt_pmtu = ort->rt_pmtu;
2741 		rt->rt_mtu_locked = ort->rt_mtu_locked;
2742 
2743 		rt->rt_genid = rt_genid_ipv4(net);
2744 		rt->rt_flags = ort->rt_flags;
2745 		rt->rt_type = ort->rt_type;
2746 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2747 		rt->rt_gw_family = ort->rt_gw_family;
2748 		if (rt->rt_gw_family == AF_INET)
2749 			rt->rt_gw4 = ort->rt_gw4;
2750 		else if (rt->rt_gw_family == AF_INET6)
2751 			rt->rt_gw6 = ort->rt_gw6;
2752 
2753 		INIT_LIST_HEAD(&rt->rt_uncached);
2754 	}
2755 
2756 	dst_release(dst_orig);
2757 
2758 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2759 }
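
/* A blackhole dst keeps the identifying fields of the original route but
 * discards every packet (dst_discard/dst_discard_out) and never revalidates
 * (its ->check op returns NULL).  It is used, e.g., by the xfrm code while
 * an IPsec state is still being resolved.
 */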
2760 
2761 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2762 				    const struct sock *sk)
2763 {
2764 	struct rtable *rt = __ip_route_output_key(net, flp4);
2765 
2766 	if (IS_ERR(rt))
2767 		return rt;
2768 
2769 	if (flp4->flowi4_proto)
2770 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2771 							flowi4_to_flowi(flp4),
2772 							sk, 0);
2773 
2774 	return rt;
2775 }
2776 EXPORT_SYMBOL_GPL(ip_route_output_flow);
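
/* Minimal usage sketch (hypothetical caller, values illustrative): resolve
 * an output route for a UDP flow; on success the lookup typically also fills
 * in fl4.saddr when none was given.
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	memset(&fl4, 0, sizeof(fl4));
 *	fl4.daddr = daddr;
 *	fl4.flowi4_proto = IPPROTO_UDP;
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */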
2777 
2778 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2779 				      struct net_device *dev,
2780 				      struct net *net, __be32 *saddr,
2781 				      const struct ip_tunnel_info *info,
2782 				      u8 protocol, bool use_cache)
2783 {
2784 #ifdef CONFIG_DST_CACHE
2785 	struct dst_cache *dst_cache;
2786 #endif
2787 	struct rtable *rt = NULL;
2788 	struct flowi4 fl4;
2789 	__u8 tos;
2790 
2791 #ifdef CONFIG_DST_CACHE
2792 	dst_cache = (struct dst_cache *)&info->dst_cache;
2793 	if (use_cache) {
2794 		rt = dst_cache_get_ip4(dst_cache, saddr);
2795 		if (rt)
2796 			return rt;
2797 	}
2798 #endif
2799 	memset(&fl4, 0, sizeof(fl4));
2800 	fl4.flowi4_mark = skb->mark;
2801 	fl4.flowi4_proto = protocol;
2802 	fl4.daddr = info->key.u.ipv4.dst;
2803 	fl4.saddr = info->key.u.ipv4.src;
2804 	tos = info->key.tos;
2805 	fl4.flowi4_tos = RT_TOS(tos);
2806 
2807 	rt = ip_route_output_key(net, &fl4);
2808 	if (IS_ERR(rt)) {
2809 		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2810 		return ERR_PTR(-ENETUNREACH);
2811 	}
2812 	if (rt->dst.dev == dev) { /* is this necessary? */
2813 		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2814 		ip_rt_put(rt);
2815 		return ERR_PTR(-ELOOP);
2816 	}
2817 #ifdef CONFIG_DST_CACHE
2818 	if (use_cache)
2819 		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2820 #endif
2821 	*saddr = fl4.saddr;
2822 	return rt;
2823 }
2824 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2825 
2826 /* called with rcu_read_lock held */
2827 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2828 			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2829 			struct sk_buff *skb, u32 portid, u32 seq,
2830 			unsigned int flags)
2831 {
2832 	struct rtmsg *r;
2833 	struct nlmsghdr *nlh;
2834 	unsigned long expires = 0;
2835 	u32 error;
2836 	u32 metrics[RTAX_MAX];
2837 
2838 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2839 	if (!nlh)
2840 		return -EMSGSIZE;
2841 
2842 	r = nlmsg_data(nlh);
2843 	r->rtm_family	 = AF_INET;
2844 	r->rtm_dst_len	= 32;
2845 	r->rtm_src_len	= 0;
2846 	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2847 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2848 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2849 		goto nla_put_failure;
2850 	r->rtm_type	= rt->rt_type;
2851 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2852 	r->rtm_protocol = RTPROT_UNSPEC;
2853 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2854 	if (rt->rt_flags & RTCF_NOTIFY)
2855 		r->rtm_flags |= RTM_F_NOTIFY;
2856 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2857 		r->rtm_flags |= RTCF_DOREDIRECT;
2858 
2859 	if (nla_put_in_addr(skb, RTA_DST, dst))
2860 		goto nla_put_failure;
2861 	if (src) {
2862 		r->rtm_src_len = 32;
2863 		if (nla_put_in_addr(skb, RTA_SRC, src))
2864 			goto nla_put_failure;
2865 	}
2866 	if (rt->dst.dev &&
2867 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2868 		goto nla_put_failure;
2869 #ifdef CONFIG_IP_ROUTE_CLASSID
2870 	if (rt->dst.tclassid &&
2871 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2872 		goto nla_put_failure;
2873 #endif
2874 	if (fl4 && !rt_is_input_route(rt) &&
2875 	    fl4->saddr != src) {
2876 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2877 			goto nla_put_failure;
2878 	}
2879 	if (rt->rt_uses_gateway) {
2880 		if (rt->rt_gw_family == AF_INET &&
2881 		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2882 			goto nla_put_failure;
2883 		} else if (rt->rt_gw_family == AF_INET6) {
2884 			int alen = sizeof(struct in6_addr);
2885 			struct nlattr *nla;
2886 			struct rtvia *via;
2887 
2888 			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2889 			if (!nla)
2890 				goto nla_put_failure;
2891 
2892 			via = nla_data(nla);
2893 			via->rtvia_family = AF_INET6;
2894 			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2895 		}
2896 	}
2897 
2898 	expires = rt->dst.expires;
2899 	if (expires) {
2900 		unsigned long now = jiffies;
2901 
2902 		if (time_before(now, expires))
2903 			expires -= now;
2904 		else
2905 			expires = 0;
2906 	}
2907 
2908 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2909 	if (rt->rt_pmtu && expires)
2910 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2911 	if (rt->rt_mtu_locked && expires)
2912 		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2913 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2914 		goto nla_put_failure;
2915 
2916 	if (fl4) {
2917 		if (fl4->flowi4_mark &&
2918 		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2919 			goto nla_put_failure;
2920 
2921 		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2922 		    nla_put_u32(skb, RTA_UID,
2923 				from_kuid_munged(current_user_ns(),
2924 						 fl4->flowi4_uid)))
2925 			goto nla_put_failure;
2926 
2927 		if (rt_is_input_route(rt)) {
2928 #ifdef CONFIG_IP_MROUTE
2929 			if (ipv4_is_multicast(dst) &&
2930 			    !ipv4_is_local_multicast(dst) &&
2931 			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2932 				int err = ipmr_get_route(net, skb,
2933 							 fl4->saddr, fl4->daddr,
2934 							 r, portid);
2935 
2936 				if (err <= 0) {
2937 					if (err == 0)
2938 						return 0;
2939 					goto nla_put_failure;
2940 				}
2941 			} else
2942 #endif
2943 				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2944 					goto nla_put_failure;
2945 		}
2946 	}
2947 
2948 	error = rt->dst.error;
2949 
2950 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2951 		goto nla_put_failure;
2952 
2953 	nlmsg_end(skb, nlh);
2954 	return 0;
2955 
2956 nla_put_failure:
2957 	nlmsg_cancel(skb, nlh);
2958 	return -EMSGSIZE;
2959 }
2960 
2961 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2962 			    struct netlink_callback *cb, u32 table_id,
2963 			    struct fnhe_hash_bucket *bucket, int genid,
2964 			    int *fa_index, int fa_start, unsigned int flags)
2965 {
2966 	int i;
2967 
2968 	for (i = 0; i < FNHE_HASH_SIZE; i++) {
2969 		struct fib_nh_exception *fnhe;
2970 
2971 		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2972 		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
2973 			struct rtable *rt;
2974 			int err;
2975 
2976 			if (*fa_index < fa_start)
2977 				goto next;
2978 
2979 			if (fnhe->fnhe_genid != genid)
2980 				goto next;
2981 
2982 			if (fnhe->fnhe_expires &&
2983 			    time_after(jiffies, fnhe->fnhe_expires))
2984 				goto next;
2985 
2986 			rt = rcu_dereference(fnhe->fnhe_rth_input);
2987 			if (!rt)
2988 				rt = rcu_dereference(fnhe->fnhe_rth_output);
2989 			if (!rt)
2990 				goto next;
2991 
2992 			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2993 					   table_id, NULL, skb,
2994 					   NETLINK_CB(cb->skb).portid,
2995 					   cb->nlh->nlmsg_seq, flags);
2996 			if (err)
2997 				return err;
2998 next:
2999 			(*fa_index)++;
3000 		}
3001 	}
3002 
3003 	return 0;
3004 }
3005 
3006 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3007 		       u32 table_id, struct fib_info *fi,
3008 		       int *fa_index, int fa_start, unsigned int flags)
3009 {
3010 	struct net *net = sock_net(cb->skb->sk);
3011 	int nhsel, genid = fnhe_genid(net);
3012 
3013 	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3014 		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3015 		struct fnhe_hash_bucket *bucket;
3016 		int err;
3017 
3018 		if (nhc->nhc_flags & RTNH_F_DEAD)
3019 			continue;
3020 
3021 		rcu_read_lock();
3022 		bucket = rcu_dereference(nhc->nhc_exceptions);
3023 		err = 0;
3024 		if (bucket)
3025 			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3026 					       genid, fa_index, fa_start,
3027 					       flags);
3028 		rcu_read_unlock();
3029 		if (err)
3030 			return err;
3031 	}
3032 
3033 	return 0;
3034 }
3035 
3036 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3037 						   u8 ip_proto, __be16 sport,
3038 						   __be16 dport)
3039 {
3040 	struct sk_buff *skb;
3041 	struct iphdr *iph;
3042 
3043 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3044 	if (!skb)
3045 		return NULL;
3046 
3047 	/* Reserve room for dummy headers; this skb can pass
3048 	 * through a good chunk of the routing engine.
3049 	 */
3050 	skb_reset_mac_header(skb);
3051 	skb_reset_network_header(skb);
3052 	skb->protocol = htons(ETH_P_IP);
3053 	iph = skb_put(skb, sizeof(struct iphdr));
3054 	iph->protocol = ip_proto;
3055 	iph->saddr = src;
3056 	iph->daddr = dst;
3057 	iph->version = 0x4;
3058 	iph->frag_off = 0;
3059 	iph->ihl = 0x5;
3060 	skb_set_transport_header(skb, skb->len);
3061 
3062 	switch (iph->protocol) {
3063 	case IPPROTO_UDP: {
3064 		struct udphdr *udph;
3065 
3066 		udph = skb_put_zero(skb, sizeof(struct udphdr));
3067 		udph->source = sport;
3068 		udph->dest = dport;
3069 		udph->len = sizeof(struct udphdr);
3070 		udph->check = 0;
3071 		break;
3072 	}
3073 	case IPPROTO_TCP: {
3074 		struct tcphdr *tcph;
3075 
3076 		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3077 		tcph->source	= sport;
3078 		tcph->dest	= dport;
3079 		tcph->doff	= sizeof(struct tcphdr) / 4;
3080 		tcph->rst = 1;
3081 		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3082 					    src, dst, 0);
3083 		break;
3084 	}
3085 	case IPPROTO_ICMP: {
3086 		struct icmphdr *icmph;
3087 
3088 		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3089 		icmph->type = ICMP_ECHO;
3090 		icmph->code = 0;
3091 	}
3092 	}
3093 
3094 	return skb;
3095 }
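
/* The dummy skb built above lets RTM_GETROUTE requests carrying RTA_IP_PROTO,
 * RTA_SPORT and RTA_DPORT be routed as if a real packet with those headers
 * had arrived, so fib rules and the multipath hash see the same keys a live
 * flow would produce.
 */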
3096 
3097 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3098 				       const struct nlmsghdr *nlh,
3099 				       struct nlattr **tb,
3100 				       struct netlink_ext_ack *extack)
3101 {
3102 	struct rtmsg *rtm;
3103 	int i, err;
3104 
3105 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3106 		NL_SET_ERR_MSG(extack,
3107 			       "ipv4: Invalid header for route get request");
3108 		return -EINVAL;
3109 	}
3110 
3111 	if (!netlink_strict_get_check(skb))
3112 		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3113 					      rtm_ipv4_policy, extack);
3114 
3115 	rtm = nlmsg_data(nlh);
3116 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3117 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3118 	    rtm->rtm_table || rtm->rtm_protocol ||
3119 	    rtm->rtm_scope || rtm->rtm_type) {
3120 		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3121 		return -EINVAL;
3122 	}
3123 
3124 	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3125 			       RTM_F_LOOKUP_TABLE |
3126 			       RTM_F_FIB_MATCH)) {
3127 		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3128 		return -EINVAL;
3129 	}
3130 
3131 	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3132 					    rtm_ipv4_policy, extack);
3133 	if (err)
3134 		return err;
3135 
3136 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3137 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3138 		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3139 		return -EINVAL;
3140 	}
3141 
3142 	for (i = 0; i <= RTA_MAX; i++) {
3143 		if (!tb[i])
3144 			continue;
3145 
3146 		switch (i) {
3147 		case RTA_IIF:
3148 		case RTA_OIF:
3149 		case RTA_SRC:
3150 		case RTA_DST:
3151 		case RTA_IP_PROTO:
3152 		case RTA_SPORT:
3153 		case RTA_DPORT:
3154 		case RTA_MARK:
3155 		case RTA_UID:
3156 			break;
3157 		default:
3158 			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3159 			return -EINVAL;
3160 		}
3161 	}
3162 
3163 	return 0;
3164 }
3165 
3166 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3167 			     struct netlink_ext_ack *extack)
3168 {
3169 	struct net *net = sock_net(in_skb->sk);
3170 	struct nlattr *tb[RTA_MAX+1];
3171 	u32 table_id = RT_TABLE_MAIN;
3172 	__be16 sport = 0, dport = 0;
3173 	struct fib_result res = {};
3174 	u8 ip_proto = IPPROTO_UDP;
3175 	struct rtable *rt = NULL;
3176 	struct sk_buff *skb;
3177 	struct rtmsg *rtm;
3178 	struct flowi4 fl4 = {};
3179 	__be32 dst = 0;
3180 	__be32 src = 0;
3181 	kuid_t uid;
3182 	u32 iif;
3183 	int err;
3184 	int mark;
3185 
3186 	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3187 	if (err < 0)
3188 		return err;
3189 
3190 	rtm = nlmsg_data(nlh);
3191 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3192 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3193 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3194 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3195 	if (tb[RTA_UID])
3196 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3197 	else
3198 		uid = (iif ? INVALID_UID : current_uid());
3199 
3200 	if (tb[RTA_IP_PROTO]) {
3201 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3202 						  &ip_proto, AF_INET, extack);
3203 		if (err)
3204 			return err;
3205 	}
3206 
3207 	if (tb[RTA_SPORT])
3208 		sport = nla_get_be16(tb[RTA_SPORT]);
3209 
3210 	if (tb[RTA_DPORT])
3211 		dport = nla_get_be16(tb[RTA_DPORT]);
3212 
3213 	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3214 	if (!skb)
3215 		return -ENOBUFS;
3216 
3217 	fl4.daddr = dst;
3218 	fl4.saddr = src;
3219 	fl4.flowi4_tos = rtm->rtm_tos;
3220 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3221 	fl4.flowi4_mark = mark;
3222 	fl4.flowi4_uid = uid;
3223 	if (sport)
3224 		fl4.fl4_sport = sport;
3225 	if (dport)
3226 		fl4.fl4_dport = dport;
3227 	fl4.flowi4_proto = ip_proto;
3228 
3229 	rcu_read_lock();
3230 
3231 	if (iif) {
3232 		struct net_device *dev;
3233 
3234 		dev = dev_get_by_index_rcu(net, iif);
3235 		if (!dev) {
3236 			err = -ENODEV;
3237 			goto errout_rcu;
3238 		}
3239 
3240 		fl4.flowi4_iif = iif; /* for rt_fill_info */
3241 		skb->dev	= dev;
3242 		skb->mark	= mark;
3243 		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3244 					 dev, &res);
3245 
3246 		rt = skb_rtable(skb);
3247 		if (err == 0 && rt->dst.error)
3248 			err = -rt->dst.error;
3249 	} else {
3250 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3251 		skb->dev = net->loopback_dev;
3252 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3253 		err = 0;
3254 		if (IS_ERR(rt))
3255 			err = PTR_ERR(rt);
3256 		else
3257 			skb_dst_set(skb, &rt->dst);
3258 	}
3259 
3260 	if (err)
3261 		goto errout_rcu;
3262 
3263 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3264 		rt->rt_flags |= RTCF_NOTIFY;
3265 
3266 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3267 		table_id = res.table ? res.table->tb_id : 0;
3268 
3269 	/* reset skb for netlink reply msg */
3270 	skb_trim(skb, 0);
3271 	skb_reset_network_header(skb);
3272 	skb_reset_transport_header(skb);
3273 	skb_reset_mac_header(skb);
3274 
3275 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3276 		struct fib_rt_info fri;
3277 
3278 		if (!res.fi) {
3279 			err = fib_props[res.type].error;
3280 			if (!err)
3281 				err = -EHOSTUNREACH;
3282 			goto errout_rcu;
3283 		}
3284 		fri.fi = res.fi;
3285 		fri.tb_id = table_id;
3286 		fri.dst = res.prefix;
3287 		fri.dst_len = res.prefixlen;
3288 		fri.tos = fl4.flowi4_tos;
3289 		fri.type = rt->rt_type;
3290 		fri.offload = 0;
3291 		fri.trap = 0;
3292 		if (res.fa_head) {
3293 			struct fib_alias *fa;
3294 
3295 			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3296 				u8 slen = 32 - fri.dst_len;
3297 
3298 				if (fa->fa_slen == slen &&
3299 				    fa->tb_id == fri.tb_id &&
3300 				    fa->fa_tos == fri.tos &&
3301 				    fa->fa_info == res.fi &&
3302 				    fa->fa_type == fri.type) {
3303 					fri.offload = fa->offload;
3304 					fri.trap = fa->trap;
3305 					break;
3306 				}
3307 			}
3308 		}
3309 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3310 				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3311 	} else {
3312 		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3313 				   NETLINK_CB(in_skb).portid,
3314 				   nlh->nlmsg_seq, 0);
3315 	}
3316 	if (err < 0)
3317 		goto errout_rcu;
3318 
3319 	rcu_read_unlock();
3320 
3321 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3322 
3323 errout_free:
3324 	return err;
3325 errout_rcu:
3326 	rcu_read_unlock();
3327 	kfree_skb(skb);
3328 	goto errout_free;
3329 }
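
/* Example (illustrative addresses): "ip route get 192.0.2.1 from 198.51.100.1
 * iif eth0" is served by inet_rtm_getroute() above; adding "fibmatch" sets
 * RTM_F_FIB_MATCH and returns the matching FIB entry instead of the cloned
 * route.
 */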
3330 
3331 void ip_rt_multicast_event(struct in_device *in_dev)
3332 {
3333 	rt_cache_flush(dev_net(in_dev->dev));
3334 }
3335 
3336 #ifdef CONFIG_SYSCTL
3337 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3338 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3339 static int ip_rt_gc_elasticity __read_mostly	= 8;
3340 static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3341 
3342 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3343 		void *buffer, size_t *lenp, loff_t *ppos)
3344 {
3345 	struct net *net = (struct net *)__ctl->extra1;
3346 
3347 	if (write) {
3348 		rt_cache_flush(net);
3349 		fnhe_genid_bump(net);
3350 		return 0;
3351 	}
3352 
3353 	return -EINVAL;
3354 }
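
/* Write-only knob: "echo 1 > /proc/sys/net/ipv4/route/flush" (any write will
 * do) flushes the per-netns routing cache and invalidates cached nexthop
 * exceptions by bumping the fnhe genid; reads fail with -EINVAL.
 */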
3355 
3356 static struct ctl_table ipv4_route_table[] = {
3357 	{
3358 		.procname	= "gc_thresh",
3359 		.data		= &ipv4_dst_ops.gc_thresh,
3360 		.maxlen		= sizeof(int),
3361 		.mode		= 0644,
3362 		.proc_handler	= proc_dointvec,
3363 	},
3364 	{
3365 		.procname	= "max_size",
3366 		.data		= &ip_rt_max_size,
3367 		.maxlen		= sizeof(int),
3368 		.mode		= 0644,
3369 		.proc_handler	= proc_dointvec,
3370 	},
3371 	{
3372 		/*  Deprecated. Use gc_min_interval_ms */
3373 
3374 		.procname	= "gc_min_interval",
3375 		.data		= &ip_rt_gc_min_interval,
3376 		.maxlen		= sizeof(int),
3377 		.mode		= 0644,
3378 		.proc_handler	= proc_dointvec_jiffies,
3379 	},
3380 	{
3381 		.procname	= "gc_min_interval_ms",
3382 		.data		= &ip_rt_gc_min_interval,
3383 		.maxlen		= sizeof(int),
3384 		.mode		= 0644,
3385 		.proc_handler	= proc_dointvec_ms_jiffies,
3386 	},
3387 	{
3388 		.procname	= "gc_timeout",
3389 		.data		= &ip_rt_gc_timeout,
3390 		.maxlen		= sizeof(int),
3391 		.mode		= 0644,
3392 		.proc_handler	= proc_dointvec_jiffies,
3393 	},
3394 	{
3395 		.procname	= "gc_interval",
3396 		.data		= &ip_rt_gc_interval,
3397 		.maxlen		= sizeof(int),
3398 		.mode		= 0644,
3399 		.proc_handler	= proc_dointvec_jiffies,
3400 	},
3401 	{
3402 		.procname	= "redirect_load",
3403 		.data		= &ip_rt_redirect_load,
3404 		.maxlen		= sizeof(int),
3405 		.mode		= 0644,
3406 		.proc_handler	= proc_dointvec,
3407 	},
3408 	{
3409 		.procname	= "redirect_number",
3410 		.data		= &ip_rt_redirect_number,
3411 		.maxlen		= sizeof(int),
3412 		.mode		= 0644,
3413 		.proc_handler	= proc_dointvec,
3414 	},
3415 	{
3416 		.procname	= "redirect_silence",
3417 		.data		= &ip_rt_redirect_silence,
3418 		.maxlen		= sizeof(int),
3419 		.mode		= 0644,
3420 		.proc_handler	= proc_dointvec,
3421 	},
3422 	{
3423 		.procname	= "error_cost",
3424 		.data		= &ip_rt_error_cost,
3425 		.maxlen		= sizeof(int),
3426 		.mode		= 0644,
3427 		.proc_handler	= proc_dointvec,
3428 	},
3429 	{
3430 		.procname	= "error_burst",
3431 		.data		= &ip_rt_error_burst,
3432 		.maxlen		= sizeof(int),
3433 		.mode		= 0644,
3434 		.proc_handler	= proc_dointvec,
3435 	},
3436 	{
3437 		.procname	= "gc_elasticity",
3438 		.data		= &ip_rt_gc_elasticity,
3439 		.maxlen		= sizeof(int),
3440 		.mode		= 0644,
3441 		.proc_handler	= proc_dointvec,
3442 	},
3443 	{
3444 		.procname	= "mtu_expires",
3445 		.data		= &ip_rt_mtu_expires,
3446 		.maxlen		= sizeof(int),
3447 		.mode		= 0644,
3448 		.proc_handler	= proc_dointvec_jiffies,
3449 	},
3450 	{
3451 		.procname	= "min_pmtu",
3452 		.data		= &ip_rt_min_pmtu,
3453 		.maxlen		= sizeof(int),
3454 		.mode		= 0644,
3455 		.proc_handler	= proc_dointvec_minmax,
3456 		.extra1		= &ip_min_valid_pmtu,
3457 	},
3458 	{
3459 		.procname	= "min_adv_mss",
3460 		.data		= &ip_rt_min_advmss,
3461 		.maxlen		= sizeof(int),
3462 		.mode		= 0644,
3463 		.proc_handler	= proc_dointvec,
3464 	},
3465 	{ }
3466 };
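
/* These entries appear under /proc/sys/net/ipv4/route/ and control global
 * (not per-netns) parameters, e.g. "sysctl net.ipv4.route.min_pmtu" reads
 * ip_rt_min_pmtu; only the "flush" entry below is registered per netns.
 */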
3467 
3468 static const char ipv4_route_flush_procname[] = "flush";
3469 
3470 static struct ctl_table ipv4_route_flush_table[] = {
3471 	{
3472 		.procname	= ipv4_route_flush_procname,
3473 		.maxlen		= sizeof(int),
3474 		.mode		= 0200,
3475 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3476 	},
3477 	{ },
3478 };
3479 
3480 static __net_init int sysctl_route_net_init(struct net *net)
3481 {
3482 	struct ctl_table *tbl;
3483 
3484 	tbl = ipv4_route_flush_table;
3485 	if (!net_eq(net, &init_net)) {
3486 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3487 		if (!tbl)
3488 			goto err_dup;
3489 
3490 		/* Don't export non-whitelisted sysctls to unprivileged users */
3491 		if (net->user_ns != &init_user_ns) {
3492 			if (tbl[0].procname != ipv4_route_flush_procname)
3493 				tbl[0].procname = NULL;
3494 		}
3495 	}
3496 	tbl[0].extra1 = net;
3497 
3498 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3499 	if (!net->ipv4.route_hdr)
3500 		goto err_reg;
3501 	return 0;
3502 
3503 err_reg:
3504 	if (tbl != ipv4_route_flush_table)
3505 		kfree(tbl);
3506 err_dup:
3507 	return -ENOMEM;
3508 }
3509 
3510 static __net_exit void sysctl_route_net_exit(struct net *net)
3511 {
3512 	struct ctl_table *tbl;
3513 
3514 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3515 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3516 	BUG_ON(tbl == ipv4_route_flush_table);
3517 	kfree(tbl);
3518 }
3519 
3520 static __net_initdata struct pernet_operations sysctl_route_ops = {
3521 	.init = sysctl_route_net_init,
3522 	.exit = sysctl_route_net_exit,
3523 };
3524 #endif
3525 
3526 static __net_init int rt_genid_init(struct net *net)
3527 {
3528 	atomic_set(&net->ipv4.rt_genid, 0);
3529 	atomic_set(&net->fnhe_genid, 0);
3530 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3531 	return 0;
3532 }
3533 
3534 static __net_initdata struct pernet_operations rt_genid_ops = {
3535 	.init = rt_genid_init,
3536 };
3537 
3538 static int __net_init ipv4_inetpeer_init(struct net *net)
3539 {
3540 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3541 
3542 	if (!bp)
3543 		return -ENOMEM;
3544 	inet_peer_base_init(bp);
3545 	net->ipv4.peers = bp;
3546 	return 0;
3547 }
3548 
3549 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3550 {
3551 	struct inet_peer_base *bp = net->ipv4.peers;
3552 
3553 	net->ipv4.peers = NULL;
3554 	inetpeer_invalidate_tree(bp);
3555 	kfree(bp);
3556 }
3557 
3558 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3559 	.init	=	ipv4_inetpeer_init,
3560 	.exit	=	ipv4_inetpeer_exit,
3561 };
3562 
3563 #ifdef CONFIG_IP_ROUTE_CLASSID
3564 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3565 #endif /* CONFIG_IP_ROUTE_CLASSID */
3566 
3567 int __init ip_rt_init(void)
3568 {
3569 	int cpu;
3570 
3571 	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3572 				  GFP_KERNEL);
3573 	if (!ip_idents)
3574 		panic("IP: failed to allocate ip_idents\n");
3575 
3576 	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3577 
3578 	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3579 	if (!ip_tstamps)
3580 		panic("IP: failed to allocate ip_tstamps\n");
3581 
3582 	for_each_possible_cpu(cpu) {
3583 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3584 
3585 		INIT_LIST_HEAD(&ul->head);
3586 		spin_lock_init(&ul->lock);
3587 	}
3588 #ifdef CONFIG_IP_ROUTE_CLASSID
3589 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3590 	if (!ip_rt_acct)
3591 		panic("IP: failed to allocate ip_rt_acct\n");
3592 #endif
3593 
3594 	ipv4_dst_ops.kmem_cachep =
3595 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3596 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3597 
3598 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3599 
3600 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3601 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3602 
3603 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3604 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3605 
3606 	ipv4_dst_ops.gc_thresh = ~0;
3607 	ip_rt_max_size = INT_MAX;
3608 
3609 	devinet_init();
3610 	ip_fib_init();
3611 
3612 	if (ip_rt_proc_init())
3613 		pr_err("Unable to create route proc files\n");
3614 #ifdef CONFIG_XFRM
3615 	xfrm_init();
3616 	xfrm4_init();
3617 #endif
3618 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3619 		      RTNL_FLAG_DOIT_UNLOCKED);
3620 
3621 #ifdef CONFIG_SYSCTL
3622 	register_pernet_subsys(&sysctl_route_ops);
3623 #endif
3624 	register_pernet_subsys(&rt_genid_ops);
3625 	register_pernet_subsys(&ipv4_inetpeer_ops);
3626 	return 0;
3627 }
3628 
3629 #ifdef CONFIG_SYSCTL
3630 /*
3631  * We really need to sanitize the damn ipv4 init order, then all
3632  * this nonsense will go away.
3633  */
3634 void __init ip_static_sysctl_init(void)
3635 {
3636 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3637 }
3638 #endif
3639