xref: /linux/net/ipv4/route.c (revision c98be0c96db00e9b6b02d31e0fa7590c54cdaaac)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD;
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <net/dst.h>
93 #include <net/net_namespace.h>
94 #include <net/protocol.h>
95 #include <net/ip.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
98 #include <net/sock.h>
99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #include <linux/kmemleak.h>
109 #endif
110 #include <net/secure_seq.h>
111 
112 #define RT_FL_TOS(oldflp4) \
113 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
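
/* RT_FL_TOS() keeps only what may legitimately steer routing: the TOS
 * bits selected by IPTOS_RT_MASK plus the kernel-internal RTO_ONLINK
 * flag, which callers may OR into flowi4_tos to force an on-link
 * (RT_SCOPE_LINK) lookup.  Illustrative (hypothetical) caller:
 *
 *	fl4.flowi4_tos = RT_TOS(iph->tos) | RTO_ONLINK;
 *
 * __ip_route_output_key() below splits the two back apart.
 */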
114 
115 #define RT_GC_TIMEOUT (300*HZ)
116 
117 static int ip_rt_max_size;
118 static int ip_rt_redirect_number __read_mostly	= 9;
119 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
120 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
121 static int ip_rt_error_cost __read_mostly	= HZ;
122 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
123 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
124 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
125 static int ip_rt_min_advmss __read_mostly	= 256;
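
/* The values above are expressed in jiffies where time is involved and
 * are runtime-tunable via the net.ipv4.route.* sysctl table defined near
 * the end of this file.  For example, assuming HZ=1000:
 *
 *	ip_rt_redirect_load    = HZ / 50         =     20 jiffies (20 ms)
 *	ip_rt_redirect_silence = (HZ / 50) << 10 =  20480 jiffies (~20.5 s)
 *	ip_rt_mtu_expires      = 10 * 60 * HZ    = 600000 jiffies (10 min)
 */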
126 
127 /*
128  *	Interface to generic destination cache.
129  */
130 
131 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
132 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
133 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
134 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
135 static void		 ipv4_link_failure(struct sk_buff *skb);
136 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
137 					   struct sk_buff *skb, u32 mtu);
138 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
139 					struct sk_buff *skb);
140 static void		ipv4_dst_destroy(struct dst_entry *dst);
141 
142 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
143 			    int how)
144 {
145 }
146 
147 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
148 {
149 	WARN_ON(1);
150 	return NULL;
151 }
152 
153 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
154 					   struct sk_buff *skb,
155 					   const void *daddr);
156 
157 static struct dst_ops ipv4_dst_ops = {
158 	.family =		AF_INET,
159 	.protocol =		cpu_to_be16(ETH_P_IP),
160 	.check =		ipv4_dst_check,
161 	.default_advmss =	ipv4_default_advmss,
162 	.mtu =			ipv4_mtu,
163 	.cow_metrics =		ipv4_cow_metrics,
164 	.destroy =		ipv4_dst_destroy,
165 	.ifdown =		ipv4_dst_ifdown,
166 	.negative_advice =	ipv4_negative_advice,
167 	.link_failure =		ipv4_link_failure,
168 	.update_pmtu =		ip_rt_update_pmtu,
169 	.redirect =		ip_do_redirect,
170 	.local_out =		__ip_local_out,
171 	.neigh_lookup =		ipv4_neigh_lookup,
172 };
173 
174 #define ECN_OR_COST(class)	TC_PRIO_##class
175 
176 const __u8 ip_tos2prio[16] = {
177 	TC_PRIO_BESTEFFORT,
178 	ECN_OR_COST(BESTEFFORT),
179 	TC_PRIO_BESTEFFORT,
180 	ECN_OR_COST(BESTEFFORT),
181 	TC_PRIO_BULK,
182 	ECN_OR_COST(BULK),
183 	TC_PRIO_BULK,
184 	ECN_OR_COST(BULK),
185 	TC_PRIO_INTERACTIVE,
186 	ECN_OR_COST(INTERACTIVE),
187 	TC_PRIO_INTERACTIVE,
188 	ECN_OR_COST(INTERACTIVE),
189 	TC_PRIO_INTERACTIVE_BULK,
190 	ECN_OR_COST(INTERACTIVE_BULK),
191 	TC_PRIO_INTERACTIVE_BULK,
192 	ECN_OR_COST(INTERACTIVE_BULK)
193 };
194 EXPORT_SYMBOL(ip_tos2prio);
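
/* rt_tos2priority() in <net/route.h> indexes this table with
 * IPTOS_TOS(tos) >> 1, so consecutive pairs of entries cover one RFC 1349
 * TOS value without and with IPTOS_MINCOST (the bit ECN later reclaimed).
 * Worked example: IPTOS_LOWDELAY (0x10) gives index (0x10 & 0x1e) >> 1 == 8,
 * i.e. TC_PRIO_INTERACTIVE.
 */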
195 
196 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
197 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
198 
199 #ifdef CONFIG_PROC_FS
200 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
201 {
202 	if (*pos)
203 		return NULL;
204 	return SEQ_START_TOKEN;
205 }
206 
207 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
208 {
209 	++*pos;
210 	return NULL;
211 }
212 
213 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
214 {
215 }
216 
217 static int rt_cache_seq_show(struct seq_file *seq, void *v)
218 {
219 	if (v == SEQ_START_TOKEN)
220 		seq_printf(seq, "%-127s\n",
221 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
222 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
223 			   "HHUptod\tSpecDst");
224 	return 0;
225 }
226 
227 static const struct seq_operations rt_cache_seq_ops = {
228 	.start  = rt_cache_seq_start,
229 	.next   = rt_cache_seq_next,
230 	.stop   = rt_cache_seq_stop,
231 	.show   = rt_cache_seq_show,
232 };
233 
234 static int rt_cache_seq_open(struct inode *inode, struct file *file)
235 {
236 	return seq_open(file, &rt_cache_seq_ops);
237 }
238 
239 static const struct file_operations rt_cache_seq_fops = {
240 	.owner	 = THIS_MODULE,
241 	.open	 = rt_cache_seq_open,
242 	.read	 = seq_read,
243 	.llseek	 = seq_lseek,
244 	.release = seq_release,
245 };
246 
247 
248 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
249 {
250 	int cpu;
251 
252 	if (*pos == 0)
253 		return SEQ_START_TOKEN;
254 
255 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
256 		if (!cpu_possible(cpu))
257 			continue;
258 		*pos = cpu+1;
259 		return &per_cpu(rt_cache_stat, cpu);
260 	}
261 	return NULL;
262 }
263 
264 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
265 {
266 	int cpu;
267 
268 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
269 		if (!cpu_possible(cpu))
270 			continue;
271 		*pos = cpu+1;
272 		return &per_cpu(rt_cache_stat, cpu);
273 	}
274 	return NULL;
275 
276 }
277 
278 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
279 {
280 
281 }
282 
283 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
284 {
285 	struct rt_cache_stat *st = v;
286 
287 	if (v == SEQ_START_TOKEN) {
288 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
289 		return 0;
290 	}
291 
292 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
293 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
294 		   dst_entries_get_slow(&ipv4_dst_ops),
295 		   0, /* st->in_hit */
296 		   st->in_slow_tot,
297 		   st->in_slow_mc,
298 		   st->in_no_route,
299 		   st->in_brd,
300 		   st->in_martian_dst,
301 		   st->in_martian_src,
302 
303 		   0, /* st->out_hit */
304 		   st->out_slow_tot,
305 		   st->out_slow_mc,
306 
307 		   0, /* st->gc_total */
308 		   0, /* st->gc_ignored */
309 		   0, /* st->gc_goal_miss */
310 		   0, /* st->gc_dst_overflow */
311 		   0, /* st->in_hlist_search */
312 		   0  /* st->out_hlist_search */
313 		);
314 	return 0;
315 }
316 
317 static const struct seq_operations rt_cpu_seq_ops = {
318 	.start  = rt_cpu_seq_start,
319 	.next   = rt_cpu_seq_next,
320 	.stop   = rt_cpu_seq_stop,
321 	.show   = rt_cpu_seq_show,
322 };
323 
324 
325 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
326 {
327 	return seq_open(file, &rt_cpu_seq_ops);
328 }
329 
330 static const struct file_operations rt_cpu_seq_fops = {
331 	.owner	 = THIS_MODULE,
332 	.open	 = rt_cpu_seq_open,
333 	.read	 = seq_read,
334 	.llseek	 = seq_lseek,
335 	.release = seq_release,
336 };
337 
338 #ifdef CONFIG_IP_ROUTE_CLASSID
339 static int rt_acct_proc_show(struct seq_file *m, void *v)
340 {
341 	struct ip_rt_acct *dst, *src;
342 	unsigned int i, j;
343 
344 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
345 	if (!dst)
346 		return -ENOMEM;
347 
348 	for_each_possible_cpu(i) {
349 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
350 		for (j = 0; j < 256; j++) {
351 			dst[j].o_bytes   += src[j].o_bytes;
352 			dst[j].o_packets += src[j].o_packets;
353 			dst[j].i_bytes   += src[j].i_bytes;
354 			dst[j].i_packets += src[j].i_packets;
355 		}
356 	}
357 
358 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
359 	kfree(dst);
360 	return 0;
361 }
362 
363 static int rt_acct_proc_open(struct inode *inode, struct file *file)
364 {
365 	return single_open(file, rt_acct_proc_show, NULL);
366 }
367 
368 static const struct file_operations rt_acct_proc_fops = {
369 	.owner		= THIS_MODULE,
370 	.open		= rt_acct_proc_open,
371 	.read		= seq_read,
372 	.llseek		= seq_lseek,
373 	.release	= single_release,
374 };
375 #endif
376 
377 static int __net_init ip_rt_do_proc_init(struct net *net)
378 {
379 	struct proc_dir_entry *pde;
380 
381 	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
382 			  &rt_cache_seq_fops);
383 	if (!pde)
384 		goto err1;
385 
386 	pde = proc_create("rt_cache", S_IRUGO,
387 			  net->proc_net_stat, &rt_cpu_seq_fops);
388 	if (!pde)
389 		goto err2;
390 
391 #ifdef CONFIG_IP_ROUTE_CLASSID
392 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
393 	if (!pde)
394 		goto err3;
395 #endif
396 	return 0;
397 
398 #ifdef CONFIG_IP_ROUTE_CLASSID
399 err3:
400 	remove_proc_entry("rt_cache", net->proc_net_stat);
401 #endif
402 err2:
403 	remove_proc_entry("rt_cache", net->proc_net);
404 err1:
405 	return -ENOMEM;
406 }
407 
408 static void __net_exit ip_rt_do_proc_exit(struct net *net)
409 {
410 	remove_proc_entry("rt_cache", net->proc_net_stat);
411 	remove_proc_entry("rt_cache", net->proc_net);
412 #ifdef CONFIG_IP_ROUTE_CLASSID
413 	remove_proc_entry("rt_acct", net->proc_net);
414 #endif
415 }
416 
417 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
418 	.init = ip_rt_do_proc_init,
419 	.exit = ip_rt_do_proc_exit,
420 };
421 
422 static int __init ip_rt_proc_init(void)
423 {
424 	return register_pernet_subsys(&ip_rt_proc_ops);
425 }
426 
427 #else
428 static inline int ip_rt_proc_init(void)
429 {
430 	return 0;
431 }
432 #endif /* CONFIG_PROC_FS */
433 
434 static inline bool rt_is_expired(const struct rtable *rth)
435 {
436 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
437 }
438 
439 void rt_cache_flush(struct net *net)
440 {
441 	rt_genid_bump_ipv4(net);
442 }
443 
444 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
445 					   struct sk_buff *skb,
446 					   const void *daddr)
447 {
448 	struct net_device *dev = dst->dev;
449 	const __be32 *pkey = daddr;
450 	const struct rtable *rt;
451 	struct neighbour *n;
452 
453 	rt = (const struct rtable *) dst;
454 	if (rt->rt_gateway)
455 		pkey = (const __be32 *) &rt->rt_gateway;
456 	else if (skb)
457 		pkey = &ip_hdr(skb)->daddr;
458 
459 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
460 	if (n)
461 		return n;
462 	return neigh_create(&arp_tbl, pkey, dev);
463 }
464 
465 /*
466  * Peer allocation may fail only in serious out-of-memory conditions.  However,
467  * we can still generate some output.
468  * Random ID selection looks a bit dangerous because we have no way to
469  * guarantee that an ID stays unique for a reasonable period of time.
470  * But a broken packet identifier may be better than no packet at all.
471  */
472 static void ip_select_fb_ident(struct iphdr *iph)
473 {
474 	static DEFINE_SPINLOCK(ip_fb_id_lock);
475 	static u32 ip_fallback_id;
476 	u32 salt;
477 
478 	spin_lock_bh(&ip_fb_id_lock);
479 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
480 	iph->id = htons(salt & 0xFFFF);
481 	ip_fallback_id = salt;
482 	spin_unlock_bh(&ip_fb_id_lock);
483 }
484 
485 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
486 {
487 	struct net *net = dev_net(dst->dev);
488 	struct inet_peer *peer;
489 
490 	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
491 	if (peer) {
492 		iph->id = htons(inet_getid(peer, more));
493 		inet_putpeer(peer);
494 		return;
495 	}
496 
497 	ip_select_fb_ident(iph);
498 }
499 EXPORT_SYMBOL(__ip_select_ident);
500 
501 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
502 			     const struct iphdr *iph,
503 			     int oif, u8 tos,
504 			     u8 prot, u32 mark, int flow_flags)
505 {
506 	if (sk) {
507 		const struct inet_sock *inet = inet_sk(sk);
508 
509 		oif = sk->sk_bound_dev_if;
510 		mark = sk->sk_mark;
511 		tos = RT_CONN_FLAGS(sk);
512 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
513 	}
514 	flowi4_init_output(fl4, oif, mark, tos,
515 			   RT_SCOPE_UNIVERSE, prot,
516 			   flow_flags,
517 			   iph->daddr, iph->saddr, 0, 0);
518 }
519 
520 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
521 			       const struct sock *sk)
522 {
523 	const struct iphdr *iph = ip_hdr(skb);
524 	int oif = skb->dev->ifindex;
525 	u8 tos = RT_TOS(iph->tos);
526 	u8 prot = iph->protocol;
527 	u32 mark = skb->mark;
528 
529 	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
530 }
531 
532 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
533 {
534 	const struct inet_sock *inet = inet_sk(sk);
535 	const struct ip_options_rcu *inet_opt;
536 	__be32 daddr = inet->inet_daddr;
537 
538 	rcu_read_lock();
539 	inet_opt = rcu_dereference(inet->inet_opt);
540 	if (inet_opt && inet_opt->opt.srr)
541 		daddr = inet_opt->opt.faddr;
542 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
543 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
544 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
545 			   inet_sk_flowi_flags(sk),
546 			   daddr, inet->inet_saddr, 0, 0);
547 	rcu_read_unlock();
548 }
549 
550 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
551 				 const struct sk_buff *skb)
552 {
553 	if (skb)
554 		build_skb_flow_key(fl4, skb, sk);
555 	else
556 		build_sk_flow_key(fl4, sk);
557 }
558 
559 static inline void rt_free(struct rtable *rt)
560 {
561 	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
562 }
563 
564 static DEFINE_SPINLOCK(fnhe_lock);
565 
566 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
567 {
568 	struct rtable *rt;
569 
570 	rt = rcu_dereference(fnhe->fnhe_rth_input);
571 	if (rt) {
572 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
573 		rt_free(rt);
574 	}
575 	rt = rcu_dereference(fnhe->fnhe_rth_output);
576 	if (rt) {
577 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
578 		rt_free(rt);
579 	}
580 }
581 
582 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
583 {
584 	struct fib_nh_exception *fnhe, *oldest;
585 
586 	oldest = rcu_dereference(hash->chain);
587 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
588 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
589 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
590 			oldest = fnhe;
591 	}
592 	fnhe_flush_routes(oldest);
593 	return oldest;
594 }
595 
596 static inline u32 fnhe_hashfun(__be32 daddr)
597 {
598 	u32 hval;
599 
600 	hval = (__force u32) daddr;
601 	hval ^= (hval >> 11) ^ (hval >> 22);
602 
603 	return hval & (FNHE_HASH_SIZE - 1);
604 }
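
/* The fold above XORs the address with itself shifted right by 11 and by
 * 22 bits, so all 32 bits of daddr influence the low bits before the mask
 * is applied: a reasonable fit, since FNHE_HASH_SIZE is 2048 (1 << 11) at
 * the time of writing and must in any case be a power of two for the mask
 * to work.  Hosts that differ only in their high (network) bits therefore
 * still spread across buckets.
 */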
605 
606 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
607 {
608 	rt->rt_pmtu = fnhe->fnhe_pmtu;
609 	rt->dst.expires = fnhe->fnhe_expires;
610 
611 	if (fnhe->fnhe_gw) {
612 		rt->rt_flags |= RTCF_REDIRECTED;
613 		rt->rt_gateway = fnhe->fnhe_gw;
614 		rt->rt_uses_gateway = 1;
615 	}
616 }
617 
618 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
619 				  u32 pmtu, unsigned long expires)
620 {
621 	struct fnhe_hash_bucket *hash;
622 	struct fib_nh_exception *fnhe;
623 	struct rtable *rt;
624 	unsigned int i;
625 	int depth;
626 	u32 hval = fnhe_hashfun(daddr);
627 
628 	spin_lock_bh(&fnhe_lock);
629 
630 	hash = nh->nh_exceptions;
631 	if (!hash) {
632 		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
633 		if (!hash)
634 			goto out_unlock;
635 		nh->nh_exceptions = hash;
636 	}
637 
638 	hash += hval;
639 
640 	depth = 0;
641 	for (fnhe = rcu_dereference(hash->chain); fnhe;
642 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
643 		if (fnhe->fnhe_daddr == daddr)
644 			break;
645 		depth++;
646 	}
647 
648 	if (fnhe) {
649 		if (gw)
650 			fnhe->fnhe_gw = gw;
651 		if (pmtu) {
652 			fnhe->fnhe_pmtu = pmtu;
653 			fnhe->fnhe_expires = max(1UL, expires);
654 		}
655 		/* Update all cached dsts too */
656 		rt = rcu_dereference(fnhe->fnhe_rth_input);
657 		if (rt)
658 			fill_route_from_fnhe(rt, fnhe);
659 		rt = rcu_dereference(fnhe->fnhe_rth_output);
660 		if (rt)
661 			fill_route_from_fnhe(rt, fnhe);
662 	} else {
663 		if (depth > FNHE_RECLAIM_DEPTH)
664 			fnhe = fnhe_oldest(hash);
665 		else {
666 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
667 			if (!fnhe)
668 				goto out_unlock;
669 
670 			fnhe->fnhe_next = hash->chain;
671 			rcu_assign_pointer(hash->chain, fnhe);
672 		}
673 		fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
674 		fnhe->fnhe_daddr = daddr;
675 		fnhe->fnhe_gw = gw;
676 		fnhe->fnhe_pmtu = pmtu;
677 		fnhe->fnhe_expires = expires;
678 
679 		/* Exception created; mark the cached routes for the nexthop
680 		 * as stale, so that anyone caching them rechecks whether
681 		 * this exception applies.
682 		 */
683 		rt = rcu_dereference(nh->nh_rth_input);
684 		if (rt)
685 			rt->dst.obsolete = DST_OBSOLETE_KILL;
686 
687 		for_each_possible_cpu(i) {
688 			struct rtable __rcu **prt;
689 			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
690 			rt = rcu_dereference(*prt);
691 			if (rt)
692 				rt->dst.obsolete = DST_OBSOLETE_KILL;
693 		}
694 	}
695 
696 	fnhe->fnhe_stamp = jiffies;
697 
698 out_unlock:
699 	spin_unlock_bh(&fnhe_lock);
700 	return;
701 }
702 
703 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
704 			     bool kill_route)
705 {
706 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
707 	__be32 old_gw = ip_hdr(skb)->saddr;
708 	struct net_device *dev = skb->dev;
709 	struct in_device *in_dev;
710 	struct fib_result res;
711 	struct neighbour *n;
712 	struct net *net;
713 
714 	switch (icmp_hdr(skb)->code & 7) {
715 	case ICMP_REDIR_NET:
716 	case ICMP_REDIR_NETTOS:
717 	case ICMP_REDIR_HOST:
718 	case ICMP_REDIR_HOSTTOS:
719 		break;
720 
721 	default:
722 		return;
723 	}
724 
725 	if (rt->rt_gateway != old_gw)
726 		return;
727 
728 	in_dev = __in_dev_get_rcu(dev);
729 	if (!in_dev)
730 		return;
731 
732 	net = dev_net(dev);
733 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
734 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
735 	    ipv4_is_zeronet(new_gw))
736 		goto reject_redirect;
737 
738 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
739 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
740 			goto reject_redirect;
741 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
742 			goto reject_redirect;
743 	} else {
744 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
745 			goto reject_redirect;
746 	}
747 
748 	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
749 	if (n) {
750 		if (!(n->nud_state & NUD_VALID)) {
751 			neigh_event_send(n, NULL);
752 		} else {
753 			if (fib_lookup(net, fl4, &res) == 0) {
754 				struct fib_nh *nh = &FIB_RES_NH(res);
755 
756 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
757 						      0, 0);
758 			}
759 			if (kill_route)
760 				rt->dst.obsolete = DST_OBSOLETE_KILL;
761 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
762 		}
763 		neigh_release(n);
764 	}
765 	return;
766 
767 reject_redirect:
768 #ifdef CONFIG_IP_ROUTE_VERBOSE
769 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
770 		const struct iphdr *iph = (const struct iphdr *) skb->data;
771 		__be32 daddr = iph->daddr;
772 		__be32 saddr = iph->saddr;
773 
774 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
775 				     "  Advised path = %pI4 -> %pI4\n",
776 				     &old_gw, dev->name, &new_gw,
777 				     &saddr, &daddr);
778 	}
779 #endif
780 	;
781 }
782 
783 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
784 {
785 	struct rtable *rt;
786 	struct flowi4 fl4;
787 	const struct iphdr *iph = (const struct iphdr *) skb->data;
788 	int oif = skb->dev->ifindex;
789 	u8 tos = RT_TOS(iph->tos);
790 	u8 prot = iph->protocol;
791 	u32 mark = skb->mark;
792 
793 	rt = (struct rtable *) dst;
794 
795 	__build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
796 	__ip_do_redirect(rt, skb, &fl4, true);
797 }
798 
799 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
800 {
801 	struct rtable *rt = (struct rtable *)dst;
802 	struct dst_entry *ret = dst;
803 
804 	if (rt) {
805 		if (dst->obsolete > 0) {
806 			ip_rt_put(rt);
807 			ret = NULL;
808 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
809 			   rt->dst.expires) {
810 			ip_rt_put(rt);
811 			ret = NULL;
812 		}
813 	}
814 	return ret;
815 }
816 
817 /*
818  * Algorithm:
819  *	1. The first ip_rt_redirect_number redirects are sent
820  *	   with exponential backoff; after that we stop sending them
821  *	   entirely, assuming that the host ignores our redirects.
822  *	2. If we see no packets requiring redirects for
823  *	   ip_rt_redirect_silence, we assume that the host has
824  *	   forgotten the redirected route and start sending redirects again.
825  *
826  * This algorithm is much cheaper and more intelligent than the dumb load
827  * limiting in icmp.c.
828  *
829  * NOTE. Do not forget to inhibit load limiting for redirects (it would be
830  *	redundant) and for "frag. needed" (it breaks PMTU discovery) in icmp.c.
831  */
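
/* A worked example with the defaults above, assuming HZ=1000: the first
 * redirect is sent immediately (rate_tokens == 0); redirect k+1 is then
 * held back until at least (HZ/50) << k jiffies after redirect k, i.e.
 * 40 ms, 80 ms, 160 ms, ... 5.12 s.  Once ip_rt_redirect_number (9)
 * redirects have been sent and ignored, we fall silent until ~20.5 s
 * without triggering packets (ip_rt_redirect_silence) resets rate_tokens.
 */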
832 
833 void ip_rt_send_redirect(struct sk_buff *skb)
834 {
835 	struct rtable *rt = skb_rtable(skb);
836 	struct in_device *in_dev;
837 	struct inet_peer *peer;
838 	struct net *net;
839 	int log_martians;
840 
841 	rcu_read_lock();
842 	in_dev = __in_dev_get_rcu(rt->dst.dev);
843 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
844 		rcu_read_unlock();
845 		return;
846 	}
847 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
848 	rcu_read_unlock();
849 
850 	net = dev_net(rt->dst.dev);
851 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
852 	if (!peer) {
853 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
854 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
855 		return;
856 	}
857 
858 	/* No redirected packets during ip_rt_redirect_silence;
859 	 * reset the algorithm.
860 	 */
861 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
862 		peer->rate_tokens = 0;
863 
864 	/* Too many ignored redirects; do not send anything and
865 	 * record the last seen redirected packet in peer->rate_last.
866 	 */
867 	if (peer->rate_tokens >= ip_rt_redirect_number) {
868 		peer->rate_last = jiffies;
869 		goto out_put_peer;
870 	}
871 
872 	/* Check for load limit; set rate_last to the latest sent
873 	 * redirect.
874 	 */
875 	if (peer->rate_tokens == 0 ||
876 	    time_after(jiffies,
877 		       (peer->rate_last +
878 			(ip_rt_redirect_load << peer->rate_tokens)))) {
879 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
880 
881 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
882 		peer->rate_last = jiffies;
883 		++peer->rate_tokens;
884 #ifdef CONFIG_IP_ROUTE_VERBOSE
885 		if (log_martians &&
886 		    peer->rate_tokens == ip_rt_redirect_number)
887 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
888 					     &ip_hdr(skb)->saddr, inet_iif(skb),
889 					     &ip_hdr(skb)->daddr, &gw);
890 #endif
891 	}
892 out_put_peer:
893 	inet_putpeer(peer);
894 }
895 
896 static int ip_error(struct sk_buff *skb)
897 {
898 	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
899 	struct rtable *rt = skb_rtable(skb);
900 	struct inet_peer *peer;
901 	unsigned long now;
902 	struct net *net;
903 	bool send;
904 	int code;
905 
906 	net = dev_net(rt->dst.dev);
907 	if (!in_dev || !IN_DEV_FORWARD(in_dev)) {
908 		switch (rt->dst.error) {
909 		case EHOSTUNREACH:
910 			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
911 			break;
912 
913 		case ENETUNREACH:
914 			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
915 			break;
916 		}
917 		goto out;
918 	}
919 
920 	switch (rt->dst.error) {
921 	case EINVAL:
922 	default:
923 		goto out;
924 	case EHOSTUNREACH:
925 		code = ICMP_HOST_UNREACH;
926 		break;
927 	case ENETUNREACH:
928 		code = ICMP_NET_UNREACH;
929 		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
930 		break;
931 	case EACCES:
932 		code = ICMP_PKT_FILTERED;
933 		break;
934 	}
935 
936 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
937 
938 	send = true;
939 	if (peer) {
940 		now = jiffies;
941 		peer->rate_tokens += now - peer->rate_last;
942 		if (peer->rate_tokens > ip_rt_error_burst)
943 			peer->rate_tokens = ip_rt_error_burst;
944 		peer->rate_last = now;
945 		if (peer->rate_tokens >= ip_rt_error_cost)
946 			peer->rate_tokens -= ip_rt_error_cost;
947 		else
948 			send = false;
949 		inet_putpeer(peer);
950 	}
951 	if (send)
952 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
953 
954 out:	kfree_skb(skb);
955 	return 0;
956 }
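
/* The peer->rate_tokens logic above is a token bucket denominated in
 * jiffies: tokens accrue in real time up to ip_rt_error_burst (5 * HZ by
 * default) and each ICMP error costs ip_rt_error_cost (HZ), so the
 * steady-state limit is one ICMP error per second per source, with
 * bursts of at most five.
 */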
957 
958 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
959 {
960 	struct dst_entry *dst = &rt->dst;
961 	struct fib_result res;
962 
963 	if (dst_metric_locked(dst, RTAX_MTU))
964 		return;
965 
966 	if (dst->dev->mtu < mtu)
967 		return;
968 
969 	if (mtu < ip_rt_min_pmtu)
970 		mtu = ip_rt_min_pmtu;
971 
972 	if (rt->rt_pmtu == mtu &&
973 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
974 		return;
975 
976 	rcu_read_lock();
977 	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
978 		struct fib_nh *nh = &FIB_RES_NH(res);
979 
980 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
981 				      jiffies + ip_rt_mtu_expires);
982 	}
983 	rcu_read_unlock();
984 }
985 
986 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
987 			      struct sk_buff *skb, u32 mtu)
988 {
989 	struct rtable *rt = (struct rtable *) dst;
990 	struct flowi4 fl4;
991 
992 	ip_rt_build_flow_key(&fl4, sk, skb);
993 	__ip_rt_update_pmtu(rt, &fl4, mtu);
994 }
995 
996 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
997 		      int oif, u32 mark, u8 protocol, int flow_flags)
998 {
999 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1000 	struct flowi4 fl4;
1001 	struct rtable *rt;
1002 
1003 	__build_flow_key(&fl4, NULL, iph, oif,
1004 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1005 	rt = __ip_route_output_key(net, &fl4);
1006 	if (!IS_ERR(rt)) {
1007 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1008 		ip_rt_put(rt);
1009 	}
1010 }
1011 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1012 
1013 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1014 {
1015 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1016 	struct flowi4 fl4;
1017 	struct rtable *rt;
1018 
1019 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1020 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1021 	if (!IS_ERR(rt)) {
1022 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1023 		ip_rt_put(rt);
1024 	}
1025 }
1026 
1027 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1028 {
1029 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1030 	struct flowi4 fl4;
1031 	struct rtable *rt;
1032 	struct dst_entry *dst;
1033 	bool new = false;
1034 
1035 	bh_lock_sock(sk);
1036 
1037 	if (!ip_sk_accept_pmtu(sk))
1038 		goto out;
1039 
1040 	rt = (struct rtable *) __sk_dst_get(sk);
1041 
1042 	if (sock_owned_by_user(sk) || !rt) {
1043 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1044 		goto out;
1045 	}
1046 
1047 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1048 
1049 	if (!__sk_dst_check(sk, 0)) {
1050 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1051 		if (IS_ERR(rt))
1052 			goto out;
1053 
1054 		new = true;
1055 	}
1056 
1057 	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1058 
1059 	dst = dst_check(&rt->dst, 0);
1060 	if (!dst) {
1061 		if (new)
1062 			dst_release(&rt->dst);
1063 
1064 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1065 		if (IS_ERR(rt))
1066 			goto out;
1067 
1068 		new = true;
1069 	}
1070 
1071 	if (new)
1072 		__sk_dst_set(sk, &rt->dst);
1073 
1074 out:
1075 	bh_unlock_sock(sk);
1076 }
1077 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1078 
1079 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1080 		   int oif, u32 mark, u8 protocol, int flow_flags)
1081 {
1082 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1083 	struct flowi4 fl4;
1084 	struct rtable *rt;
1085 
1086 	__build_flow_key(&fl4, NULL, iph, oif,
1087 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1088 	rt = __ip_route_output_key(net, &fl4);
1089 	if (!IS_ERR(rt)) {
1090 		__ip_do_redirect(rt, skb, &fl4, false);
1091 		ip_rt_put(rt);
1092 	}
1093 }
1094 EXPORT_SYMBOL_GPL(ipv4_redirect);
1095 
1096 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1097 {
1098 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1099 	struct flowi4 fl4;
1100 	struct rtable *rt;
1101 
1102 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1103 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1104 	if (!IS_ERR(rt)) {
1105 		__ip_do_redirect(rt, skb, &fl4, false);
1106 		ip_rt_put(rt);
1107 	}
1108 }
1109 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1110 
1111 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1112 {
1113 	struct rtable *rt = (struct rtable *) dst;
1114 
1115 	/* All IPv4 dsts are created with ->obsolete set to the value
1116 	 * DST_OBSOLETE_FORCE_CHK, which forces all validation calls
1117 	 * down into this function.
1118 	 *
1119 	 * When a PMTU/redirect information update invalidates a route,
1120 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1121 	 * DST_OBSOLETE_DEAD by dst_free().
1122 	 */
1123 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1124 		return NULL;
1125 	return dst;
1126 }
1127 
1128 static void ipv4_link_failure(struct sk_buff *skb)
1129 {
1130 	struct rtable *rt;
1131 
1132 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1133 
1134 	rt = skb_rtable(skb);
1135 	if (rt)
1136 		dst_set_expires(&rt->dst, 0);
1137 }
1138 
1139 static int ip_rt_bug(struct sk_buff *skb)
1140 {
1141 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1142 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1143 		 skb->dev ? skb->dev->name : "?");
1144 	kfree_skb(skb);
1145 	WARN_ON(1);
1146 	return 0;
1147 }
1148 
1149 /*
1150    We do not cache the source address of the outgoing interface,
1151    because it is used only by the IP RR, TS and SRR options,
1152    so it is out of the fast path.
1153 
1154    BTW remember: "addr" is allowed to be unaligned
1155    in IP options!
1156  */
1157 
1158 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1159 {
1160 	__be32 src;
1161 
1162 	if (rt_is_output_route(rt))
1163 		src = ip_hdr(skb)->saddr;
1164 	else {
1165 		struct fib_result res;
1166 		struct flowi4 fl4;
1167 		struct iphdr *iph;
1168 
1169 		iph = ip_hdr(skb);
1170 
1171 		memset(&fl4, 0, sizeof(fl4));
1172 		fl4.daddr = iph->daddr;
1173 		fl4.saddr = iph->saddr;
1174 		fl4.flowi4_tos = RT_TOS(iph->tos);
1175 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1176 		fl4.flowi4_iif = skb->dev->ifindex;
1177 		fl4.flowi4_mark = skb->mark;
1178 
1179 		rcu_read_lock();
1180 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1181 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1182 		else
1183 			src = inet_select_addr(rt->dst.dev,
1184 					       rt_nexthop(rt, iph->daddr),
1185 					       RT_SCOPE_UNIVERSE);
1186 		rcu_read_unlock();
1187 	}
1188 	memcpy(addr, &src, 4);
1189 }
1190 
1191 #ifdef CONFIG_IP_ROUTE_CLASSID
1192 static void set_class_tag(struct rtable *rt, u32 tag)
1193 {
1194 	if (!(rt->dst.tclassid & 0xFFFF))
1195 		rt->dst.tclassid |= tag & 0xFFFF;
1196 	if (!(rt->dst.tclassid & 0xFFFF0000))
1197 		rt->dst.tclassid |= tag & 0xFFFF0000;
1198 }
1199 #endif
1200 
1201 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1202 {
1203 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1204 
1205 	if (advmss == 0) {
1206 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1207 			       ip_rt_min_advmss);
1208 		if (advmss > 65535 - 40)
1209 			advmss = 65535 - 40;
1210 	}
1211 	return advmss;
1212 }
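
/* The 40 subtracted above is the minimal IPv4 + TCP header size (20 + 20
 * bytes), so e.g. a standard 1500-byte Ethernet MTU advertises an MSS of
 * 1460.  The 65535 - 40 clamp keeps MSS plus headers within the 16-bit
 * IP total-length field.
 */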
1213 
1214 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1215 {
1216 	const struct rtable *rt = (const struct rtable *) dst;
1217 	unsigned int mtu = rt->rt_pmtu;
1218 
1219 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1220 		mtu = dst_metric_raw(dst, RTAX_MTU);
1221 
1222 	if (mtu)
1223 		return mtu;
1224 
1225 	mtu = dst->dev->mtu;
1226 
1227 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1228 		if (rt->rt_uses_gateway && mtu > 576)
1229 			mtu = 576;
1230 	}
1231 
1232 	return min_t(unsigned int, mtu, IP_MAX_MTU);
1233 }
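
/* The 576-byte clamp applied to locked-MTU routes via a gateway is the
 * minimum reassembly buffer size RFC 1122 requires every IPv4 host to
 * accept: a conservative choice when the path beyond the gateway is
 * unknown.
 */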
1234 
1235 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1236 {
1237 	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1238 	struct fib_nh_exception *fnhe;
1239 	u32 hval;
1240 
1241 	if (!hash)
1242 		return NULL;
1243 
1244 	hval = fnhe_hashfun(daddr);
1245 
1246 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1247 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1248 		if (fnhe->fnhe_daddr == daddr)
1249 			return fnhe;
1250 	}
1251 	return NULL;
1252 }
1253 
1254 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1255 			      __be32 daddr)
1256 {
1257 	bool ret = false;
1258 
1259 	spin_lock_bh(&fnhe_lock);
1260 
1261 	if (daddr == fnhe->fnhe_daddr) {
1262 		struct rtable __rcu **porig;
1263 		struct rtable *orig;
1264 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1265 
1266 		if (rt_is_input_route(rt))
1267 			porig = &fnhe->fnhe_rth_input;
1268 		else
1269 			porig = &fnhe->fnhe_rth_output;
1270 		orig = rcu_dereference(*porig);
1271 
1272 		if (fnhe->fnhe_genid != genid) {
1273 			fnhe->fnhe_genid = genid;
1274 			fnhe->fnhe_gw = 0;
1275 			fnhe->fnhe_pmtu = 0;
1276 			fnhe->fnhe_expires = 0;
1277 			fnhe_flush_routes(fnhe);
1278 			orig = NULL;
1279 		}
1280 		fill_route_from_fnhe(rt, fnhe);
1281 		if (!rt->rt_gateway)
1282 			rt->rt_gateway = daddr;
1283 
1284 		if (!(rt->dst.flags & DST_NOCACHE)) {
1285 			rcu_assign_pointer(*porig, rt);
1286 			if (orig)
1287 				rt_free(orig);
1288 			ret = true;
1289 		}
1290 
1291 		fnhe->fnhe_stamp = jiffies;
1292 	}
1293 	spin_unlock_bh(&fnhe_lock);
1294 
1295 	return ret;
1296 }
1297 
1298 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1299 {
1300 	struct rtable *orig, *prev, **p;
1301 	bool ret = true;
1302 
1303 	if (rt_is_input_route(rt)) {
1304 		p = (struct rtable **)&nh->nh_rth_input;
1305 	} else {
1306 		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1307 	}
1308 	orig = *p;
1309 
1310 	prev = cmpxchg(p, orig, rt);
1311 	if (prev == orig) {
1312 		if (orig)
1313 			rt_free(orig);
1314 	} else
1315 		ret = false;
1316 
1317 	return ret;
1318 }
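
/* Note the lock-free publish above: cmpxchg() installs rt only if *p
 * still holds the value sampled into orig, so of two racing writers
 * exactly one wins.  The loser returns false and its caller marks the
 * route DST_NOCACHE instead (see rt_set_nexthop()); only the winner
 * frees the route it displaced.
 */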
1319 
1320 static DEFINE_SPINLOCK(rt_uncached_lock);
1321 static LIST_HEAD(rt_uncached_list);
1322 
1323 static void rt_add_uncached_list(struct rtable *rt)
1324 {
1325 	spin_lock_bh(&rt_uncached_lock);
1326 	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1327 	spin_unlock_bh(&rt_uncached_lock);
1328 }
1329 
1330 static void ipv4_dst_destroy(struct dst_entry *dst)
1331 {
1332 	struct rtable *rt = (struct rtable *) dst;
1333 
1334 	if (!list_empty(&rt->rt_uncached)) {
1335 		spin_lock_bh(&rt_uncached_lock);
1336 		list_del(&rt->rt_uncached);
1337 		spin_unlock_bh(&rt_uncached_lock);
1338 	}
1339 }
1340 
1341 void rt_flush_dev(struct net_device *dev)
1342 {
1343 	if (!list_empty(&rt_uncached_list)) {
1344 		struct net *net = dev_net(dev);
1345 		struct rtable *rt;
1346 
1347 		spin_lock_bh(&rt_uncached_lock);
1348 		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1349 			if (rt->dst.dev != dev)
1350 				continue;
1351 			rt->dst.dev = net->loopback_dev;
1352 			dev_hold(rt->dst.dev);
1353 			dev_put(dev);
1354 		}
1355 		spin_unlock_bh(&rt_uncached_lock);
1356 	}
1357 }
1358 
1359 static bool rt_cache_valid(const struct rtable *rt)
1360 {
1361 	return	rt &&
1362 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1363 		!rt_is_expired(rt);
1364 }
1365 
1366 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1367 			   const struct fib_result *res,
1368 			   struct fib_nh_exception *fnhe,
1369 			   struct fib_info *fi, u16 type, u32 itag)
1370 {
1371 	bool cached = false;
1372 
1373 	if (fi) {
1374 		struct fib_nh *nh = &FIB_RES_NH(*res);
1375 
1376 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1377 			rt->rt_gateway = nh->nh_gw;
1378 			rt->rt_uses_gateway = 1;
1379 		}
1380 		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1381 #ifdef CONFIG_IP_ROUTE_CLASSID
1382 		rt->dst.tclassid = nh->nh_tclassid;
1383 #endif
1384 		if (unlikely(fnhe))
1385 			cached = rt_bind_exception(rt, fnhe, daddr);
1386 		else if (!(rt->dst.flags & DST_NOCACHE))
1387 			cached = rt_cache_route(nh, rt);
1388 		if (unlikely(!cached)) {
1389 			/* Routes we intend to cache in nexthop exception or
1390 			 * FIB nexthop have the DST_NOCACHE bit clear.
1391 			 * However, if we are unsuccessful at storing this
1392 			 * route into the cache we really need to set it.
1393 			 */
1394 			rt->dst.flags |= DST_NOCACHE;
1395 			if (!rt->rt_gateway)
1396 				rt->rt_gateway = daddr;
1397 			rt_add_uncached_list(rt);
1398 		}
1399 	} else
1400 		rt_add_uncached_list(rt);
1401 
1402 #ifdef CONFIG_IP_ROUTE_CLASSID
1403 #ifdef CONFIG_IP_MULTIPLE_TABLES
1404 	set_class_tag(rt, res->tclassid);
1405 #endif
1406 	set_class_tag(rt, itag);
1407 #endif
1408 }
1409 
1410 static struct rtable *rt_dst_alloc(struct net_device *dev,
1411 				   bool nopolicy, bool noxfrm, bool will_cache)
1412 {
1413 	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1414 			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1415 			 (nopolicy ? DST_NOPOLICY : 0) |
1416 			 (noxfrm ? DST_NOXFRM : 0));
1417 }
1418 
1419 /* called in rcu_read_lock() section */
1420 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1421 				u8 tos, struct net_device *dev, int our)
1422 {
1423 	struct rtable *rth;
1424 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1425 	u32 itag = 0;
1426 	int err;
1427 
1428 	/* Primary sanity checks. */
1429 
1430 	if (in_dev == NULL)
1431 		return -EINVAL;
1432 
1433 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1434 	    skb->protocol != htons(ETH_P_IP))
1435 		goto e_inval;
1436 
1437 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1438 		if (ipv4_is_loopback(saddr))
1439 			goto e_inval;
1440 
1441 	if (ipv4_is_zeronet(saddr)) {
1442 		if (!ipv4_is_local_multicast(daddr))
1443 			goto e_inval;
1444 	} else {
1445 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1446 					  in_dev, &itag);
1447 		if (err < 0)
1448 			goto e_err;
1449 	}
1450 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1451 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1452 	if (!rth)
1453 		goto e_nobufs;
1454 
1455 #ifdef CONFIG_IP_ROUTE_CLASSID
1456 	rth->dst.tclassid = itag;
1457 #endif
1458 	rth->dst.output = ip_rt_bug;
1459 
1460 	rth->rt_genid	= rt_genid_ipv4(dev_net(dev));
1461 	rth->rt_flags	= RTCF_MULTICAST;
1462 	rth->rt_type	= RTN_MULTICAST;
1463 	rth->rt_is_input= 1;
1464 	rth->rt_iif	= 0;
1465 	rth->rt_pmtu	= 0;
1466 	rth->rt_gateway	= 0;
1467 	rth->rt_uses_gateway = 0;
1468 	INIT_LIST_HEAD(&rth->rt_uncached);
1469 	if (our) {
1470 		rth->dst.input= ip_local_deliver;
1471 		rth->rt_flags |= RTCF_LOCAL;
1472 	}
1473 
1474 #ifdef CONFIG_IP_MROUTE
1475 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1476 		rth->dst.input = ip_mr_input;
1477 #endif
1478 	RT_CACHE_STAT_INC(in_slow_mc);
1479 
1480 	skb_dst_set(skb, &rth->dst);
1481 	return 0;
1482 
1483 e_nobufs:
1484 	return -ENOBUFS;
1485 e_inval:
1486 	return -EINVAL;
1487 e_err:
1488 	return err;
1489 }
1490 
1491 
1492 static void ip_handle_martian_source(struct net_device *dev,
1493 				     struct in_device *in_dev,
1494 				     struct sk_buff *skb,
1495 				     __be32 daddr,
1496 				     __be32 saddr)
1497 {
1498 	RT_CACHE_STAT_INC(in_martian_src);
1499 #ifdef CONFIG_IP_ROUTE_VERBOSE
1500 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1501 		/*
1502 		 *	RFC 1812 recommendation: if the source is martian,
1503 		 *	the only hint is the MAC header.
1504 		 */
1505 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1506 			&daddr, &saddr, dev->name);
1507 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1508 			print_hex_dump(KERN_WARNING, "ll header: ",
1509 				       DUMP_PREFIX_OFFSET, 16, 1,
1510 				       skb_mac_header(skb),
1511 				       dev->hard_header_len, true);
1512 		}
1513 	}
1514 #endif
1515 }
1516 
1517 /* called in rcu_read_lock() section */
1518 static int __mkroute_input(struct sk_buff *skb,
1519 			   const struct fib_result *res,
1520 			   struct in_device *in_dev,
1521 			   __be32 daddr, __be32 saddr, u32 tos)
1522 {
1523 	struct fib_nh_exception *fnhe;
1524 	struct rtable *rth;
1525 	int err;
1526 	struct in_device *out_dev;
1527 	unsigned int flags = 0;
1528 	bool do_cache;
1529 	u32 itag;
1530 
1531 	/* get a working reference to the output device */
1532 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1533 	if (out_dev == NULL) {
1534 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1535 		return -EINVAL;
1536 	}
1537 
1538 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1539 				  in_dev->dev, in_dev, &itag);
1540 	if (err < 0) {
1541 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1542 					 saddr);
1543 
1544 		goto cleanup;
1545 	}
1546 
1547 	do_cache = res->fi && !itag;
1548 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1549 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1550 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1551 		flags |= RTCF_DOREDIRECT;
1552 		do_cache = false;
1553 	}
1554 
1555 	if (skb->protocol != htons(ETH_P_IP)) {
1556 		/* Not IP (i.e. ARP). Do not create a route if it is
1557 		 * invalid for proxy arp. DNAT routes are always valid.
1558 		 *
1559 		 * The proxy arp feature has been extended to allow ARP
1560 		 * replies back on the same interface, to support
1561 		 * Private VLAN switch technologies. See arp.c.
1562 		 */
1563 		if (out_dev == in_dev &&
1564 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1565 			err = -EINVAL;
1566 			goto cleanup;
1567 		}
1568 	}
1569 
1570 	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1571 	if (do_cache) {
1572 		if (fnhe != NULL)
1573 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1574 		else
1575 			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1576 
1577 		if (rt_cache_valid(rth)) {
1578 			skb_dst_set_noref(skb, &rth->dst);
1579 			goto out;
1580 		}
1581 	}
1582 
1583 	rth = rt_dst_alloc(out_dev->dev,
1584 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1585 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1586 	if (!rth) {
1587 		err = -ENOBUFS;
1588 		goto cleanup;
1589 	}
1590 
1591 	rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1592 	rth->rt_flags = flags;
1593 	rth->rt_type = res->type;
1594 	rth->rt_is_input = 1;
1595 	rth->rt_iif 	= 0;
1596 	rth->rt_pmtu	= 0;
1597 	rth->rt_gateway	= 0;
1598 	rth->rt_uses_gateway = 0;
1599 	INIT_LIST_HEAD(&rth->rt_uncached);
1600 	RT_CACHE_STAT_INC(in_slow_tot);
1601 
1602 	rth->dst.input = ip_forward;
1603 	rth->dst.output = ip_output;
1604 
1605 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1606 	skb_dst_set(skb, &rth->dst);
1607 out:
1608 	err = 0;
1609  cleanup:
1610 	return err;
1611 }
1612 
1613 static int ip_mkroute_input(struct sk_buff *skb,
1614 			    struct fib_result *res,
1615 			    const struct flowi4 *fl4,
1616 			    struct in_device *in_dev,
1617 			    __be32 daddr, __be32 saddr, u32 tos)
1618 {
1619 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1620 	if (res->fi && res->fi->fib_nhs > 1)
1621 		fib_select_multipath(res);
1622 #endif
1623 
1624 	/* create a routing cache entry */
1625 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1626 }
1627 
1628 /*
1629  *	NOTE. We drop all packets that have a local source
1630  *	address, because every properly looped-back packet must
1631  *	already have the correct destination attached by the output routine.
1632  *
1633  *	This approach solves two big problems:
1634  *	1. Non-simplex devices are handled properly.
1635  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1636  *	Called with rcu_read_lock().
1637  */
1638 
1639 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1640 			       u8 tos, struct net_device *dev)
1641 {
1642 	struct fib_result res;
1643 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1644 	struct flowi4	fl4;
1645 	unsigned int	flags = 0;
1646 	u32		itag = 0;
1647 	struct rtable	*rth;
1648 	int		err = -EINVAL;
1649 	struct net    *net = dev_net(dev);
1650 	bool do_cache;
1651 
1652 	/* IP on this device is disabled. */
1653 
1654 	if (!in_dev)
1655 		goto out;
1656 
1657 	/* Check for the most weird martians, which cannot be detected
1658 	   by fib_lookup.
1659 	 */
1660 
1661 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1662 		goto martian_source;
1663 
1664 	res.fi = NULL;
1665 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1666 		goto brd_input;
1667 
1668 	/* Accept zero addresses only to limited broadcast;
1669 	 * I am not even sure whether to fix this. Waiting for complaints :-)
1670 	 */
1671 	if (ipv4_is_zeronet(saddr))
1672 		goto martian_source;
1673 
1674 	if (ipv4_is_zeronet(daddr))
1675 		goto martian_destination;
1676 
1677 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1678 	 * more than once, when daddr and/or saddr are loopback addresses.
1679 	 */
1680 	if (ipv4_is_loopback(daddr)) {
1681 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1682 			goto martian_destination;
1683 	} else if (ipv4_is_loopback(saddr)) {
1684 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1685 			goto martian_source;
1686 	}
1687 
1688 	/*
1689 	 *	Now we are ready to route the packet.
1690 	 */
1691 	fl4.flowi4_oif = 0;
1692 	fl4.flowi4_iif = dev->ifindex;
1693 	fl4.flowi4_mark = skb->mark;
1694 	fl4.flowi4_tos = tos;
1695 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1696 	fl4.daddr = daddr;
1697 	fl4.saddr = saddr;
1698 	err = fib_lookup(net, &fl4, &res);
1699 	if (err != 0) {
1700 		if (!IN_DEV_FORWARD(in_dev))
1701 			err = -EHOSTUNREACH;
1702 		goto no_route;
1703 	}
1704 
1705 	if (res.type == RTN_BROADCAST)
1706 		goto brd_input;
1707 
1708 	if (res.type == RTN_LOCAL) {
1709 		err = fib_validate_source(skb, saddr, daddr, tos,
1710 					  LOOPBACK_IFINDEX,
1711 					  dev, in_dev, &itag);
1712 		if (err < 0)
1713 			goto martian_source_keep_err;
1714 		goto local_input;
1715 	}
1716 
1717 	if (!IN_DEV_FORWARD(in_dev)) {
1718 		err = -EHOSTUNREACH;
1719 		goto no_route;
1720 	}
1721 	if (res.type != RTN_UNICAST)
1722 		goto martian_destination;
1723 
1724 	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1725 out:	return err;
1726 
1727 brd_input:
1728 	if (skb->protocol != htons(ETH_P_IP))
1729 		goto e_inval;
1730 
1731 	if (!ipv4_is_zeronet(saddr)) {
1732 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1733 					  in_dev, &itag);
1734 		if (err < 0)
1735 			goto martian_source_keep_err;
1736 	}
1737 	flags |= RTCF_BROADCAST;
1738 	res.type = RTN_BROADCAST;
1739 	RT_CACHE_STAT_INC(in_brd);
1740 
1741 local_input:
1742 	do_cache = false;
1743 	if (res.fi) {
1744 		if (!itag) {
1745 			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1746 			if (rt_cache_valid(rth)) {
1747 				skb_dst_set_noref(skb, &rth->dst);
1748 				err = 0;
1749 				goto out;
1750 			}
1751 			do_cache = true;
1752 		}
1753 	}
1754 
1755 	rth = rt_dst_alloc(net->loopback_dev,
1756 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1757 	if (!rth)
1758 		goto e_nobufs;
1759 
1760 	rth->dst.input= ip_local_deliver;
1761 	rth->dst.output= ip_rt_bug;
1762 #ifdef CONFIG_IP_ROUTE_CLASSID
1763 	rth->dst.tclassid = itag;
1764 #endif
1765 
1766 	rth->rt_genid = rt_genid_ipv4(net);
1767 	rth->rt_flags 	= flags|RTCF_LOCAL;
1768 	rth->rt_type	= res.type;
1769 	rth->rt_is_input = 1;
1770 	rth->rt_iif	= 0;
1771 	rth->rt_pmtu	= 0;
1772 	rth->rt_gateway	= 0;
1773 	rth->rt_uses_gateway = 0;
1774 	INIT_LIST_HEAD(&rth->rt_uncached);
1775 	RT_CACHE_STAT_INC(in_slow_tot);
1776 	if (res.type == RTN_UNREACHABLE) {
1777 		rth->dst.input= ip_error;
1778 		rth->dst.error= -err;
1779 		rth->rt_flags 	&= ~RTCF_LOCAL;
1780 	}
1781 	if (do_cache) {
1782 		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1783 			rth->dst.flags |= DST_NOCACHE;
1784 			rt_add_uncached_list(rth);
1785 		}
1786 	}
1787 	skb_dst_set(skb, &rth->dst);
1788 	err = 0;
1789 	goto out;
1790 
1791 no_route:
1792 	RT_CACHE_STAT_INC(in_no_route);
1793 	res.type = RTN_UNREACHABLE;
1794 	if (err == -ESRCH)
1795 		err = -ENETUNREACH;
1796 	goto local_input;
1797 
1798 	/*
1799 	 *	Do not cache martian addresses: they should be logged (RFC1812)
1800 	 */
1801 martian_destination:
1802 	RT_CACHE_STAT_INC(in_martian_dst);
1803 #ifdef CONFIG_IP_ROUTE_VERBOSE
1804 	if (IN_DEV_LOG_MARTIANS(in_dev))
1805 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1806 				     &daddr, &saddr, dev->name);
1807 #endif
1808 
1809 e_inval:
1810 	err = -EINVAL;
1811 	goto out;
1812 
1813 e_nobufs:
1814 	err = -ENOBUFS;
1815 	goto out;
1816 
1817 martian_source:
1818 	err = -EINVAL;
1819 martian_source_keep_err:
1820 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1821 	goto out;
1822 }
1823 
1824 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1825 			 u8 tos, struct net_device *dev)
1826 {
1827 	int res;
1828 
1829 	rcu_read_lock();
1830 
1831 	/* Multicast recognition logic was moved from the route cache to here.
1832 	   The problem was that too many Ethernet cards have broken/missing
1833 	   hardware multicast filters :-( As a result, a host on a multicast
1834 	   network acquires a lot of useless route cache entries, e.g. for
1835 	   SDR messages from all over the world. Now we try to get rid of them.
1836 	   Really, provided the software IP multicast filter is organized
1837 	   reasonably (at least, hashed), this does not cause a slowdown
1838 	   compared with route cache reject entries.
1839 	   Note that multicast routers are not affected, because a
1840 	   route cache entry is created eventually.
1841 	 */
1842 	if (ipv4_is_multicast(daddr)) {
1843 		struct in_device *in_dev = __in_dev_get_rcu(dev);
1844 
1845 		if (in_dev) {
1846 			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1847 						  ip_hdr(skb)->protocol);
1848 			if (our
1849 #ifdef CONFIG_IP_MROUTE
1850 				||
1851 			    (!ipv4_is_local_multicast(daddr) &&
1852 			     IN_DEV_MFORWARD(in_dev))
1853 #endif
1854 			   ) {
1855 				int res = ip_route_input_mc(skb, daddr, saddr,
1856 							    tos, dev, our);
1857 				rcu_read_unlock();
1858 				return res;
1859 			}
1860 		}
1861 		rcu_read_unlock();
1862 		return -EINVAL;
1863 	}
1864 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1865 	rcu_read_unlock();
1866 	return res;
1867 }
1868 EXPORT_SYMBOL(ip_route_input_noref);
1869 
1870 /* called with rcu_read_lock() */
1871 static struct rtable *__mkroute_output(const struct fib_result *res,
1872 				       const struct flowi4 *fl4, int orig_oif,
1873 				       struct net_device *dev_out,
1874 				       unsigned int flags)
1875 {
1876 	struct fib_info *fi = res->fi;
1877 	struct fib_nh_exception *fnhe;
1878 	struct in_device *in_dev;
1879 	u16 type = res->type;
1880 	struct rtable *rth;
1881 	bool do_cache;
1882 
1883 	in_dev = __in_dev_get_rcu(dev_out);
1884 	if (!in_dev)
1885 		return ERR_PTR(-EINVAL);
1886 
1887 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1888 		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1889 			return ERR_PTR(-EINVAL);
1890 
1891 	if (ipv4_is_lbcast(fl4->daddr))
1892 		type = RTN_BROADCAST;
1893 	else if (ipv4_is_multicast(fl4->daddr))
1894 		type = RTN_MULTICAST;
1895 	else if (ipv4_is_zeronet(fl4->daddr))
1896 		return ERR_PTR(-EINVAL);
1897 
1898 	if (dev_out->flags & IFF_LOOPBACK)
1899 		flags |= RTCF_LOCAL;
1900 
1901 	do_cache = true;
1902 	if (type == RTN_BROADCAST) {
1903 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
1904 		fi = NULL;
1905 	} else if (type == RTN_MULTICAST) {
1906 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
1907 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1908 				     fl4->flowi4_proto))
1909 			flags &= ~RTCF_LOCAL;
1910 		else
1911 			do_cache = false;
1912 		/* If a multicast route does not exist, use the
1913 		 * default one, but do not use a gateway in this case.
1914 		 * Yes, it is a hack.
1915 		 */
1916 		if (fi && res->prefixlen < 4)
1917 			fi = NULL;
1918 	}
1919 
1920 	fnhe = NULL;
1921 	do_cache &= fi != NULL;
1922 	if (do_cache) {
1923 		struct rtable __rcu **prth;
1924 		struct fib_nh *nh = &FIB_RES_NH(*res);
1925 
1926 		fnhe = find_exception(nh, fl4->daddr);
1927 		if (fnhe)
1928 			prth = &fnhe->fnhe_rth_output;
1929 		else {
1930 			if (unlikely(fl4->flowi4_flags &
1931 				     FLOWI_FLAG_KNOWN_NH &&
1932 				     !(nh->nh_gw &&
1933 				       nh->nh_scope == RT_SCOPE_LINK))) {
1934 				do_cache = false;
1935 				goto add;
1936 			}
1937 			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1938 		}
1939 		rth = rcu_dereference(*prth);
1940 		if (rt_cache_valid(rth)) {
1941 			dst_hold(&rth->dst);
1942 			return rth;
1943 		}
1944 	}
1945 
1946 add:
1947 	rth = rt_dst_alloc(dev_out,
1948 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1949 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
1950 			   do_cache);
1951 	if (!rth)
1952 		return ERR_PTR(-ENOBUFS);
1953 
1954 	rth->dst.output = ip_output;
1955 
1956 	rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1957 	rth->rt_flags	= flags;
1958 	rth->rt_type	= type;
1959 	rth->rt_is_input = 0;
1960 	rth->rt_iif	= orig_oif ? : 0;
1961 	rth->rt_pmtu	= 0;
1962 	rth->rt_gateway = 0;
1963 	rth->rt_uses_gateway = 0;
1964 	INIT_LIST_HEAD(&rth->rt_uncached);
1965 
1966 	RT_CACHE_STAT_INC(out_slow_tot);
1967 
1968 	if (flags & RTCF_LOCAL)
1969 		rth->dst.input = ip_local_deliver;
1970 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1971 		if (flags & RTCF_LOCAL &&
1972 		    !(dev_out->flags & IFF_LOOPBACK)) {
1973 			rth->dst.output = ip_mc_output;
1974 			RT_CACHE_STAT_INC(out_slow_mc);
1975 		}
1976 #ifdef CONFIG_IP_MROUTE
1977 		if (type == RTN_MULTICAST) {
1978 			if (IN_DEV_MFORWARD(in_dev) &&
1979 			    !ipv4_is_local_multicast(fl4->daddr)) {
1980 				rth->dst.input = ip_mr_input;
1981 				rth->dst.output = ip_mc_output;
1982 			}
1983 		}
1984 #endif
1985 	}
1986 
1987 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1988 
1989 	return rth;
1990 }
1991 
1992 /*
1993  * Major route resolver routine.
1994  */
1995 
1996 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1997 {
1998 	struct net_device *dev_out = NULL;
1999 	__u8 tos = RT_FL_TOS(fl4);
2000 	unsigned int flags = 0;
2001 	struct fib_result res;
2002 	struct rtable *rth;
2003 	int orig_oif;
2004 
2005 	res.tclassid	= 0;
2006 	res.fi		= NULL;
2007 	res.table	= NULL;
2008 
2009 	orig_oif = fl4->flowi4_oif;
2010 
2011 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2012 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2013 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2014 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2015 
2016 	rcu_read_lock();
2017 	if (fl4->saddr) {
2018 		rth = ERR_PTR(-EINVAL);
2019 		if (ipv4_is_multicast(fl4->saddr) ||
2020 		    ipv4_is_lbcast(fl4->saddr) ||
2021 		    ipv4_is_zeronet(fl4->saddr))
2022 			goto out;
2023 
2024 		/* I removed the check for oif == dev_out->oif here.
2025 		   It was wrong for two reasons:
2026 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2027 		      is assigned to multiple interfaces.
2028 		   2. Moreover, we are allowed to send packets with a saddr
2029 		      of another iface. --ANK
2030 		 */
2031 
2032 		if (fl4->flowi4_oif == 0 &&
2033 		    (ipv4_is_multicast(fl4->daddr) ||
2034 		     ipv4_is_lbcast(fl4->daddr))) {
2035 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2036 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2037 			if (dev_out == NULL)
2038 				goto out;
2039 
2040 			/* Special hack: the user can direct multicasts
2041 			   and limited broadcasts via the necessary interface
2042 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2043 			   This hack is not just for fun, it allows
2044 			   vic, vat and friends to work.
2045 			   They bind a socket to loopback, set the ttl to zero
2046 			   and expect that it will work.
2047 			   From the viewpoint of the routing cache they are broken,
2048 			   because we are not allowed to build a multicast path
2049 			   with a loopback source addr (look, the routing cache
2050 			   cannot know that the ttl is zero, so the packet
2051 			   will not leave this host and the route is valid).
2052 			   Luckily, this hack is a good workaround.
2053 			 */
2054 
2055 			fl4->flowi4_oif = dev_out->ifindex;
2056 			goto make_route;
2057 		}
2058 
2059 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2060 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2061 			if (!__ip_dev_find(net, fl4->saddr, false))
2062 				goto out;
2063 		}
2064 	}
2065 
2066 
2067 	if (fl4->flowi4_oif) {
2068 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2069 		rth = ERR_PTR(-ENODEV);
2070 		if (dev_out == NULL)
2071 			goto out;
2072 
2073 		/* RACE: Check return value of inet_select_addr instead. */
2074 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2075 			rth = ERR_PTR(-ENETUNREACH);
2076 			goto out;
2077 		}
2078 		if (ipv4_is_local_multicast(fl4->daddr) ||
2079 		    ipv4_is_lbcast(fl4->daddr)) {
2080 			if (!fl4->saddr)
2081 				fl4->saddr = inet_select_addr(dev_out, 0,
2082 							      RT_SCOPE_LINK);
2083 			goto make_route;
2084 		}
2085 		if (!fl4->saddr) {
2086 			if (ipv4_is_multicast(fl4->daddr))
2087 				fl4->saddr = inet_select_addr(dev_out, 0,
2088 							      fl4->flowi4_scope);
2089 			else if (!fl4->daddr)
2090 				fl4->saddr = inet_select_addr(dev_out, 0,
2091 							      RT_SCOPE_HOST);
2092 		}
2093 	}
2094 
2095 	if (!fl4->daddr) {
2096 		fl4->daddr = fl4->saddr;
2097 		if (!fl4->daddr)
2098 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2099 		dev_out = net->loopback_dev;
2100 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2101 		res.type = RTN_LOCAL;
2102 		flags |= RTCF_LOCAL;
2103 		goto make_route;
2104 	}
2105 
2106 	if (fib_lookup(net, fl4, &res)) {
2107 		res.fi = NULL;
2108 		res.table = NULL;
2109 		if (fl4->flowi4_oif) {
2110 			/* Apparently, the routing tables are wrong. Assume
2111 			   that the destination is on-link.
2112 
2113 			   WHY? DW.
2114 			   Because we are allowed to send to an iface
2115 			   even if it has NO routes and NO assigned
2116 			   addresses. When oif is specified, routing
2117 			   tables are looked up with only one purpose:
2118 			   to catch whether the destination is gatewayed
2119 			   rather than direct. Moreover, if MSG_DONTROUTE is set,
2120 			   we send the packet, ignoring both routing tables
2121 			   and ifaddr state. --ANK
2122 
2123 			   We could do this even if oif is unknown
2124 			   (IPv6 likely does), but we do not.
2126 			 */
2127 
2128 			if (fl4->saddr == 0)
2129 				fl4->saddr = inet_select_addr(dev_out, 0,
2130 							      RT_SCOPE_LINK);
2131 			res.type = RTN_UNICAST;
2132 			goto make_route;
2133 		}
2134 		rth = ERR_PTR(-ENETUNREACH);
2135 		goto out;
2136 	}
2137 
2138 	if (res.type == RTN_LOCAL) {
2139 		if (!fl4->saddr) {
2140 			if (res.fi->fib_prefsrc)
2141 				fl4->saddr = res.fi->fib_prefsrc;
2142 			else
2143 				fl4->saddr = fl4->daddr;
2144 		}
2145 		dev_out = net->loopback_dev;
2146 		fl4->flowi4_oif = dev_out->ifindex;
2147 		flags |= RTCF_LOCAL;
2148 		goto make_route;
2149 	}
2150 
2151 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2152 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2153 		fib_select_multipath(&res);
2154 	else
2155 #endif
2156 	if (!res.prefixlen &&
2157 	    res.table->tb_num_default > 1 &&
2158 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2159 		fib_select_default(&res);
2160 
2161 	if (!fl4->saddr)
2162 		fl4->saddr = FIB_RES_PREFSRC(net, res);
2163 
2164 	dev_out = FIB_RES_DEV(res);
2165 	fl4->flowi4_oif = dev_out->ifindex;
2166 
2167 
2168 make_route:
2169 	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2170 
2171 out:
2172 	rcu_read_unlock();
2173 	return rth;
2174 }
2175 EXPORT_SYMBOL_GPL(__ip_route_output_key);
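
/*
 * Editor's sketch (not part of the original file): typical use of the
 * resolver above. A caller fills a struct flowi4 key and receives either
 * a held rtable or an ERR_PTR(). The helper name is hypothetical.
 */
static struct rtable *__maybe_unused
example_lookup_output_route(struct net *net, __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;	/* destination to route towards */
	fl4.saddr = saddr;	/* may be 0; the resolver selects one */

	/* On success the rtable is held; release it with ip_rt_put(). */
	return __ip_route_output_key(net, &fl4);
}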
2176 
2177 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2178 {
2179 	return NULL;
2180 }
2181 
2182 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2183 {
2184 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2185 
2186 	return mtu ? : dst->dev->mtu;
2187 }
2188 
2189 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2190 					  struct sk_buff *skb, u32 mtu)
2191 {
2192 }
2193 
2194 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2195 				       struct sk_buff *skb)
2196 {
2197 }
2198 
2199 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2200 					  unsigned long old)
2201 {
2202 	return NULL;
2203 }
2204 
2205 static struct dst_ops ipv4_dst_blackhole_ops = {
2206 	.family			=	AF_INET,
2207 	.protocol		=	cpu_to_be16(ETH_P_IP),
2208 	.check			=	ipv4_blackhole_dst_check,
2209 	.mtu			=	ipv4_blackhole_mtu,
2210 	.default_advmss		=	ipv4_default_advmss,
2211 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2212 	.redirect		=	ipv4_rt_blackhole_redirect,
2213 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2214 	.neigh_lookup		=	ipv4_neigh_lookup,
2215 };
2216 
2217 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2218 {
2219 	struct rtable *ort = (struct rtable *) dst_orig;
2220 	struct rtable *rt;
2221 
2222 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2223 	if (rt) {
2224 		struct dst_entry *new = &rt->dst;
2225 
2226 		new->__use = 1;
2227 		new->input = dst_discard;
2228 		new->output = dst_discard;
2229 
2230 		new->dev = ort->dst.dev;
2231 		if (new->dev)
2232 			dev_hold(new->dev);
2233 
2234 		rt->rt_is_input = ort->rt_is_input;
2235 		rt->rt_iif = ort->rt_iif;
2236 		rt->rt_pmtu = ort->rt_pmtu;
2237 
2238 		rt->rt_genid = rt_genid_ipv4(net);
2239 		rt->rt_flags = ort->rt_flags;
2240 		rt->rt_type = ort->rt_type;
2241 		rt->rt_gateway = ort->rt_gateway;
2242 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2243 
2244 		INIT_LIST_HEAD(&rt->rt_uncached);
2245 
2246 		dst_free(new);
2247 	}
2248 
2249 	dst_release(dst_orig);
2250 
2251 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2252 }
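
/*
 * Editor's sketch (not part of the original file): a caller that swaps a
 * live route for a blackhole copy, e.g. when traffic must be silently
 * discarded while flow state is kept. Purely illustrative.
 */
static struct dst_entry *__maybe_unused
example_make_blackhole(struct net *net, struct rtable *rt)
{
	/* Consumes the reference on rt->dst and returns a dst whose
	 * input/output handlers are dst_discard, or ERR_PTR(-ENOMEM).
	 */
	return ipv4_blackhole_route(net, &rt->dst);
}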
2253 
2254 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2255 				    struct sock *sk)
2256 {
2257 	struct rtable *rt = __ip_route_output_key(net, flp4);
2258 
2259 	if (IS_ERR(rt))
2260 		return rt;
2261 
2262 	if (flp4->flowi4_proto)
2263 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2264 						   flowi4_to_flowi(flp4),
2265 						   sk, 0);
2266 
2267 	return rt;
2268 }
2269 EXPORT_SYMBOL_GPL(ip_route_output_flow);
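
/*
 * Editor's sketch (not part of the original file): how a socket-level
 * caller might combine the route lookup with xfrm via
 * ip_route_output_flow(). Field values are illustrative only.
 */
static int __maybe_unused example_connect_route(struct net *net,
						struct sock *sk, __be32 daddr)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.flowi4_proto = IPPROTO_UDP;	/* non-zero proto enables xfrm_lookup() */

	rt = ip_route_output_flow(net, &fl4, sk);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	ip_rt_put(rt);			/* drop the reference when done */
	return 0;
}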
2270 
2271 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2272 			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2273 			u32 seq, int event, int nowait, unsigned int flags)
2274 {
2275 	struct rtable *rt = skb_rtable(skb);
2276 	struct rtmsg *r;
2277 	struct nlmsghdr *nlh;
2278 	unsigned long expires = 0;
2279 	u32 error;
2280 	u32 metrics[RTAX_MAX];
2281 
2282 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2283 	if (nlh == NULL)
2284 		return -EMSGSIZE;
2285 
2286 	r = nlmsg_data(nlh);
2287 	r->rtm_family	 = AF_INET;
2288 	r->rtm_dst_len	= 32;
2289 	r->rtm_src_len	= 0;
2290 	r->rtm_tos	= fl4->flowi4_tos;
2291 	r->rtm_table	= RT_TABLE_MAIN;
2292 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2293 		goto nla_put_failure;
2294 	r->rtm_type	= rt->rt_type;
2295 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2296 	r->rtm_protocol = RTPROT_UNSPEC;
2297 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2298 	if (rt->rt_flags & RTCF_NOTIFY)
2299 		r->rtm_flags |= RTM_F_NOTIFY;
2300 
2301 	if (nla_put_be32(skb, RTA_DST, dst))
2302 		goto nla_put_failure;
2303 	if (src) {
2304 		r->rtm_src_len = 32;
2305 		if (nla_put_be32(skb, RTA_SRC, src))
2306 			goto nla_put_failure;
2307 	}
2308 	if (rt->dst.dev &&
2309 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2310 		goto nla_put_failure;
2311 #ifdef CONFIG_IP_ROUTE_CLASSID
2312 	if (rt->dst.tclassid &&
2313 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2314 		goto nla_put_failure;
2315 #endif
2316 	if (!rt_is_input_route(rt) &&
2317 	    fl4->saddr != src) {
2318 		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2319 			goto nla_put_failure;
2320 	}
2321 	if (rt->rt_uses_gateway &&
2322 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2323 		goto nla_put_failure;
2324 
2325 	expires = rt->dst.expires;
2326 	if (expires) {
2327 		unsigned long now = jiffies;
2328 
2329 		if (time_before(now, expires))
2330 			expires -= now;
2331 		else
2332 			expires = 0;
2333 	}
2334 
2335 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2336 	if (rt->rt_pmtu && expires)
2337 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2338 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2339 		goto nla_put_failure;
2340 
2341 	if (fl4->flowi4_mark &&
2342 	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2343 		goto nla_put_failure;
2344 
2345 	error = rt->dst.error;
2346 
2347 	if (rt_is_input_route(rt)) {
2348 #ifdef CONFIG_IP_MROUTE
2349 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2350 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2351 			int err = ipmr_get_route(net, skb,
2352 						 fl4->saddr, fl4->daddr,
2353 						 r, nowait);
2354 			if (err <= 0) {
2355 				if (!nowait) {
2356 					if (err == 0)
2357 						return 0;
2358 					goto nla_put_failure;
2359 				} else {
2360 					if (err == -EMSGSIZE)
2361 						goto nla_put_failure;
2362 					error = err;
2363 				}
2364 			}
2365 		} else
2366 #endif
2367 			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2368 				goto nla_put_failure;
2369 	}
2370 
2371 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2372 		goto nla_put_failure;
2373 
2374 	return nlmsg_end(skb, nlh);
2375 
2376 nla_put_failure:
2377 	nlmsg_cancel(skb, nlh);
2378 	return -EMSGSIZE;
2379 }
2380 
2381 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2382 {
2383 	struct net *net = sock_net(in_skb->sk);
2384 	struct rtmsg *rtm;
2385 	struct nlattr *tb[RTA_MAX+1];
2386 	struct rtable *rt = NULL;
2387 	struct flowi4 fl4;
2388 	__be32 dst = 0;
2389 	__be32 src = 0;
2390 	u32 iif;
2391 	int err;
2392 	int mark;
2393 	struct sk_buff *skb;
2394 
2395 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2396 	if (err < 0)
2397 		goto errout;
2398 
2399 	rtm = nlmsg_data(nlh);
2400 
2401 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2402 	if (skb == NULL) {
2403 		err = -ENOBUFS;
2404 		goto errout;
2405 	}
2406 
2407 	/* Reserve room for dummy headers; this skb can pass
2408 	   through a good chunk of the routing engine.
2409 	 */
2410 	skb_reset_mac_header(skb);
2411 	skb_reset_network_header(skb);
2412 
2413 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2414 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2415 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2416 
2417 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2418 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2419 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2420 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2421 
2422 	memset(&fl4, 0, sizeof(fl4));
2423 	fl4.daddr = dst;
2424 	fl4.saddr = src;
2425 	fl4.flowi4_tos = rtm->rtm_tos;
2426 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2427 	fl4.flowi4_mark = mark;
2428 
2429 	if (iif) {
2430 		struct net_device *dev;
2431 
2432 		dev = __dev_get_by_index(net, iif);
2433 		if (dev == NULL) {
2434 			err = -ENODEV;
2435 			goto errout_free;
2436 		}
2437 
2438 		skb->protocol	= htons(ETH_P_IP);
2439 		skb->dev	= dev;
2440 		skb->mark	= mark;
2441 		local_bh_disable();
2442 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2443 		local_bh_enable();
2444 
2445 		rt = skb_rtable(skb);
2446 		if (err == 0 && rt->dst.error)
2447 			err = -rt->dst.error;
2448 	} else {
2449 		rt = ip_route_output_key(net, &fl4);
2450 
2451 		err = 0;
2452 		if (IS_ERR(rt))
2453 			err = PTR_ERR(rt);
2454 	}
2455 
2456 	if (err)
2457 		goto errout_free;
2458 
2459 	skb_dst_set(skb, &rt->dst);
2460 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2461 		rt->rt_flags |= RTCF_NOTIFY;
2462 
2463 	err = rt_fill_info(net, dst, src, &fl4, skb,
2464 			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2465 			   RTM_NEWROUTE, 0, 0);
2466 	if (err <= 0)
2467 		goto errout_free;
2468 
2469 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2470 errout:
2471 	return err;
2472 
2473 errout_free:
2474 	kfree_skb(skb);
2475 	goto errout;
2476 }
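
/*
 * Editor's sketch (not part of the original file): a minimal user-space
 * RTM_GETROUTE request against the handler above. This is user-space C,
 * kept #if 0'd out of the kernel build; the helper name is hypothetical
 * and error handling is trimmed.
 */
#if 0
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static int example_query_route(const char *dst)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		char buf[64];
	} req;
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	struct rtattr *rta;
	char reply[4096];
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET;

	/* Append an RTA_DST attribute holding the IPv4 destination. */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(4);
	inet_pton(AF_INET, dst, RTA_DATA(rta));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_LENGTH(4);

	sendto(fd, &req, req.nlh.nlmsg_len, 0,
	       (struct sockaddr *)&sa, sizeof(sa));
	/* The kernel answers with an RTM_NEWROUTE built by rt_fill_info(). */
	recv(fd, reply, sizeof(reply), 0);
	close(fd);
	return 0;
}
#endif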
2477 
2478 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2479 {
2480 	return skb->len;
2481 }
2482 
2483 void ip_rt_multicast_event(struct in_device *in_dev)
2484 {
2485 	rt_cache_flush(dev_net(in_dev->dev));
2486 }
2487 
2488 #ifdef CONFIG_SYSCTL
2489 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
2490 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
2491 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2492 static int ip_rt_gc_elasticity __read_mostly	= 8;
2493 
2494 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2495 					void __user *buffer,
2496 					size_t *lenp, loff_t *ppos)
2497 {
2498 	struct net *net = (struct net *)__ctl->extra1;
2499 
2500 	if (write) {
2501 		rt_cache_flush(net);
2502 		fnhe_genid_bump(net);
2503 		return 0;
2504 	}
2505 
2506 	return -EINVAL;
2507 }
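
/*
 * Editor's sketch (not part of the original file): the handler above is
 * write-only; any write flushes the cache and bumps the fnhe genid. From
 * user space (kept #if 0'd out; helper name hypothetical):
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static void example_flush_route_cache(void)
{
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

	if (fd >= 0) {
		/* The value is ignored; the write itself triggers
		 * ipv4_sysctl_rtcache_flush().
		 */
		write(fd, "1", 1);
		close(fd);
	}
}
#endif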
2508 
2509 static struct ctl_table ipv4_route_table[] = {
2510 	{
2511 		.procname	= "gc_thresh",
2512 		.data		= &ipv4_dst_ops.gc_thresh,
2513 		.maxlen		= sizeof(int),
2514 		.mode		= 0644,
2515 		.proc_handler	= proc_dointvec,
2516 	},
2517 	{
2518 		.procname	= "max_size",
2519 		.data		= &ip_rt_max_size,
2520 		.maxlen		= sizeof(int),
2521 		.mode		= 0644,
2522 		.proc_handler	= proc_dointvec,
2523 	},
2524 	{
2525 		/* Deprecated. Use gc_min_interval_ms */
2527 		.procname	= "gc_min_interval",
2528 		.data		= &ip_rt_gc_min_interval,
2529 		.maxlen		= sizeof(int),
2530 		.mode		= 0644,
2531 		.proc_handler	= proc_dointvec_jiffies,
2532 	},
2533 	{
2534 		.procname	= "gc_min_interval_ms",
2535 		.data		= &ip_rt_gc_min_interval,
2536 		.maxlen		= sizeof(int),
2537 		.mode		= 0644,
2538 		.proc_handler	= proc_dointvec_ms_jiffies,
2539 	},
2540 	{
2541 		.procname	= "gc_timeout",
2542 		.data		= &ip_rt_gc_timeout,
2543 		.maxlen		= sizeof(int),
2544 		.mode		= 0644,
2545 		.proc_handler	= proc_dointvec_jiffies,
2546 	},
2547 	{
2548 		.procname	= "gc_interval",
2549 		.data		= &ip_rt_gc_interval,
2550 		.maxlen		= sizeof(int),
2551 		.mode		= 0644,
2552 		.proc_handler	= proc_dointvec_jiffies,
2553 	},
2554 	{
2555 		.procname	= "redirect_load",
2556 		.data		= &ip_rt_redirect_load,
2557 		.maxlen		= sizeof(int),
2558 		.mode		= 0644,
2559 		.proc_handler	= proc_dointvec,
2560 	},
2561 	{
2562 		.procname	= "redirect_number",
2563 		.data		= &ip_rt_redirect_number,
2564 		.maxlen		= sizeof(int),
2565 		.mode		= 0644,
2566 		.proc_handler	= proc_dointvec,
2567 	},
2568 	{
2569 		.procname	= "redirect_silence",
2570 		.data		= &ip_rt_redirect_silence,
2571 		.maxlen		= sizeof(int),
2572 		.mode		= 0644,
2573 		.proc_handler	= proc_dointvec,
2574 	},
2575 	{
2576 		.procname	= "error_cost",
2577 		.data		= &ip_rt_error_cost,
2578 		.maxlen		= sizeof(int),
2579 		.mode		= 0644,
2580 		.proc_handler	= proc_dointvec,
2581 	},
2582 	{
2583 		.procname	= "error_burst",
2584 		.data		= &ip_rt_error_burst,
2585 		.maxlen		= sizeof(int),
2586 		.mode		= 0644,
2587 		.proc_handler	= proc_dointvec,
2588 	},
2589 	{
2590 		.procname	= "gc_elasticity",
2591 		.data		= &ip_rt_gc_elasticity,
2592 		.maxlen		= sizeof(int),
2593 		.mode		= 0644,
2594 		.proc_handler	= proc_dointvec,
2595 	},
2596 	{
2597 		.procname	= "mtu_expires",
2598 		.data		= &ip_rt_mtu_expires,
2599 		.maxlen		= sizeof(int),
2600 		.mode		= 0644,
2601 		.proc_handler	= proc_dointvec_jiffies,
2602 	},
2603 	{
2604 		.procname	= "min_pmtu",
2605 		.data		= &ip_rt_min_pmtu,
2606 		.maxlen		= sizeof(int),
2607 		.mode		= 0644,
2608 		.proc_handler	= proc_dointvec,
2609 	},
2610 	{
2611 		.procname	= "min_adv_mss",
2612 		.data		= &ip_rt_min_advmss,
2613 		.maxlen		= sizeof(int),
2614 		.mode		= 0644,
2615 		.proc_handler	= proc_dointvec,
2616 	},
2617 	{ }
2618 };
2619 
2620 static struct ctl_table ipv4_route_flush_table[] = {
2621 	{
2622 		.procname	= "flush",
2623 		.maxlen		= sizeof(int),
2624 		.mode		= 0200,
2625 		.proc_handler	= ipv4_sysctl_rtcache_flush,
2626 	},
2627 	{ },
2628 };
2629 
2630 static __net_init int sysctl_route_net_init(struct net *net)
2631 {
2632 	struct ctl_table *tbl;
2633 
2634 	tbl = ipv4_route_flush_table;
2635 	if (!net_eq(net, &init_net)) {
2636 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2637 		if (tbl == NULL)
2638 			goto err_dup;
2639 
2640 		/* Don't export sysctls to unprivileged users */
2641 		if (net->user_ns != &init_user_ns)
2642 			tbl[0].procname = NULL;
2643 	}
2644 	tbl[0].extra1 = net;
2645 
2646 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2647 	if (net->ipv4.route_hdr == NULL)
2648 		goto err_reg;
2649 	return 0;
2650 
2651 err_reg:
2652 	if (tbl != ipv4_route_flush_table)
2653 		kfree(tbl);
2654 err_dup:
2655 	return -ENOMEM;
2656 }
2657 
2658 static __net_exit void sysctl_route_net_exit(struct net *net)
2659 {
2660 	struct ctl_table *tbl;
2661 
2662 	tbl = net->ipv4.route_hdr->ctl_table_arg;
2663 	unregister_net_sysctl_table(net->ipv4.route_hdr);
2664 	BUG_ON(tbl == ipv4_route_flush_table);
2665 	kfree(tbl);
2666 }
2667 
2668 static __net_initdata struct pernet_operations sysctl_route_ops = {
2669 	.init = sysctl_route_net_init,
2670 	.exit = sysctl_route_net_exit,
2671 };
2672 #endif
2673 
2674 static __net_init int rt_genid_init(struct net *net)
2675 {
2676 	atomic_set(&net->ipv4.rt_genid, 0);
2677 	atomic_set(&net->fnhe_genid, 0);
2678 	get_random_bytes(&net->ipv4.dev_addr_genid,
2679 			 sizeof(net->ipv4.dev_addr_genid));
2680 	return 0;
2681 }
2682 
2683 static __net_initdata struct pernet_operations rt_genid_ops = {
2684 	.init = rt_genid_init,
2685 };
2686 
2687 static int __net_init ipv4_inetpeer_init(struct net *net)
2688 {
2689 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2690 
2691 	if (!bp)
2692 		return -ENOMEM;
2693 	inet_peer_base_init(bp);
2694 	net->ipv4.peers = bp;
2695 	return 0;
2696 }
2697 
2698 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2699 {
2700 	struct inet_peer_base *bp = net->ipv4.peers;
2701 
2702 	net->ipv4.peers = NULL;
2703 	inetpeer_invalidate_tree(bp);
2704 	kfree(bp);
2705 }
2706 
2707 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2708 	.init	=	ipv4_inetpeer_init,
2709 	.exit	=	ipv4_inetpeer_exit,
2710 };
2711 
2712 #ifdef CONFIG_IP_ROUTE_CLASSID
2713 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2714 #endif /* CONFIG_IP_ROUTE_CLASSID */
2715 
2716 int __init ip_rt_init(void)
2717 {
2718 	int rc = 0;
2719 
2720 #ifdef CONFIG_IP_ROUTE_CLASSID
2721 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2722 	if (!ip_rt_acct)
2723 		panic("IP: failed to allocate ip_rt_acct\n");
2724 #endif
2725 
2726 	ipv4_dst_ops.kmem_cachep =
2727 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2728 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2729 
2730 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2731 
2732 	if (dst_entries_init(&ipv4_dst_ops) < 0)
2733 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
2734 
2735 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2736 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2737 
2738 	ipv4_dst_ops.gc_thresh = ~0;
2739 	ip_rt_max_size = INT_MAX;
2740 
2741 	devinet_init();
2742 	ip_fib_init();
2743 
2744 	if (ip_rt_proc_init())
2745 		pr_err("Unable to create route proc files\n");
2746 #ifdef CONFIG_XFRM
2747 	xfrm_init();
2748 	xfrm4_init();
2749 #endif
2750 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2751 
2752 #ifdef CONFIG_SYSCTL
2753 	register_pernet_subsys(&sysctl_route_ops);
2754 #endif
2755 	register_pernet_subsys(&rt_genid_ops);
2756 	register_pernet_subsys(&ipv4_inetpeer_ops);
2757 	return rc;
2758 }
2759 
2760 #ifdef CONFIG_SYSCTL
2761 /*
2762  * We really need to sanitize the damn ipv4 init order, then all
2763  * this nonsense will go away.
2764  */
2765 void __init ip_static_sysctl_init(void)
2766 {
2767 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2768 }
2769 #endif
2770