xref: /linux/net/ipv4/route.c (revision ff5599816711d2e67da2d7561fd36ac48debd433)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD
 35  *					though our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <net/dst.h>
93 #include <net/net_namespace.h>
94 #include <net/protocol.h>
95 #include <net/ip.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
98 #include <net/sock.h>
99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #include <linux/kmemleak.h>
109 #endif
110 #include <net/secure_seq.h>
111 
112 #define RT_FL_TOS(oldflp4) \
113 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114 
115 #define IP_MAX_MTU	0xFFF0
116 
117 #define RT_GC_TIMEOUT (300*HZ)
118 
119 static int ip_rt_max_size;
120 static int ip_rt_redirect_number __read_mostly	= 9;
121 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
122 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
123 static int ip_rt_error_cost __read_mostly	= HZ;
124 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
125 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
126 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
127 static int ip_rt_min_advmss __read_mostly	= 256;
128 
129 /*
130  *	Interface to generic destination cache.
131  */
132 
133 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
134 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
135 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
136 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
137 static void		 ipv4_link_failure(struct sk_buff *skb);
138 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
139 					   struct sk_buff *skb, u32 mtu);
140 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
141 					struct sk_buff *skb);
142 static void		ipv4_dst_destroy(struct dst_entry *dst);
143 
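/* This dst_ops hook is deliberately a no-op for IPv4; there is no per-device
 * route state to tear down here.
 */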
144 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
145 			    int how)
146 {
147 }
148 
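/* IPv4 route metrics come from the FIB entry and are installed read-only
 * (see rt_set_nexthop()), so the generic dst code should never ask us to
 * copy them on write; warn if it ever does.
 */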
149 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
150 {
151 	WARN_ON(1);
152 	return NULL;
153 }
154 
155 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
156 					   struct sk_buff *skb,
157 					   const void *daddr);
158 
159 static struct dst_ops ipv4_dst_ops = {
160 	.family =		AF_INET,
161 	.protocol =		cpu_to_be16(ETH_P_IP),
162 	.check =		ipv4_dst_check,
163 	.default_advmss =	ipv4_default_advmss,
164 	.mtu =			ipv4_mtu,
165 	.cow_metrics =		ipv4_cow_metrics,
166 	.destroy =		ipv4_dst_destroy,
167 	.ifdown =		ipv4_dst_ifdown,
168 	.negative_advice =	ipv4_negative_advice,
169 	.link_failure =		ipv4_link_failure,
170 	.update_pmtu =		ip_rt_update_pmtu,
171 	.redirect =		ip_do_redirect,
172 	.local_out =		__ip_local_out,
173 	.neigh_lookup =		ipv4_neigh_lookup,
174 };
175 
176 #define ECN_OR_COST(class)	TC_PRIO_##class
177 
178 const __u8 ip_tos2prio[16] = {
179 	TC_PRIO_BESTEFFORT,
180 	ECN_OR_COST(BESTEFFORT),
181 	TC_PRIO_BESTEFFORT,
182 	ECN_OR_COST(BESTEFFORT),
183 	TC_PRIO_BULK,
184 	ECN_OR_COST(BULK),
185 	TC_PRIO_BULK,
186 	ECN_OR_COST(BULK),
187 	TC_PRIO_INTERACTIVE,
188 	ECN_OR_COST(INTERACTIVE),
189 	TC_PRIO_INTERACTIVE,
190 	ECN_OR_COST(INTERACTIVE),
191 	TC_PRIO_INTERACTIVE_BULK,
192 	ECN_OR_COST(INTERACTIVE_BULK),
193 	TC_PRIO_INTERACTIVE_BULK,
194 	ECN_OR_COST(INTERACTIVE_BULK)
195 };
196 EXPORT_SYMBOL(ip_tos2prio);
197 
198 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
200 
201 #ifdef CONFIG_PROC_FS
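/* /proc/net/rt_cache: the legacy IPv4 routing cache is gone, so this
 * sequence only emits the old header line and then terminates.
 */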
202 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203 {
204 	if (*pos)
205 		return NULL;
206 	return SEQ_START_TOKEN;
207 }
208 
209 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
210 {
211 	++*pos;
212 	return NULL;
213 }
214 
215 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216 {
217 }
218 
219 static int rt_cache_seq_show(struct seq_file *seq, void *v)
220 {
221 	if (v == SEQ_START_TOKEN)
222 		seq_printf(seq, "%-127s\n",
223 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225 			   "HHUptod\tSpecDst");
226 	return 0;
227 }
228 
229 static const struct seq_operations rt_cache_seq_ops = {
230 	.start  = rt_cache_seq_start,
231 	.next   = rt_cache_seq_next,
232 	.stop   = rt_cache_seq_stop,
233 	.show   = rt_cache_seq_show,
234 };
235 
236 static int rt_cache_seq_open(struct inode *inode, struct file *file)
237 {
238 	return seq_open(file, &rt_cache_seq_ops);
239 }
240 
241 static const struct file_operations rt_cache_seq_fops = {
242 	.owner	 = THIS_MODULE,
243 	.open	 = rt_cache_seq_open,
244 	.read	 = seq_read,
245 	.llseek	 = seq_lseek,
246 	.release = seq_release,
247 };
248 
249 
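/* /proc/net/stat/rt_cache: walk the possible CPUs and print one line of
 * per-cpu rt_cache_stat counters for each, preceded by a header line.
 */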
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252 	int cpu;
253 
254 	if (*pos == 0)
255 		return SEQ_START_TOKEN;
256 
257 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258 		if (!cpu_possible(cpu))
259 			continue;
260 		*pos = cpu+1;
261 		return &per_cpu(rt_cache_stat, cpu);
262 	}
263 	return NULL;
264 }
265 
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268 	int cpu;
269 
270 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271 		if (!cpu_possible(cpu))
272 			continue;
273 		*pos = cpu+1;
274 		return &per_cpu(rt_cache_stat, cpu);
275 	}
276 	return NULL;
277 
278 }
279 
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282 
283 }
284 
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287 	struct rt_cache_stat *st = v;
288 
289 	if (v == SEQ_START_TOKEN) {
290 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 		return 0;
292 	}
293 
294 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
295 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 		   dst_entries_get_slow(&ipv4_dst_ops),
297 		   st->in_hit,
298 		   st->in_slow_tot,
299 		   st->in_slow_mc,
300 		   st->in_no_route,
301 		   st->in_brd,
302 		   st->in_martian_dst,
303 		   st->in_martian_src,
304 
305 		   st->out_hit,
306 		   st->out_slow_tot,
307 		   st->out_slow_mc,
308 
309 		   st->gc_total,
310 		   st->gc_ignored,
311 		   st->gc_goal_miss,
312 		   st->gc_dst_overflow,
313 		   st->in_hlist_search,
314 		   st->out_hlist_search
315 		);
316 	return 0;
317 }
318 
319 static const struct seq_operations rt_cpu_seq_ops = {
320 	.start  = rt_cpu_seq_start,
321 	.next   = rt_cpu_seq_next,
322 	.stop   = rt_cpu_seq_stop,
323 	.show   = rt_cpu_seq_show,
324 };
325 
326 
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329 	return seq_open(file, &rt_cpu_seq_ops);
330 }
331 
332 static const struct file_operations rt_cpu_seq_fops = {
333 	.owner	 = THIS_MODULE,
334 	.open	 = rt_cpu_seq_open,
335 	.read	 = seq_read,
336 	.llseek	 = seq_lseek,
337 	.release = seq_release,
338 };
339 
340 #ifdef CONFIG_IP_ROUTE_CLASSID
341 static int rt_acct_proc_show(struct seq_file *m, void *v)
342 {
343 	struct ip_rt_acct *dst, *src;
344 	unsigned int i, j;
345 
346 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
347 	if (!dst)
348 		return -ENOMEM;
349 
350 	for_each_possible_cpu(i) {
351 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
352 		for (j = 0; j < 256; j++) {
353 			dst[j].o_bytes   += src[j].o_bytes;
354 			dst[j].o_packets += src[j].o_packets;
355 			dst[j].i_bytes   += src[j].i_bytes;
356 			dst[j].i_packets += src[j].i_packets;
357 		}
358 	}
359 
360 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
361 	kfree(dst);
362 	return 0;
363 }
364 
365 static int rt_acct_proc_open(struct inode *inode, struct file *file)
366 {
367 	return single_open(file, rt_acct_proc_show, NULL);
368 }
369 
370 static const struct file_operations rt_acct_proc_fops = {
371 	.owner		= THIS_MODULE,
372 	.open		= rt_acct_proc_open,
373 	.read		= seq_read,
374 	.llseek		= seq_lseek,
375 	.release	= single_release,
376 };
377 #endif
378 
379 static int __net_init ip_rt_do_proc_init(struct net *net)
380 {
381 	struct proc_dir_entry *pde;
382 
383 	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
384 			  &rt_cache_seq_fops);
385 	if (!pde)
386 		goto err1;
387 
388 	pde = proc_create("rt_cache", S_IRUGO,
389 			  net->proc_net_stat, &rt_cpu_seq_fops);
390 	if (!pde)
391 		goto err2;
392 
393 #ifdef CONFIG_IP_ROUTE_CLASSID
394 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
395 	if (!pde)
396 		goto err3;
397 #endif
398 	return 0;
399 
400 #ifdef CONFIG_IP_ROUTE_CLASSID
401 err3:
402 	remove_proc_entry("rt_cache", net->proc_net_stat);
403 #endif
404 err2:
405 	remove_proc_entry("rt_cache", net->proc_net);
406 err1:
407 	return -ENOMEM;
408 }
409 
410 static void __net_exit ip_rt_do_proc_exit(struct net *net)
411 {
412 	remove_proc_entry("rt_cache", net->proc_net_stat);
413 	remove_proc_entry("rt_cache", net->proc_net);
414 #ifdef CONFIG_IP_ROUTE_CLASSID
415 	remove_proc_entry("rt_acct", net->proc_net);
416 #endif
417 }
418 
419 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
420 	.init = ip_rt_do_proc_init,
421 	.exit = ip_rt_do_proc_exit,
422 };
423 
424 static int __init ip_rt_proc_init(void)
425 {
426 	return register_pernet_subsys(&ip_rt_proc_ops);
427 }
428 
429 #else
430 static inline int ip_rt_proc_init(void)
431 {
432 	return 0;
433 }
434 #endif /* CONFIG_PROC_FS */
435 
436 static inline bool rt_is_expired(const struct rtable *rth)
437 {
438 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
439 }
440 
441 void rt_cache_flush(struct net *net)
442 {
443 	rt_genid_bump(net);
444 }
445 
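/* Resolve the neighbour entry for a route: use the gateway when one is set,
 * otherwise the destination address from the skb or the daddr passed in.
 */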
446 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
447 					   struct sk_buff *skb,
448 					   const void *daddr)
449 {
450 	struct net_device *dev = dst->dev;
451 	const __be32 *pkey = daddr;
452 	const struct rtable *rt;
453 	struct neighbour *n;
454 
455 	rt = (const struct rtable *) dst;
456 	if (rt->rt_gateway)
457 		pkey = (const __be32 *) &rt->rt_gateway;
458 	else if (skb)
459 		pkey = &ip_hdr(skb)->daddr;
460 
461 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
462 	if (n)
463 		return n;
464 	return neigh_create(&arp_tbl, pkey, dev);
465 }
466 
467 /*
468  * Peer allocation may fail only in serious out-of-memory conditions.  However,
469  * we can still generate some output.
470  * Random ID selection looks a bit dangerous because we have no way to
471  * select an ID that is unique within a reasonable period of time.
472  * But a broken packet identifier may be better than no packet at all.
473  */
474 static void ip_select_fb_ident(struct iphdr *iph)
475 {
476 	static DEFINE_SPINLOCK(ip_fb_id_lock);
477 	static u32 ip_fallback_id;
478 	u32 salt;
479 
480 	spin_lock_bh(&ip_fb_id_lock);
481 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
482 	iph->id = htons(salt & 0xFFFF);
483 	ip_fallback_id = salt;
484 	spin_unlock_bh(&ip_fb_id_lock);
485 }
486 
487 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
488 {
489 	struct net *net = dev_net(dst->dev);
490 	struct inet_peer *peer;
491 
492 	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
493 	if (peer) {
494 		iph->id = htons(inet_getid(peer, more));
495 		inet_putpeer(peer);
496 		return;
497 	}
498 
499 	ip_select_fb_ident(iph);
500 }
501 EXPORT_SYMBOL(__ip_select_ident);
502 
503 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
504 			     const struct iphdr *iph,
505 			     int oif, u8 tos,
506 			     u8 prot, u32 mark, int flow_flags)
507 {
508 	if (sk) {
509 		const struct inet_sock *inet = inet_sk(sk);
510 
511 		oif = sk->sk_bound_dev_if;
512 		mark = sk->sk_mark;
513 		tos = RT_CONN_FLAGS(sk);
514 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
515 	}
516 	flowi4_init_output(fl4, oif, mark, tos,
517 			   RT_SCOPE_UNIVERSE, prot,
518 			   flow_flags,
519 			   iph->daddr, iph->saddr, 0, 0);
520 }
521 
522 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
523 			       const struct sock *sk)
524 {
525 	const struct iphdr *iph = ip_hdr(skb);
526 	int oif = skb->dev->ifindex;
527 	u8 tos = RT_TOS(iph->tos);
528 	u8 prot = iph->protocol;
529 	u32 mark = skb->mark;
530 
531 	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
532 }
533 
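/* Build the flow key from a connected socket's state; with a strict source
 * route option the first hop (faddr) replaces the destination.
 */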
534 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
535 {
536 	const struct inet_sock *inet = inet_sk(sk);
537 	const struct ip_options_rcu *inet_opt;
538 	__be32 daddr = inet->inet_daddr;
539 
540 	rcu_read_lock();
541 	inet_opt = rcu_dereference(inet->inet_opt);
542 	if (inet_opt && inet_opt->opt.srr)
543 		daddr = inet_opt->opt.faddr;
544 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
545 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
546 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
547 			   inet_sk_flowi_flags(sk),
548 			   daddr, inet->inet_saddr, 0, 0);
549 	rcu_read_unlock();
550 }
551 
552 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
553 				 const struct sk_buff *skb)
554 {
555 	if (skb)
556 		build_skb_flow_key(fl4, skb, sk);
557 	else
558 		build_sk_flow_key(fl4, sk);
559 }
560 
561 static inline void rt_free(struct rtable *rt)
562 {
563 	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
564 }
565 
566 static DEFINE_SPINLOCK(fnhe_lock);
567 
568 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
569 {
570 	struct rtable *rt;
571 
572 	rt = rcu_dereference(fnhe->fnhe_rth_input);
573 	if (rt) {
574 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
575 		rt_free(rt);
576 	}
577 	rt = rcu_dereference(fnhe->fnhe_rth_output);
578 	if (rt) {
579 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
580 		rt_free(rt);
581 	}
582 }
583 
584 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
585 {
586 	struct fib_nh_exception *fnhe, *oldest;
587 
588 	oldest = rcu_dereference(hash->chain);
589 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
590 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
591 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
592 			oldest = fnhe;
593 	}
594 	fnhe_flush_routes(oldest);
595 	return oldest;
596 }
597 
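/* Simple hash of the destination address into the FNHE_HASH_SIZE-bucket
 * per-nexthop exception table.
 */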
598 static inline u32 fnhe_hashfun(__be32 daddr)
599 {
600 	u32 hval;
601 
602 	hval = (__force u32) daddr;
603 	hval ^= (hval >> 11) ^ (hval >> 22);
604 
605 	return hval & (FNHE_HASH_SIZE - 1);
606 }
607 
608 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
609 {
610 	rt->rt_pmtu = fnhe->fnhe_pmtu;
611 	rt->dst.expires = fnhe->fnhe_expires;
612 
613 	if (fnhe->fnhe_gw) {
614 		rt->rt_flags |= RTCF_REDIRECTED;
615 		rt->rt_gateway = fnhe->fnhe_gw;
616 		rt->rt_uses_gateway = 1;
617 	}
618 }
619 
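/* Record a per-destination exception (a learned redirect gateway and/or a
 * discovered PMTU with its expiry) on the FIB nexthop.  Existing entries are
 * updated in place; otherwise a new entry is hashed in, recycling the oldest
 * one when the chain grows too deep, and any routes already cached on the
 * nexthop are marked obsolete so they re-check the exception.
 */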
620 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
621 				  u32 pmtu, unsigned long expires)
622 {
623 	struct fnhe_hash_bucket *hash;
624 	struct fib_nh_exception *fnhe;
625 	struct rtable *rt;
626 	unsigned int i;
627 	int depth;
628 	u32 hval = fnhe_hashfun(daddr);
629 
630 	spin_lock_bh(&fnhe_lock);
631 
632 	hash = nh->nh_exceptions;
633 	if (!hash) {
634 		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
635 		if (!hash)
636 			goto out_unlock;
637 		nh->nh_exceptions = hash;
638 	}
639 
640 	hash += hval;
641 
642 	depth = 0;
643 	for (fnhe = rcu_dereference(hash->chain); fnhe;
644 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
645 		if (fnhe->fnhe_daddr == daddr)
646 			break;
647 		depth++;
648 	}
649 
650 	if (fnhe) {
651 		if (gw)
652 			fnhe->fnhe_gw = gw;
653 		if (pmtu) {
654 			fnhe->fnhe_pmtu = pmtu;
655 			fnhe->fnhe_expires = max(1UL, expires);
656 		}
657 		/* Update all cached dsts too */
658 		rt = rcu_dereference(fnhe->fnhe_rth_input);
659 		if (rt)
660 			fill_route_from_fnhe(rt, fnhe);
661 		rt = rcu_dereference(fnhe->fnhe_rth_output);
662 		if (rt)
663 			fill_route_from_fnhe(rt, fnhe);
664 	} else {
665 		if (depth > FNHE_RECLAIM_DEPTH)
666 			fnhe = fnhe_oldest(hash);
667 		else {
668 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
669 			if (!fnhe)
670 				goto out_unlock;
671 
672 			fnhe->fnhe_next = hash->chain;
673 			rcu_assign_pointer(hash->chain, fnhe);
674 		}
675 		fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
676 		fnhe->fnhe_daddr = daddr;
677 		fnhe->fnhe_gw = gw;
678 		fnhe->fnhe_pmtu = pmtu;
679 		fnhe->fnhe_expires = expires;
680 
681 		/* Exception created; mark the cached routes for the nexthop
682 		 * stale, so anyone caching it rechecks if this exception
683 		 * applies to them.
684 		 */
685 		rt = rcu_dereference(nh->nh_rth_input);
686 		if (rt)
687 			rt->dst.obsolete = DST_OBSOLETE_KILL;
688 
689 		for_each_possible_cpu(i) {
690 			struct rtable __rcu **prt;
691 			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
692 			rt = rcu_dereference(*prt);
693 			if (rt)
694 				rt->dst.obsolete = DST_OBSOLETE_KILL;
695 		}
696 	}
697 
698 	fnhe->fnhe_stamp = jiffies;
699 
700 out_unlock:
701 	spin_unlock_bh(&fnhe_lock);
702 	return;
703 }
704 
705 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
706 			     bool kill_route)
707 {
708 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
709 	__be32 old_gw = ip_hdr(skb)->saddr;
710 	struct net_device *dev = skb->dev;
711 	struct in_device *in_dev;
712 	struct fib_result res;
713 	struct neighbour *n;
714 	struct net *net;
715 
716 	switch (icmp_hdr(skb)->code & 7) {
717 	case ICMP_REDIR_NET:
718 	case ICMP_REDIR_NETTOS:
719 	case ICMP_REDIR_HOST:
720 	case ICMP_REDIR_HOSTTOS:
721 		break;
722 
723 	default:
724 		return;
725 	}
726 
727 	if (rt->rt_gateway != old_gw)
728 		return;
729 
730 	in_dev = __in_dev_get_rcu(dev);
731 	if (!in_dev)
732 		return;
733 
734 	net = dev_net(dev);
735 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
736 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
737 	    ipv4_is_zeronet(new_gw))
738 		goto reject_redirect;
739 
740 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
741 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
742 			goto reject_redirect;
743 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
744 			goto reject_redirect;
745 	} else {
746 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
747 			goto reject_redirect;
748 	}
749 
750 	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
751 	if (n) {
752 		if (!(n->nud_state & NUD_VALID)) {
753 			neigh_event_send(n, NULL);
754 		} else {
755 			if (fib_lookup(net, fl4, &res) == 0) {
756 				struct fib_nh *nh = &FIB_RES_NH(res);
757 
758 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
759 						      0, 0);
760 			}
761 			if (kill_route)
762 				rt->dst.obsolete = DST_OBSOLETE_KILL;
763 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
764 		}
765 		neigh_release(n);
766 	}
767 	return;
768 
769 reject_redirect:
770 #ifdef CONFIG_IP_ROUTE_VERBOSE
771 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
772 		const struct iphdr *iph = (const struct iphdr *) skb->data;
773 		__be32 daddr = iph->daddr;
774 		__be32 saddr = iph->saddr;
775 
776 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
777 				     "  Advised path = %pI4 -> %pI4\n",
778 				     &old_gw, dev->name, &new_gw,
779 				     &saddr, &daddr);
780 	}
781 #endif
782 	;
783 }
784 
785 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
786 {
787 	struct rtable *rt;
788 	struct flowi4 fl4;
789 	const struct iphdr *iph = (const struct iphdr *) skb->data;
790 	int oif = skb->dev->ifindex;
791 	u8 tos = RT_TOS(iph->tos);
792 	u8 prot = iph->protocol;
793 	u32 mark = skb->mark;
794 
795 	rt = (struct rtable *) dst;
796 
797 	__build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
798 	__ip_do_redirect(rt, skb, &fl4, true);
799 }
800 
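/* dst_ops->negative_advice: drop our reference and return NULL when the
 * entry is obsolete, was created by a redirect, or carries an expiring PMTU,
 * so the caller performs a fresh route lookup.
 */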
801 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
802 {
803 	struct rtable *rt = (struct rtable *)dst;
804 	struct dst_entry *ret = dst;
805 
806 	if (rt) {
807 		if (dst->obsolete > 0) {
808 			ip_rt_put(rt);
809 			ret = NULL;
810 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
811 			   rt->dst.expires) {
812 			ip_rt_put(rt);
813 			ret = NULL;
814 		}
815 	}
816 	return ret;
817 }
818 
819 /*
820  * Algorithm:
821  *	1. The first ip_rt_redirect_number redirects are sent
822  *	   with exponential backoff, after which we stop sending them entirely,
823  *	   assuming that the host ignores our redirects.
824  *	2. If we did not see packets requiring redirects
825  *	   during ip_rt_redirect_silence, we assume that the host has
826  *	   forgotten the redirected route and start sending redirects again.
827  *
828  * This algorithm is much cheaper and more intelligent than dumb load limiting
829  * in icmp.c.
830  *
831  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
832  * and "frag. need" (breaks PMTU discovery) in icmp.c.
833  */
834 
835 void ip_rt_send_redirect(struct sk_buff *skb)
836 {
837 	struct rtable *rt = skb_rtable(skb);
838 	struct in_device *in_dev;
839 	struct inet_peer *peer;
840 	struct net *net;
841 	int log_martians;
842 
843 	rcu_read_lock();
844 	in_dev = __in_dev_get_rcu(rt->dst.dev);
845 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
846 		rcu_read_unlock();
847 		return;
848 	}
849 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
850 	rcu_read_unlock();
851 
852 	net = dev_net(rt->dst.dev);
853 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
854 	if (!peer) {
855 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
856 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
857 		return;
858 	}
859 
860 	/* No redirected packets during ip_rt_redirect_silence;
861 	 * reset the algorithm.
862 	 */
863 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
864 		peer->rate_tokens = 0;
865 
866 	/* Too many ignored redirects; do not send anything and
867 	 * set peer->rate_last to the last seen redirected packet.
868 	 */
869 	if (peer->rate_tokens >= ip_rt_redirect_number) {
870 		peer->rate_last = jiffies;
871 		goto out_put_peer;
872 	}
873 
874 	/* Check for load limit; set rate_last to the latest sent
875 	 * redirect.
876 	 */
877 	if (peer->rate_tokens == 0 ||
878 	    time_after(jiffies,
879 		       (peer->rate_last +
880 			(ip_rt_redirect_load << peer->rate_tokens)))) {
881 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
882 
883 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
884 		peer->rate_last = jiffies;
885 		++peer->rate_tokens;
886 #ifdef CONFIG_IP_ROUTE_VERBOSE
887 		if (log_martians &&
888 		    peer->rate_tokens == ip_rt_redirect_number)
889 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
890 					     &ip_hdr(skb)->saddr, inet_iif(skb),
891 					     &ip_hdr(skb)->daddr, &gw);
892 #endif
893 	}
894 out_put_peer:
895 	inet_putpeer(peer);
896 }
897 
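/* Input-path error handler: translate the route's dst.error into an ICMP
 * destination-unreachable code and send it, token-bucket rate limited per
 * source address via the inet_peer cache, then drop the packet.
 */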
898 static int ip_error(struct sk_buff *skb)
899 {
900 	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
901 	struct rtable *rt = skb_rtable(skb);
902 	struct inet_peer *peer;
903 	unsigned long now;
904 	struct net *net;
905 	bool send;
906 	int code;
907 
908 	net = dev_net(rt->dst.dev);
909 	if (!IN_DEV_FORWARD(in_dev)) {
910 		switch (rt->dst.error) {
911 		case EHOSTUNREACH:
912 			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
913 			break;
914 
915 		case ENETUNREACH:
916 			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
917 			break;
918 		}
919 		goto out;
920 	}
921 
922 	switch (rt->dst.error) {
923 	case EINVAL:
924 	default:
925 		goto out;
926 	case EHOSTUNREACH:
927 		code = ICMP_HOST_UNREACH;
928 		break;
929 	case ENETUNREACH:
930 		code = ICMP_NET_UNREACH;
931 		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
932 		break;
933 	case EACCES:
934 		code = ICMP_PKT_FILTERED;
935 		break;
936 	}
937 
938 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
939 
940 	send = true;
941 	if (peer) {
942 		now = jiffies;
943 		peer->rate_tokens += now - peer->rate_last;
944 		if (peer->rate_tokens > ip_rt_error_burst)
945 			peer->rate_tokens = ip_rt_error_burst;
946 		peer->rate_last = now;
947 		if (peer->rate_tokens >= ip_rt_error_cost)
948 			peer->rate_tokens -= ip_rt_error_cost;
949 		else
950 			send = false;
951 		inet_putpeer(peer);
952 	}
953 	if (send)
954 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
955 
956 out:	kfree_skb(skb);
957 	return 0;
958 }
959 
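/* Record a learned path MTU for this flow as a nexthop exception, unless the
 * route's MTU metric is locked, the new value exceeds the device MTU, or the
 * same PMTU is already cached and not close to expiring.  Values below
 * ip_rt_min_pmtu are clamped up to it.
 */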
960 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
961 {
962 	struct dst_entry *dst = &rt->dst;
963 	struct fib_result res;
964 
965 	if (dst_metric_locked(dst, RTAX_MTU))
966 		return;
967 
968 	if (dst->dev->mtu < mtu)
969 		return;
970 
971 	if (mtu < ip_rt_min_pmtu)
972 		mtu = ip_rt_min_pmtu;
973 
974 	if (rt->rt_pmtu == mtu &&
975 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
976 		return;
977 
978 	rcu_read_lock();
979 	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
980 		struct fib_nh *nh = &FIB_RES_NH(res);
981 
982 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
983 				      jiffies + ip_rt_mtu_expires);
984 	}
985 	rcu_read_unlock();
986 }
987 
988 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
989 			      struct sk_buff *skb, u32 mtu)
990 {
991 	struct rtable *rt = (struct rtable *) dst;
992 	struct flowi4 fl4;
993 
994 	ip_rt_build_flow_key(&fl4, sk, skb);
995 	__ip_rt_update_pmtu(rt, &fl4, mtu);
996 }
997 
998 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
999 		      int oif, u32 mark, u8 protocol, int flow_flags)
1000 {
1001 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1002 	struct flowi4 fl4;
1003 	struct rtable *rt;
1004 
1005 	__build_flow_key(&fl4, NULL, iph, oif,
1006 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1007 	rt = __ip_route_output_key(net, &fl4);
1008 	if (!IS_ERR(rt)) {
1009 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1010 		ip_rt_put(rt);
1011 	}
1012 }
1013 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1014 
1015 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1016 {
1017 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1018 	struct flowi4 fl4;
1019 	struct rtable *rt;
1020 
1021 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1022 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1023 	if (!IS_ERR(rt)) {
1024 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1025 		ip_rt_put(rt);
1026 	}
1027 }
1028 
1029 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1030 {
1031 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1032 	struct flowi4 fl4;
1033 	struct rtable *rt;
1034 	struct dst_entry *dst;
1035 	bool new = false;
1036 
1037 	bh_lock_sock(sk);
1038 	rt = (struct rtable *) __sk_dst_get(sk);
1039 
1040 	if (sock_owned_by_user(sk) || !rt) {
1041 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1042 		goto out;
1043 	}
1044 
1045 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1046 
1047 	if (!__sk_dst_check(sk, 0)) {
1048 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1049 		if (IS_ERR(rt))
1050 			goto out;
1051 
1052 		new = true;
1053 	}
1054 
1055 	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1056 
1057 	dst = dst_check(&rt->dst, 0);
1058 	if (!dst) {
1059 		if (new)
1060 			dst_release(&rt->dst);
1061 
1062 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1063 		if (IS_ERR(rt))
1064 			goto out;
1065 
1066 		new = true;
1067 	}
1068 
1069 	if (new)
1070 		__sk_dst_set(sk, &rt->dst);
1071 
1072 out:
1073 	bh_unlock_sock(sk);
1074 }
1075 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1076 
1077 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1078 		   int oif, u32 mark, u8 protocol, int flow_flags)
1079 {
1080 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1081 	struct flowi4 fl4;
1082 	struct rtable *rt;
1083 
1084 	__build_flow_key(&fl4, NULL, iph, oif,
1085 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1086 	rt = __ip_route_output_key(net, &fl4);
1087 	if (!IS_ERR(rt)) {
1088 		__ip_do_redirect(rt, skb, &fl4, false);
1089 		ip_rt_put(rt);
1090 	}
1091 }
1092 EXPORT_SYMBOL_GPL(ipv4_redirect);
1093 
1094 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1095 {
1096 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1097 	struct flowi4 fl4;
1098 	struct rtable *rt;
1099 
1100 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1101 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1102 	if (!IS_ERR(rt)) {
1103 		__ip_do_redirect(rt, skb, &fl4, false);
1104 		ip_rt_put(rt);
1105 	}
1106 }
1107 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1108 
1109 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1110 {
1111 	struct rtable *rt = (struct rtable *) dst;
1112 
1113 	/* All IPv4 dsts are created with ->obsolete set to the value
1114 	 * DST_OBSOLETE_FORCE_CHK, which forces validation calls down
1115 	 * into this function always.
1116 	 *
1117 	 * When a PMTU/redirect information update invalidates a route,
1118 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1119 	 * DST_OBSOLETE_DEAD by dst_free().
1120 	 */
1121 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1122 		return NULL;
1123 	return dst;
1124 }
1125 
1126 static void ipv4_link_failure(struct sk_buff *skb)
1127 {
1128 	struct rtable *rt;
1129 
1130 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1131 
1132 	rt = skb_rtable(skb);
1133 	if (rt)
1134 		dst_set_expires(&rt->dst, 0);
1135 }
1136 
1137 static int ip_rt_bug(struct sk_buff *skb)
1138 {
1139 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1140 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1141 		 skb->dev ? skb->dev->name : "?");
1142 	kfree_skb(skb);
1143 	WARN_ON(1);
1144 	return 0;
1145 }
1146 
1147 /*
1148    We do not cache the source address of the outgoing interface,
1149    because it is used only by the IP RR, TS and SRR options,
1150    so it is out of the fast path.
1151 
1152    BTW remember: "addr" is allowed to be unaligned
1153    in IP options!
1154  */
1155 
1156 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1157 {
1158 	__be32 src;
1159 
1160 	if (rt_is_output_route(rt))
1161 		src = ip_hdr(skb)->saddr;
1162 	else {
1163 		struct fib_result res;
1164 		struct flowi4 fl4;
1165 		struct iphdr *iph;
1166 
1167 		iph = ip_hdr(skb);
1168 
1169 		memset(&fl4, 0, sizeof(fl4));
1170 		fl4.daddr = iph->daddr;
1171 		fl4.saddr = iph->saddr;
1172 		fl4.flowi4_tos = RT_TOS(iph->tos);
1173 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1174 		fl4.flowi4_iif = skb->dev->ifindex;
1175 		fl4.flowi4_mark = skb->mark;
1176 
1177 		rcu_read_lock();
1178 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1179 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1180 		else
1181 			src = inet_select_addr(rt->dst.dev,
1182 					       rt_nexthop(rt, iph->daddr),
1183 					       RT_SCOPE_UNIVERSE);
1184 		rcu_read_unlock();
1185 	}
1186 	memcpy(addr, &src, 4);
1187 }
1188 
1189 #ifdef CONFIG_IP_ROUTE_CLASSID
1190 static void set_class_tag(struct rtable *rt, u32 tag)
1191 {
1192 	if (!(rt->dst.tclassid & 0xFFFF))
1193 		rt->dst.tclassid |= tag & 0xFFFF;
1194 	if (!(rt->dst.tclassid & 0xFFFF0000))
1195 		rt->dst.tclassid |= tag & 0xFFFF0000;
1196 }
1197 #endif
1198 
1199 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1200 {
1201 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1202 
1203 	if (advmss == 0) {
1204 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1205 			       ip_rt_min_advmss);
1206 		if (advmss > 65535 - 40)
1207 			advmss = 65535 - 40;
1208 	}
1209 	return advmss;
1210 }
1211 
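/* Effective MTU for a route: a cached, unexpired PMTU wins, then the RTAX_MTU
 * metric, then the device MTU (clamped to 576 for locked routes that use a
 * gateway, and to IP_MAX_MTU overall).
 */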
1212 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1213 {
1214 	const struct rtable *rt = (const struct rtable *) dst;
1215 	unsigned int mtu = rt->rt_pmtu;
1216 
1217 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1218 		mtu = dst_metric_raw(dst, RTAX_MTU);
1219 
1220 	if (mtu)
1221 		return mtu;
1222 
1223 	mtu = dst->dev->mtu;
1224 
1225 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1226 		if (rt->rt_uses_gateway && mtu > 576)
1227 			mtu = 576;
1228 	}
1229 
1230 	if (mtu > IP_MAX_MTU)
1231 		mtu = IP_MAX_MTU;
1232 
1233 	return mtu;
1234 }
1235 
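/* Look up a previously recorded PMTU/redirect exception for daddr on this
 * nexthop; runs under RCU and returns NULL when there is none.
 */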
1236 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1237 {
1238 	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1239 	struct fib_nh_exception *fnhe;
1240 	u32 hval;
1241 
1242 	if (!hash)
1243 		return NULL;
1244 
1245 	hval = fnhe_hashfun(daddr);
1246 
1247 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1248 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1249 		if (fnhe->fnhe_daddr == daddr)
1250 			return fnhe;
1251 	}
1252 	return NULL;
1253 }
1254 
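/* Bind a freshly built route to its exception entry: copy the exception's
 * PMTU/gateway into the route and, unless the route is marked DST_NOCACHE,
 * cache it in the exception's input or output slot.  A stale genid wipes the
 * exception first.  Returns true only if the route was actually cached.
 */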
1255 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1256 			      __be32 daddr)
1257 {
1258 	bool ret = false;
1259 
1260 	spin_lock_bh(&fnhe_lock);
1261 
1262 	if (daddr == fnhe->fnhe_daddr) {
1263 		struct rtable __rcu **porig;
1264 		struct rtable *orig;
1265 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1266 
1267 		if (rt_is_input_route(rt))
1268 			porig = &fnhe->fnhe_rth_input;
1269 		else
1270 			porig = &fnhe->fnhe_rth_output;
1271 		orig = rcu_dereference(*porig);
1272 
1273 		if (fnhe->fnhe_genid != genid) {
1274 			fnhe->fnhe_genid = genid;
1275 			fnhe->fnhe_gw = 0;
1276 			fnhe->fnhe_pmtu = 0;
1277 			fnhe->fnhe_expires = 0;
1278 			fnhe_flush_routes(fnhe);
1279 			orig = NULL;
1280 		}
1281 		fill_route_from_fnhe(rt, fnhe);
1282 		if (!rt->rt_gateway)
1283 			rt->rt_gateway = daddr;
1284 
1285 		if (!(rt->dst.flags & DST_NOCACHE)) {
1286 			rcu_assign_pointer(*porig, rt);
1287 			if (orig)
1288 				rt_free(orig);
1289 			ret = true;
1290 		}
1291 
1292 		fnhe->fnhe_stamp = jiffies;
1293 	}
1294 	spin_unlock_bh(&fnhe_lock);
1295 
1296 	return ret;
1297 }
1298 
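/* Try to cache rt on the nexthop itself: the single input slot for input
 * routes, or this CPU's output slot otherwise.  The old entry is freed on
 * success; returns false if another CPU won the cmpxchg race.
 */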
1299 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1300 {
1301 	struct rtable *orig, *prev, **p;
1302 	bool ret = true;
1303 
1304 	if (rt_is_input_route(rt)) {
1305 		p = (struct rtable **)&nh->nh_rth_input;
1306 	} else {
1307 		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1308 	}
1309 	orig = *p;
1310 
1311 	prev = cmpxchg(p, orig, rt);
1312 	if (prev == orig) {
1313 		if (orig)
1314 			rt_free(orig);
1315 	} else
1316 		ret = false;
1317 
1318 	return ret;
1319 }
1320 
1321 static DEFINE_SPINLOCK(rt_uncached_lock);
1322 static LIST_HEAD(rt_uncached_list);
1323 
1324 static void rt_add_uncached_list(struct rtable *rt)
1325 {
1326 	spin_lock_bh(&rt_uncached_lock);
1327 	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1328 	spin_unlock_bh(&rt_uncached_lock);
1329 }
1330 
1331 static void ipv4_dst_destroy(struct dst_entry *dst)
1332 {
1333 	struct rtable *rt = (struct rtable *) dst;
1334 
1335 	if (!list_empty(&rt->rt_uncached)) {
1336 		spin_lock_bh(&rt_uncached_lock);
1337 		list_del(&rt->rt_uncached);
1338 		spin_unlock_bh(&rt_uncached_lock);
1339 	}
1340 }
1341 
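/* Called on device unregistration: any uncached routes still pointing at dev
 * are repointed at the loopback device so the device's refcount can drop.
 */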
1342 void rt_flush_dev(struct net_device *dev)
1343 {
1344 	if (!list_empty(&rt_uncached_list)) {
1345 		struct net *net = dev_net(dev);
1346 		struct rtable *rt;
1347 
1348 		spin_lock_bh(&rt_uncached_lock);
1349 		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1350 			if (rt->dst.dev != dev)
1351 				continue;
1352 			rt->dst.dev = net->loopback_dev;
1353 			dev_hold(rt->dst.dev);
1354 			dev_put(dev);
1355 		}
1356 		spin_unlock_bh(&rt_uncached_lock);
1357 	}
1358 }
1359 
1360 static bool rt_cache_valid(const struct rtable *rt)
1361 {
1362 	return	rt &&
1363 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1364 		!rt_is_expired(rt);
1365 }
1366 
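/* Finish initialising a new route from the FIB result: take the gateway,
 * metrics and classid from the nexthop, then try to cache the route either in
 * a matching exception entry or on the nexthop itself; routes that cannot be
 * cached are flagged DST_NOCACHE and tracked on the uncached list.
 */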
1367 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1368 			   const struct fib_result *res,
1369 			   struct fib_nh_exception *fnhe,
1370 			   struct fib_info *fi, u16 type, u32 itag)
1371 {
1372 	bool cached = false;
1373 
1374 	if (fi) {
1375 		struct fib_nh *nh = &FIB_RES_NH(*res);
1376 
1377 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1378 			rt->rt_gateway = nh->nh_gw;
1379 			rt->rt_uses_gateway = 1;
1380 		}
1381 		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1382 #ifdef CONFIG_IP_ROUTE_CLASSID
1383 		rt->dst.tclassid = nh->nh_tclassid;
1384 #endif
1385 		if (unlikely(fnhe))
1386 			cached = rt_bind_exception(rt, fnhe, daddr);
1387 		else if (!(rt->dst.flags & DST_NOCACHE))
1388 			cached = rt_cache_route(nh, rt);
1389 		if (unlikely(!cached)) {
1390 			/* Routes we intend to cache in nexthop exception or
1391 			 * FIB nexthop have the DST_NOCACHE bit clear.
1392 			 * However, if we are unsuccessful at storing this
1393 			 * route into the cache we really need to set it.
1394 			 */
1395 			rt->dst.flags |= DST_NOCACHE;
1396 			if (!rt->rt_gateway)
1397 				rt->rt_gateway = daddr;
1398 			rt_add_uncached_list(rt);
1399 		}
1400 	} else
1401 		rt_add_uncached_list(rt);
1402 
1403 #ifdef CONFIG_IP_ROUTE_CLASSID
1404 #ifdef CONFIG_IP_MULTIPLE_TABLES
1405 	set_class_tag(rt, res->tclassid);
1406 #endif
1407 	set_class_tag(rt, itag);
1408 #endif
1409 }
1410 
1411 static struct rtable *rt_dst_alloc(struct net_device *dev,
1412 				   bool nopolicy, bool noxfrm, bool will_cache)
1413 {
1414 	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1415 			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1416 			 (nopolicy ? DST_NOPOLICY : 0) |
1417 			 (noxfrm ? DST_NOXFRM : 0));
1418 }
1419 
1420 /* called in rcu_read_lock() section */
1421 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1422 				u8 tos, struct net_device *dev, int our)
1423 {
1424 	struct rtable *rth;
1425 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1426 	u32 itag = 0;
1427 	int err;
1428 
1429 	/* Primary sanity checks. */
1430 
1431 	if (in_dev == NULL)
1432 		return -EINVAL;
1433 
1434 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1435 	    skb->protocol != htons(ETH_P_IP))
1436 		goto e_inval;
1437 
1438 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1439 		if (ipv4_is_loopback(saddr))
1440 			goto e_inval;
1441 
1442 	if (ipv4_is_zeronet(saddr)) {
1443 		if (!ipv4_is_local_multicast(daddr))
1444 			goto e_inval;
1445 	} else {
1446 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1447 					  in_dev, &itag);
1448 		if (err < 0)
1449 			goto e_err;
1450 	}
1451 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1452 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1453 	if (!rth)
1454 		goto e_nobufs;
1455 
1456 #ifdef CONFIG_IP_ROUTE_CLASSID
1457 	rth->dst.tclassid = itag;
1458 #endif
1459 	rth->dst.output = ip_rt_bug;
1460 
1461 	rth->rt_genid	= rt_genid(dev_net(dev));
1462 	rth->rt_flags	= RTCF_MULTICAST;
1463 	rth->rt_type	= RTN_MULTICAST;
1464 	rth->rt_is_input= 1;
1465 	rth->rt_iif	= 0;
1466 	rth->rt_pmtu	= 0;
1467 	rth->rt_gateway	= 0;
1468 	rth->rt_uses_gateway = 0;
1469 	INIT_LIST_HEAD(&rth->rt_uncached);
1470 	if (our) {
1471 		rth->dst.input= ip_local_deliver;
1472 		rth->rt_flags |= RTCF_LOCAL;
1473 	}
1474 
1475 #ifdef CONFIG_IP_MROUTE
1476 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1477 		rth->dst.input = ip_mr_input;
1478 #endif
1479 	RT_CACHE_STAT_INC(in_slow_mc);
1480 
1481 	skb_dst_set(skb, &rth->dst);
1482 	return 0;
1483 
1484 e_nobufs:
1485 	return -ENOBUFS;
1486 e_inval:
1487 	return -EINVAL;
1488 e_err:
1489 	return err;
1490 }
1491 
1492 
1493 static void ip_handle_martian_source(struct net_device *dev,
1494 				     struct in_device *in_dev,
1495 				     struct sk_buff *skb,
1496 				     __be32 daddr,
1497 				     __be32 saddr)
1498 {
1499 	RT_CACHE_STAT_INC(in_martian_src);
1500 #ifdef CONFIG_IP_ROUTE_VERBOSE
1501 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1502 		/*
1503 		 *	RFC 1812 recommendation: if the source is martian,
1504 		 *	the only hint is the MAC header.
1505 		 */
1506 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1507 			&daddr, &saddr, dev->name);
1508 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1509 			print_hex_dump(KERN_WARNING, "ll header: ",
1510 				       DUMP_PREFIX_OFFSET, 16, 1,
1511 				       skb_mac_header(skb),
1512 				       dev->hard_header_len, true);
1513 		}
1514 	}
1515 #endif
1516 }
1517 
1518 /* called in rcu_read_lock() section */
1519 static int __mkroute_input(struct sk_buff *skb,
1520 			   const struct fib_result *res,
1521 			   struct in_device *in_dev,
1522 			   __be32 daddr, __be32 saddr, u32 tos)
1523 {
1524 	struct fib_nh_exception *fnhe;
1525 	struct rtable *rth;
1526 	int err;
1527 	struct in_device *out_dev;
1528 	unsigned int flags = 0;
1529 	bool do_cache;
1530 	u32 itag;
1531 
1532 	/* get a working reference to the output device */
1533 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1534 	if (out_dev == NULL) {
1535 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1536 		return -EINVAL;
1537 	}
1538 
1539 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1540 				  in_dev->dev, in_dev, &itag);
1541 	if (err < 0) {
1542 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1543 					 saddr);
1544 
1545 		goto cleanup;
1546 	}
1547 
1548 	do_cache = res->fi && !itag;
1549 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1550 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1551 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1552 		flags |= RTCF_DOREDIRECT;
1553 		do_cache = false;
1554 	}
1555 
1556 	if (skb->protocol != htons(ETH_P_IP)) {
1557 		/* Not IP (i.e. ARP). Do not create a route if it is
1558 		 * invalid for proxy ARP. DNAT routes are always valid.
1559 		 *
1560 		 * The proxy ARP feature has been extended to allow ARP
1561 		 * replies back on the same interface, to support
1562 		 * private VLAN switch technologies. See arp.c.
1563 		 */
1564 		if (out_dev == in_dev &&
1565 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1566 			err = -EINVAL;
1567 			goto cleanup;
1568 		}
1569 	}
1570 
1571 	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1572 	if (do_cache) {
1573 		if (fnhe != NULL)
1574 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1575 		else
1576 			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1577 
1578 		if (rt_cache_valid(rth)) {
1579 			skb_dst_set_noref(skb, &rth->dst);
1580 			goto out;
1581 		}
1582 	}
1583 
1584 	rth = rt_dst_alloc(out_dev->dev,
1585 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1586 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1587 	if (!rth) {
1588 		err = -ENOBUFS;
1589 		goto cleanup;
1590 	}
1591 
1592 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1593 	rth->rt_flags = flags;
1594 	rth->rt_type = res->type;
1595 	rth->rt_is_input = 1;
1596 	rth->rt_iif 	= 0;
1597 	rth->rt_pmtu	= 0;
1598 	rth->rt_gateway	= 0;
1599 	rth->rt_uses_gateway = 0;
1600 	INIT_LIST_HEAD(&rth->rt_uncached);
1601 
1602 	rth->dst.input = ip_forward;
1603 	rth->dst.output = ip_output;
1604 
1605 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1606 	skb_dst_set(skb, &rth->dst);
1607 out:
1608 	err = 0;
1609  cleanup:
1610 	return err;
1611 }
1612 
1613 static int ip_mkroute_input(struct sk_buff *skb,
1614 			    struct fib_result *res,
1615 			    const struct flowi4 *fl4,
1616 			    struct in_device *in_dev,
1617 			    __be32 daddr, __be32 saddr, u32 tos)
1618 {
1619 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1620 	if (res->fi && res->fi->fib_nhs > 1)
1621 		fib_select_multipath(res);
1622 #endif
1623 
1624 	/* create a routing cache entry */
1625 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1626 }
1627 
1628 /*
1629  *	NOTE. We drop all packets that have local source
1630  *	addresses, because every properly looped-back packet
1631  *	must already have the correct destination attached by the output routine.
1632  *
1633  *	This approach solves two big problems:
1634  *	1. Non-simplex devices are handled properly.
1635  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1636  *	Called with rcu_read_lock().
1637  */
1638 
1639 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1640 			       u8 tos, struct net_device *dev)
1641 {
1642 	struct fib_result res;
1643 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1644 	struct flowi4	fl4;
1645 	unsigned int	flags = 0;
1646 	u32		itag = 0;
1647 	struct rtable	*rth;
1648 	int		err = -EINVAL;
1649 	struct net    *net = dev_net(dev);
1650 	bool do_cache;
1651 
1652 	/* IP on this device is disabled. */
1653 
1654 	if (!in_dev)
1655 		goto out;
1656 
1657 	/* Check for the weirdest martians, which cannot be detected
1658 	   by fib_lookup.
1659 	 */
1660 
1661 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1662 		goto martian_source;
1663 
1664 	res.fi = NULL;
1665 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1666 		goto brd_input;
1667 
1668 	/* Accept zero addresses only to limited broadcast;
1669 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
1670 	 */
1671 	if (ipv4_is_zeronet(saddr))
1672 		goto martian_source;
1673 
1674 	if (ipv4_is_zeronet(daddr))
1675 		goto martian_destination;
1676 
1677 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1678 	 * and calls it at most once when daddr and/or saddr are loopback addresses.
1679 	 */
1680 	if (ipv4_is_loopback(daddr)) {
1681 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1682 			goto martian_destination;
1683 	} else if (ipv4_is_loopback(saddr)) {
1684 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1685 			goto martian_source;
1686 	}
1687 
1688 	/*
1689 	 *	Now we are ready to route packet.
1690 	 */
1691 	fl4.flowi4_oif = 0;
1692 	fl4.flowi4_iif = dev->ifindex;
1693 	fl4.flowi4_mark = skb->mark;
1694 	fl4.flowi4_tos = tos;
1695 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1696 	fl4.daddr = daddr;
1697 	fl4.saddr = saddr;
1698 	err = fib_lookup(net, &fl4, &res);
1699 	if (err != 0)
1700 		goto no_route;
1701 
1702 	RT_CACHE_STAT_INC(in_slow_tot);
1703 
1704 	if (res.type == RTN_BROADCAST)
1705 		goto brd_input;
1706 
1707 	if (res.type == RTN_LOCAL) {
1708 		err = fib_validate_source(skb, saddr, daddr, tos,
1709 					  LOOPBACK_IFINDEX,
1710 					  dev, in_dev, &itag);
1711 		if (err < 0)
1712 			goto martian_source_keep_err;
1713 		goto local_input;
1714 	}
1715 
1716 	if (!IN_DEV_FORWARD(in_dev))
1717 		goto no_route;
1718 	if (res.type != RTN_UNICAST)
1719 		goto martian_destination;
1720 
1721 	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1722 out:	return err;
1723 
1724 brd_input:
1725 	if (skb->protocol != htons(ETH_P_IP))
1726 		goto e_inval;
1727 
1728 	if (!ipv4_is_zeronet(saddr)) {
1729 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1730 					  in_dev, &itag);
1731 		if (err < 0)
1732 			goto martian_source_keep_err;
1733 	}
1734 	flags |= RTCF_BROADCAST;
1735 	res.type = RTN_BROADCAST;
1736 	RT_CACHE_STAT_INC(in_brd);
1737 
1738 local_input:
1739 	do_cache = false;
1740 	if (res.fi) {
1741 		if (!itag) {
1742 			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1743 			if (rt_cache_valid(rth)) {
1744 				skb_dst_set_noref(skb, &rth->dst);
1745 				err = 0;
1746 				goto out;
1747 			}
1748 			do_cache = true;
1749 		}
1750 	}
1751 
1752 	rth = rt_dst_alloc(net->loopback_dev,
1753 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1754 	if (!rth)
1755 		goto e_nobufs;
1756 
1757 	rth->dst.input= ip_local_deliver;
1758 	rth->dst.output= ip_rt_bug;
1759 #ifdef CONFIG_IP_ROUTE_CLASSID
1760 	rth->dst.tclassid = itag;
1761 #endif
1762 
1763 	rth->rt_genid = rt_genid(net);
1764 	rth->rt_flags 	= flags|RTCF_LOCAL;
1765 	rth->rt_type	= res.type;
1766 	rth->rt_is_input = 1;
1767 	rth->rt_iif	= 0;
1768 	rth->rt_pmtu	= 0;
1769 	rth->rt_gateway	= 0;
1770 	rth->rt_uses_gateway = 0;
1771 	INIT_LIST_HEAD(&rth->rt_uncached);
1772 	if (res.type == RTN_UNREACHABLE) {
1773 		rth->dst.input= ip_error;
1774 		rth->dst.error= -err;
1775 		rth->rt_flags 	&= ~RTCF_LOCAL;
1776 	}
1777 	if (do_cache)
1778 		rt_cache_route(&FIB_RES_NH(res), rth);
1779 	skb_dst_set(skb, &rth->dst);
1780 	err = 0;
1781 	goto out;
1782 
1783 no_route:
1784 	RT_CACHE_STAT_INC(in_no_route);
1785 	res.type = RTN_UNREACHABLE;
1786 	if (err == -ESRCH)
1787 		err = -ENETUNREACH;
1788 	goto local_input;
1789 
1790 	/*
1791 	 *	Do not cache martian addresses: they should be logged (RFC1812)
1792 	 */
1793 martian_destination:
1794 	RT_CACHE_STAT_INC(in_martian_dst);
1795 #ifdef CONFIG_IP_ROUTE_VERBOSE
1796 	if (IN_DEV_LOG_MARTIANS(in_dev))
1797 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1798 				     &daddr, &saddr, dev->name);
1799 #endif
1800 
1801 e_inval:
1802 	err = -EINVAL;
1803 	goto out;
1804 
1805 e_nobufs:
1806 	err = -ENOBUFS;
1807 	goto out;
1808 
1809 martian_source:
1810 	err = -EINVAL;
1811 martian_source_keep_err:
1812 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1813 	goto out;
1814 }
1815 
1816 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1817 			 u8 tos, struct net_device *dev)
1818 {
1819 	int res;
1820 
1821 	rcu_read_lock();
1822 
1823 	/* Multicast recognition logic was moved from the route cache to here.
1824 	   The problem was that too many Ethernet cards have broken/missing
1825 	   hardware multicast filters :-( As a result, a host on a multicast
1826 	   network acquires a lot of useless route cache entries, e.g. for
1827 	   SDR messages from all over the world. Now we try to get rid of them.
1828 	   Really, provided the software IP multicast filter is organized
1829 	   reasonably (at least, hashed), this does not cause a slowdown
1830 	   compared with route cache reject entries.
1831 	   Note that multicast routers are not affected, because a
1832 	   route cache entry is created eventually.
1833 	 */
1834 	if (ipv4_is_multicast(daddr)) {
1835 		struct in_device *in_dev = __in_dev_get_rcu(dev);
1836 
1837 		if (in_dev) {
1838 			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1839 						  ip_hdr(skb)->protocol);
1840 			if (our
1841 #ifdef CONFIG_IP_MROUTE
1842 				||
1843 			    (!ipv4_is_local_multicast(daddr) &&
1844 			     IN_DEV_MFORWARD(in_dev))
1845 #endif
1846 			   ) {
1847 				int res = ip_route_input_mc(skb, daddr, saddr,
1848 							    tos, dev, our);
1849 				rcu_read_unlock();
1850 				return res;
1851 			}
1852 		}
1853 		rcu_read_unlock();
1854 		return -EINVAL;
1855 	}
1856 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1857 	rcu_read_unlock();
1858 	return res;
1859 }
1860 EXPORT_SYMBOL(ip_route_input_noref);
1861 
1862 /* called with rcu_read_lock() */
1863 static struct rtable *__mkroute_output(const struct fib_result *res,
1864 				       const struct flowi4 *fl4, int orig_oif,
1865 				       struct net_device *dev_out,
1866 				       unsigned int flags)
1867 {
1868 	struct fib_info *fi = res->fi;
1869 	struct fib_nh_exception *fnhe;
1870 	struct in_device *in_dev;
1871 	u16 type = res->type;
1872 	struct rtable *rth;
1873 	bool do_cache;
1874 
1875 	in_dev = __in_dev_get_rcu(dev_out);
1876 	if (!in_dev)
1877 		return ERR_PTR(-EINVAL);
1878 
1879 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1880 		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1881 			return ERR_PTR(-EINVAL);
1882 
1883 	if (ipv4_is_lbcast(fl4->daddr))
1884 		type = RTN_BROADCAST;
1885 	else if (ipv4_is_multicast(fl4->daddr))
1886 		type = RTN_MULTICAST;
1887 	else if (ipv4_is_zeronet(fl4->daddr))
1888 		return ERR_PTR(-EINVAL);
1889 
1890 	if (dev_out->flags & IFF_LOOPBACK)
1891 		flags |= RTCF_LOCAL;
1892 
1893 	do_cache = true;
1894 	if (type == RTN_BROADCAST) {
1895 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
1896 		fi = NULL;
1897 	} else if (type == RTN_MULTICAST) {
1898 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
1899 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1900 				     fl4->flowi4_proto))
1901 			flags &= ~RTCF_LOCAL;
1902 		else
1903 			do_cache = false;
1904 		/* If a multicast route does not exist, use the
1905 		 * default one, but do not use a gateway in this case.
1906 		 * Yes, it is a hack.
1907 		 */
1908 		if (fi && res->prefixlen < 4)
1909 			fi = NULL;
1910 	}
1911 
1912 	fnhe = NULL;
1913 	do_cache &= fi != NULL;
1914 	if (do_cache) {
1915 		struct rtable __rcu **prth;
1916 		struct fib_nh *nh = &FIB_RES_NH(*res);
1917 
1918 		fnhe = find_exception(nh, fl4->daddr);
1919 		if (fnhe)
1920 			prth = &fnhe->fnhe_rth_output;
1921 		else {
1922 			if (unlikely(fl4->flowi4_flags &
1923 				     FLOWI_FLAG_KNOWN_NH &&
1924 				     !(nh->nh_gw &&
1925 				       nh->nh_scope == RT_SCOPE_LINK))) {
1926 				do_cache = false;
1927 				goto add;
1928 			}
1929 			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1930 		}
1931 		rth = rcu_dereference(*prth);
1932 		if (rt_cache_valid(rth)) {
1933 			dst_hold(&rth->dst);
1934 			return rth;
1935 		}
1936 	}
1937 
1938 add:
1939 	rth = rt_dst_alloc(dev_out,
1940 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1941 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
1942 			   do_cache);
1943 	if (!rth)
1944 		return ERR_PTR(-ENOBUFS);
1945 
1946 	rth->dst.output = ip_output;
1947 
1948 	rth->rt_genid = rt_genid(dev_net(dev_out));
1949 	rth->rt_flags	= flags;
1950 	rth->rt_type	= type;
1951 	rth->rt_is_input = 0;
1952 	rth->rt_iif	= orig_oif ? : 0;
1953 	rth->rt_pmtu	= 0;
1954 	rth->rt_gateway = 0;
1955 	rth->rt_uses_gateway = 0;
1956 	INIT_LIST_HEAD(&rth->rt_uncached);
1957 
1958 	RT_CACHE_STAT_INC(out_slow_tot);
1959 
1960 	if (flags & RTCF_LOCAL)
1961 		rth->dst.input = ip_local_deliver;
1962 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1963 		if (flags & RTCF_LOCAL &&
1964 		    !(dev_out->flags & IFF_LOOPBACK)) {
1965 			rth->dst.output = ip_mc_output;
1966 			RT_CACHE_STAT_INC(out_slow_mc);
1967 		}
1968 #ifdef CONFIG_IP_MROUTE
1969 		if (type == RTN_MULTICAST) {
1970 			if (IN_DEV_MFORWARD(in_dev) &&
1971 			    !ipv4_is_local_multicast(fl4->daddr)) {
1972 				rth->dst.input = ip_mr_input;
1973 				rth->dst.output = ip_mc_output;
1974 			}
1975 		}
1976 #endif
1977 	}
1978 
1979 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1980 
1981 	return rth;
1982 }
1983 
1984 /*
1985  * Major route resolver routine.
1986  */
1987 
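/* Resolve an output route for @fl4: validate or choose the source address,
 * pick the output device, consult the FIB, and finally build the rtable
 * via __mkroute_output().  Takes rcu_read_lock() internally and returns
 * either a struct rtable or an ERR_PTR(); fields left unset in @fl4
 * (saddr, daddr, oif) may be filled in with the chosen values.
 */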
1988 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1989 {
1990 	struct net_device *dev_out = NULL;
1991 	__u8 tos = RT_FL_TOS(fl4);
1992 	unsigned int flags = 0;
1993 	struct fib_result res;
1994 	struct rtable *rth;
1995 	int orig_oif;
1996 
1997 	res.tclassid	= 0;
1998 	res.fi		= NULL;
1999 	res.table	= NULL;
2000 
2001 	orig_oif = fl4->flowi4_oif;
2002 
2003 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2004 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2005 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2006 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2007 
2008 	rcu_read_lock();
2009 	if (fl4->saddr) {
2010 		rth = ERR_PTR(-EINVAL);
2011 		if (ipv4_is_multicast(fl4->saddr) ||
2012 		    ipv4_is_lbcast(fl4->saddr) ||
2013 		    ipv4_is_zeronet(fl4->saddr))
2014 			goto out;
2015 
2016 		/* The check for oif == dev_out->oif was removed here.
2017 		   It was wrong for two reasons:
2018 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2019 		      is assigned to multiple interfaces.
2020 		   2. Moreover, we are allowed to send packets with a saddr
2021 		      belonging to another iface. --ANK
2022 		 */
2023 
2024 		if (fl4->flowi4_oif == 0 &&
2025 		    (ipv4_is_multicast(fl4->daddr) ||
2026 		     ipv4_is_lbcast(fl4->daddr))) {
2027 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2028 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2029 			if (dev_out == NULL)
2030 				goto out;
2031 
2032 			/* Special hack: the user can direct multicasts
2033 			   and limited broadcast via the necessary interface
2034 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2035 			   This hack is not just for fun; it allows
2036 			   vic, vat and friends to work.
2037 			   They bind a socket to loopback, set the ttl to zero
2038 			   and expect that it will work.
2039 			   From the viewpoint of the routing cache they are broken,
2040 			   because we are not allowed to build a multicast path
2041 			   with a loopback source addr (the routing cache
2042 			   cannot know that the ttl is zero, so the packet
2043 			   will not leave this host and the route is valid).
2044 			   Luckily, this hack is a good workaround.
2045 			 */
2046 
2047 			fl4->flowi4_oif = dev_out->ifindex;
2048 			goto make_route;
2049 		}
2050 
2051 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2052 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2053 			if (!__ip_dev_find(net, fl4->saddr, false))
2054 				goto out;
2055 		}
2056 	}
2057 
2058 
2059 	if (fl4->flowi4_oif) {
2060 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2061 		rth = ERR_PTR(-ENODEV);
2062 		if (dev_out == NULL)
2063 			goto out;
2064 
2065 		/* RACE: Check return value of inet_select_addr instead. */
2066 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2067 			rth = ERR_PTR(-ENETUNREACH);
2068 			goto out;
2069 		}
2070 		if (ipv4_is_local_multicast(fl4->daddr) ||
2071 		    ipv4_is_lbcast(fl4->daddr)) {
2072 			if (!fl4->saddr)
2073 				fl4->saddr = inet_select_addr(dev_out, 0,
2074 							      RT_SCOPE_LINK);
2075 			goto make_route;
2076 		}
2077 		if (fl4->saddr) {
2078 			if (ipv4_is_multicast(fl4->daddr))
2079 				fl4->saddr = inet_select_addr(dev_out, 0,
2080 							      fl4->flowi4_scope);
2081 			else if (!fl4->daddr)
2082 				fl4->saddr = inet_select_addr(dev_out, 0,
2083 							      RT_SCOPE_HOST);
2084 		}
2085 	}
2086 
2087 	if (!fl4->daddr) {
2088 		fl4->daddr = fl4->saddr;
2089 		if (!fl4->daddr)
2090 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2091 		dev_out = net->loopback_dev;
2092 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2093 		res.type = RTN_LOCAL;
2094 		flags |= RTCF_LOCAL;
2095 		goto make_route;
2096 	}
2097 
2098 	if (fib_lookup(net, fl4, &res)) {
2099 		res.fi = NULL;
2100 		res.table = NULL;
2101 		if (fl4->flowi4_oif) {
2102 			/* Apparently, the routing tables are wrong. Assume
2103 			   that the destination is on-link.
2104 
2105 			   WHY? DW.
2106 			   Because we are allowed to send to an iface
2107 			   even if it has NO routes and NO assigned
2108 			   addresses. When oif is specified, the routing
2109 			   tables are looked up for only one purpose:
2110 			   to check whether the destination is gatewayed rather
2111 			   than direct. Moreover, if MSG_DONTROUTE is set,
2112 			   we send the packet, ignoring both the routing tables
2113 			   and the ifaddr state. --ANK
2114 
2115 
2116 			   We could do this even if oif is unknown
2117 			   (as IPv6 likely does), but we do not.
2118 			 */
2119 
2120 			if (fl4->saddr == 0)
2121 				fl4->saddr = inet_select_addr(dev_out, 0,
2122 							      RT_SCOPE_LINK);
2123 			res.type = RTN_UNICAST;
2124 			goto make_route;
2125 		}
2126 		rth = ERR_PTR(-ENETUNREACH);
2127 		goto out;
2128 	}
2129 
2130 	if (res.type == RTN_LOCAL) {
2131 		if (!fl4->saddr) {
2132 			if (res.fi->fib_prefsrc)
2133 				fl4->saddr = res.fi->fib_prefsrc;
2134 			else
2135 				fl4->saddr = fl4->daddr;
2136 		}
2137 		dev_out = net->loopback_dev;
2138 		fl4->flowi4_oif = dev_out->ifindex;
2139 		flags |= RTCF_LOCAL;
2140 		goto make_route;
2141 	}
2142 
2143 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2144 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2145 		fib_select_multipath(&res);
2146 	else
2147 #endif
2148 	if (!res.prefixlen &&
2149 	    res.table->tb_num_default > 1 &&
2150 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2151 		fib_select_default(&res);
2152 
2153 	if (!fl4->saddr)
2154 		fl4->saddr = FIB_RES_PREFSRC(net, res);
2155 
2156 	dev_out = FIB_RES_DEV(res);
2157 	fl4->flowi4_oif = dev_out->ifindex;
2158 
2159 
2160 make_route:
2161 	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2162 
2163 out:
2164 	rcu_read_unlock();
2165 	return rth;
2166 }
2167 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2168 
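/* dst_ops for blackhole routes, handed out by ipv4_blackhole_route() below.
 * They are typically used by the xfrm layer when it needs a dst that drops
 * all traffic (for example while IPsec states are still being resolved):
 * PMTU updates, redirects and metric writes are all silently ignored.
 */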
2169 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2170 {
2171 	return NULL;
2172 }
2173 
2174 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2175 {
2176 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2177 
2178 	return mtu ? : dst->dev->mtu;
2179 }
2180 
2181 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2182 					  struct sk_buff *skb, u32 mtu)
2183 {
2184 }
2185 
2186 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2187 				       struct sk_buff *skb)
2188 {
2189 }
2190 
2191 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2192 					  unsigned long old)
2193 {
2194 	return NULL;
2195 }
2196 
2197 static struct dst_ops ipv4_dst_blackhole_ops = {
2198 	.family			=	AF_INET,
2199 	.protocol		=	cpu_to_be16(ETH_P_IP),
2200 	.check			=	ipv4_blackhole_dst_check,
2201 	.mtu			=	ipv4_blackhole_mtu,
2202 	.default_advmss		=	ipv4_default_advmss,
2203 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2204 	.redirect		=	ipv4_rt_blackhole_redirect,
2205 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2206 	.neigh_lookup		=	ipv4_neigh_lookup,
2207 };
2208 
2209 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2210 {
2211 	struct rtable *ort = (struct rtable *) dst_orig;
2212 	struct rtable *rt;
2213 
2214 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2215 	if (rt) {
2216 		struct dst_entry *new = &rt->dst;
2217 
2218 		new->__use = 1;
2219 		new->input = dst_discard;
2220 		new->output = dst_discard;
2221 
2222 		new->dev = ort->dst.dev;
2223 		if (new->dev)
2224 			dev_hold(new->dev);
2225 
2226 		rt->rt_is_input = ort->rt_is_input;
2227 		rt->rt_iif = ort->rt_iif;
2228 		rt->rt_pmtu = ort->rt_pmtu;
2229 
2230 		rt->rt_genid = rt_genid(net);
2231 		rt->rt_flags = ort->rt_flags;
2232 		rt->rt_type = ort->rt_type;
2233 		rt->rt_gateway = ort->rt_gateway;
2234 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2235 
2236 		INIT_LIST_HEAD(&rt->rt_uncached);
2237 
2238 		dst_free(new);
2239 	}
2240 
2241 	dst_release(dst_orig);
2242 
2243 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2244 }
2245 
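/* Output route lookup that additionally applies IPsec policy: when the flow
 * specifies a transport protocol, the plain route from __ip_route_output_key()
 * is passed through xfrm_lookup() so a transformed dst can be returned.
 */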
2246 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2247 				    struct sock *sk)
2248 {
2249 	struct rtable *rt = __ip_route_output_key(net, flp4);
2250 
2251 	if (IS_ERR(rt))
2252 		return rt;
2253 
2254 	if (flp4->flowi4_proto)
2255 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2256 						   flowi4_to_flowi(flp4),
2257 						   sk, 0);
2258 
2259 	return rt;
2260 }
2261 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2262 
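/* Dump the route attached to @skb into a single RTM_NEWROUTE netlink
 * message: type, flags, addresses, oif/iif, gateway, metrics, mark and
 * cache info.  Returns a negative error such as -EMSGSIZE when the
 * message does not fit into @skb.
 */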
2263 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2264 			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2265 			u32 seq, int event, int nowait, unsigned int flags)
2266 {
2267 	struct rtable *rt = skb_rtable(skb);
2268 	struct rtmsg *r;
2269 	struct nlmsghdr *nlh;
2270 	unsigned long expires = 0;
2271 	u32 error;
2272 	u32 metrics[RTAX_MAX];
2273 
2274 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2275 	if (nlh == NULL)
2276 		return -EMSGSIZE;
2277 
2278 	r = nlmsg_data(nlh);
2279 	r->rtm_family	 = AF_INET;
2280 	r->rtm_dst_len	= 32;
2281 	r->rtm_src_len	= 0;
2282 	r->rtm_tos	= fl4->flowi4_tos;
2283 	r->rtm_table	= RT_TABLE_MAIN;
2284 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2285 		goto nla_put_failure;
2286 	r->rtm_type	= rt->rt_type;
2287 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2288 	r->rtm_protocol = RTPROT_UNSPEC;
2289 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2290 	if (rt->rt_flags & RTCF_NOTIFY)
2291 		r->rtm_flags |= RTM_F_NOTIFY;
2292 
2293 	if (nla_put_be32(skb, RTA_DST, dst))
2294 		goto nla_put_failure;
2295 	if (src) {
2296 		r->rtm_src_len = 32;
2297 		if (nla_put_be32(skb, RTA_SRC, src))
2298 			goto nla_put_failure;
2299 	}
2300 	if (rt->dst.dev &&
2301 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2302 		goto nla_put_failure;
2303 #ifdef CONFIG_IP_ROUTE_CLASSID
2304 	if (rt->dst.tclassid &&
2305 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2306 		goto nla_put_failure;
2307 #endif
2308 	if (!rt_is_input_route(rt) &&
2309 	    fl4->saddr != src) {
2310 		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2311 			goto nla_put_failure;
2312 	}
2313 	if (rt->rt_uses_gateway &&
2314 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2315 		goto nla_put_failure;
2316 
2317 	expires = rt->dst.expires;
2318 	if (expires) {
2319 		unsigned long now = jiffies;
2320 
2321 		if (time_before(now, expires))
2322 			expires -= now;
2323 		else
2324 			expires = 0;
2325 	}
2326 
2327 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2328 	if (rt->rt_pmtu && expires)
2329 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2330 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2331 		goto nla_put_failure;
2332 
2333 	if (fl4->flowi4_mark &&
2334 	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2335 		goto nla_put_failure;
2336 
2337 	error = rt->dst.error;
2338 
2339 	if (rt_is_input_route(rt)) {
2340 #ifdef CONFIG_IP_MROUTE
2341 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2342 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2343 			int err = ipmr_get_route(net, skb,
2344 						 fl4->saddr, fl4->daddr,
2345 						 r, nowait);
2346 			if (err <= 0) {
2347 				if (!nowait) {
2348 					if (err == 0)
2349 						return 0;
2350 					goto nla_put_failure;
2351 				} else {
2352 					if (err == -EMSGSIZE)
2353 						goto nla_put_failure;
2354 					error = err;
2355 				}
2356 			}
2357 		} else
2358 #endif
2359 			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2360 				goto nla_put_failure;
2361 	}
2362 
2363 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2364 		goto nla_put_failure;
2365 
2366 	return nlmsg_end(skb, nlh);
2367 
2368 nla_put_failure:
2369 	nlmsg_cancel(skb, nlh);
2370 	return -EMSGSIZE;
2371 }
2372 
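/* RTM_GETROUTE handler: build a dummy skb, resolve it through the input
 * path (when RTA_IIF is given) or the output path, and unicast the result
 * back to the requester as an RTM_NEWROUTE message.
 */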
2373 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2374 {
2375 	struct net *net = sock_net(in_skb->sk);
2376 	struct rtmsg *rtm;
2377 	struct nlattr *tb[RTA_MAX+1];
2378 	struct rtable *rt = NULL;
2379 	struct flowi4 fl4;
2380 	__be32 dst = 0;
2381 	__be32 src = 0;
2382 	u32 iif;
2383 	int err;
2384 	int mark;
2385 	struct sk_buff *skb;
2386 
2387 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2388 	if (err < 0)
2389 		goto errout;
2390 
2391 	rtm = nlmsg_data(nlh);
2392 
2393 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2394 	if (skb == NULL) {
2395 		err = -ENOBUFS;
2396 		goto errout;
2397 	}
2398 
2399 	/* Reserve room for dummy headers; this skb can pass
2400 	   through a good chunk of the routing engine.
2401 	 */
2402 	skb_reset_mac_header(skb);
2403 	skb_reset_network_header(skb);
2404 
2405 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2406 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2407 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2408 
2409 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2410 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2411 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2412 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2413 
2414 	memset(&fl4, 0, sizeof(fl4));
2415 	fl4.daddr = dst;
2416 	fl4.saddr = src;
2417 	fl4.flowi4_tos = rtm->rtm_tos;
2418 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2419 	fl4.flowi4_mark = mark;
2420 
2421 	if (iif) {
2422 		struct net_device *dev;
2423 
2424 		dev = __dev_get_by_index(net, iif);
2425 		if (dev == NULL) {
2426 			err = -ENODEV;
2427 			goto errout_free;
2428 		}
2429 
2430 		skb->protocol	= htons(ETH_P_IP);
2431 		skb->dev	= dev;
2432 		skb->mark	= mark;
2433 		local_bh_disable();
2434 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2435 		local_bh_enable();
2436 
2437 		rt = skb_rtable(skb);
2438 		if (err == 0 && rt->dst.error)
2439 			err = -rt->dst.error;
2440 	} else {
2441 		rt = ip_route_output_key(net, &fl4);
2442 
2443 		err = 0;
2444 		if (IS_ERR(rt))
2445 			err = PTR_ERR(rt);
2446 	}
2447 
2448 	if (err)
2449 		goto errout_free;
2450 
2451 	skb_dst_set(skb, &rt->dst);
2452 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2453 		rt->rt_flags |= RTCF_NOTIFY;
2454 
2455 	err = rt_fill_info(net, dst, src, &fl4, skb,
2456 			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2457 			   RTM_NEWROUTE, 0, 0);
2458 	if (err <= 0)
2459 		goto errout_free;
2460 
2461 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2462 errout:
2463 	return err;
2464 
2465 errout_free:
2466 	kfree_skb(skb);
2467 	goto errout;
2468 }
2469 
2470 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2471 {
2472 	return skb->len;
2473 }
2474 
2475 void ip_rt_multicast_event(struct in_device *in_dev)
2476 {
2477 	rt_cache_flush(dev_net(in_dev->dev));
2478 }
2479 
2480 #ifdef CONFIG_SYSCTL
2481 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
2482 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2483 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2484 static int ip_rt_gc_elasticity __read_mostly	= 8;
2485 
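/* Writing any value to /proc/sys/net/ipv4/route/flush invalidates all cached
 * routes and learned next-hop exceptions in this netns by bumping the
 * generation counters, e.g.:
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * The entry is write-only; the handler returns -EINVAL for anything but
 * a write.
 */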
2486 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2487 					void __user *buffer,
2488 					size_t *lenp, loff_t *ppos)
2489 {
2490 	struct net *net = (struct net *)__ctl->extra1;
2491 
2492 	if (write) {
2493 		rt_cache_flush(net);
2494 		fnhe_genid_bump(net);
2495 		return 0;
2496 	}
2497 
2498 	return -EINVAL;
2499 }
2500 
2501 static struct ctl_table ipv4_route_table[] = {
2502 	{
2503 		.procname	= "gc_thresh",
2504 		.data		= &ipv4_dst_ops.gc_thresh,
2505 		.maxlen		= sizeof(int),
2506 		.mode		= 0644,
2507 		.proc_handler	= proc_dointvec,
2508 	},
2509 	{
2510 		.procname	= "max_size",
2511 		.data		= &ip_rt_max_size,
2512 		.maxlen		= sizeof(int),
2513 		.mode		= 0644,
2514 		.proc_handler	= proc_dointvec,
2515 	},
2516 	{
2517 		/*  Deprecated. Use gc_min_interval_ms */
2518 
2519 		.procname	= "gc_min_interval",
2520 		.data		= &ip_rt_gc_min_interval,
2521 		.maxlen		= sizeof(int),
2522 		.mode		= 0644,
2523 		.proc_handler	= proc_dointvec_jiffies,
2524 	},
2525 	{
2526 		.procname	= "gc_min_interval_ms",
2527 		.data		= &ip_rt_gc_min_interval,
2528 		.maxlen		= sizeof(int),
2529 		.mode		= 0644,
2530 		.proc_handler	= proc_dointvec_ms_jiffies,
2531 	},
2532 	{
2533 		.procname	= "gc_timeout",
2534 		.data		= &ip_rt_gc_timeout,
2535 		.maxlen		= sizeof(int),
2536 		.mode		= 0644,
2537 		.proc_handler	= proc_dointvec_jiffies,
2538 	},
2539 	{
2540 		.procname	= "gc_interval",
2541 		.data		= &ip_rt_gc_interval,
2542 		.maxlen		= sizeof(int),
2543 		.mode		= 0644,
2544 		.proc_handler	= proc_dointvec_jiffies,
2545 	},
2546 	{
2547 		.procname	= "redirect_load",
2548 		.data		= &ip_rt_redirect_load,
2549 		.maxlen		= sizeof(int),
2550 		.mode		= 0644,
2551 		.proc_handler	= proc_dointvec,
2552 	},
2553 	{
2554 		.procname	= "redirect_number",
2555 		.data		= &ip_rt_redirect_number,
2556 		.maxlen		= sizeof(int),
2557 		.mode		= 0644,
2558 		.proc_handler	= proc_dointvec,
2559 	},
2560 	{
2561 		.procname	= "redirect_silence",
2562 		.data		= &ip_rt_redirect_silence,
2563 		.maxlen		= sizeof(int),
2564 		.mode		= 0644,
2565 		.proc_handler	= proc_dointvec,
2566 	},
2567 	{
2568 		.procname	= "error_cost",
2569 		.data		= &ip_rt_error_cost,
2570 		.maxlen		= sizeof(int),
2571 		.mode		= 0644,
2572 		.proc_handler	= proc_dointvec,
2573 	},
2574 	{
2575 		.procname	= "error_burst",
2576 		.data		= &ip_rt_error_burst,
2577 		.maxlen		= sizeof(int),
2578 		.mode		= 0644,
2579 		.proc_handler	= proc_dointvec,
2580 	},
2581 	{
2582 		.procname	= "gc_elasticity",
2583 		.data		= &ip_rt_gc_elasticity,
2584 		.maxlen		= sizeof(int),
2585 		.mode		= 0644,
2586 		.proc_handler	= proc_dointvec,
2587 	},
2588 	{
2589 		.procname	= "mtu_expires",
2590 		.data		= &ip_rt_mtu_expires,
2591 		.maxlen		= sizeof(int),
2592 		.mode		= 0644,
2593 		.proc_handler	= proc_dointvec_jiffies,
2594 	},
2595 	{
2596 		.procname	= "min_pmtu",
2597 		.data		= &ip_rt_min_pmtu,
2598 		.maxlen		= sizeof(int),
2599 		.mode		= 0644,
2600 		.proc_handler	= proc_dointvec,
2601 	},
2602 	{
2603 		.procname	= "min_adv_mss",
2604 		.data		= &ip_rt_min_advmss,
2605 		.maxlen		= sizeof(int),
2606 		.mode		= 0644,
2607 		.proc_handler	= proc_dointvec,
2608 	},
2609 	{ }
2610 };
2611 
2612 static struct ctl_table ipv4_route_flush_table[] = {
2613 	{
2614 		.procname	= "flush",
2615 		.maxlen		= sizeof(int),
2616 		.mode		= 0200,
2617 		.proc_handler	= ipv4_sysctl_rtcache_flush,
2618 	},
2619 	{ },
2620 };
2621 
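/* Each non-init netns registers its own copy of the flush table so that
 * extra1 can point back at the owning netns; for namespaces owned by a
 * non-initial user namespace the procname is cleared so the sysctl is not
 * exposed to unprivileged users.
 */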
2622 static __net_init int sysctl_route_net_init(struct net *net)
2623 {
2624 	struct ctl_table *tbl;
2625 
2626 	tbl = ipv4_route_flush_table;
2627 	if (!net_eq(net, &init_net)) {
2628 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2629 		if (tbl == NULL)
2630 			goto err_dup;
2631 
2632 		/* Don't export sysctls to unprivileged users */
2633 		if (net->user_ns != &init_user_ns)
2634 			tbl[0].procname = NULL;
2635 	}
2636 	tbl[0].extra1 = net;
2637 
2638 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2639 	if (net->ipv4.route_hdr == NULL)
2640 		goto err_reg;
2641 	return 0;
2642 
2643 err_reg:
2644 	if (tbl != ipv4_route_flush_table)
2645 		kfree(tbl);
2646 err_dup:
2647 	return -ENOMEM;
2648 }
2649 
2650 static __net_exit void sysctl_route_net_exit(struct net *net)
2651 {
2652 	struct ctl_table *tbl;
2653 
2654 	tbl = net->ipv4.route_hdr->ctl_table_arg;
2655 	unregister_net_sysctl_table(net->ipv4.route_hdr);
2656 	BUG_ON(tbl == ipv4_route_flush_table);
2657 	kfree(tbl);
2658 }
2659 
2660 static __net_initdata struct pernet_operations sysctl_route_ops = {
2661 	.init = sysctl_route_net_init,
2662 	.exit = sysctl_route_net_exit,
2663 };
2664 #endif
2665 
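/* Per-netns generation counters: bumping rt_genid invalidates every cached
 * route, and bumping fnhe_genid invalidates learned next-hop exceptions
 * (PMTU, redirects).  dev_addr_genid simply starts at a random value for
 * each namespace.
 */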
2666 static __net_init int rt_genid_init(struct net *net)
2667 {
2668 	atomic_set(&net->rt_genid, 0);
2669 	atomic_set(&net->fnhe_genid, 0);
2670 	get_random_bytes(&net->ipv4.dev_addr_genid,
2671 			 sizeof(net->ipv4.dev_addr_genid));
2672 	return 0;
2673 }
2674 
2675 static __net_initdata struct pernet_operations rt_genid_ops = {
2676 	.init = rt_genid_init,
2677 };
2678 
2679 static int __net_init ipv4_inetpeer_init(struct net *net)
2680 {
2681 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2682 
2683 	if (!bp)
2684 		return -ENOMEM;
2685 	inet_peer_base_init(bp);
2686 	net->ipv4.peers = bp;
2687 	return 0;
2688 }
2689 
2690 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2691 {
2692 	struct inet_peer_base *bp = net->ipv4.peers;
2693 
2694 	net->ipv4.peers = NULL;
2695 	inetpeer_invalidate_tree(bp);
2696 	kfree(bp);
2697 }
2698 
2699 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2700 	.init	=	ipv4_inetpeer_init,
2701 	.exit	=	ipv4_inetpeer_exit,
2702 };
2703 
2704 #ifdef CONFIG_IP_ROUTE_CLASSID
2705 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2706 #endif /* CONFIG_IP_ROUTE_CLASSID */
2707 
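/* Boot-time initialization of the IPv4 routing layer: dst caches and
 * counters, devinet and FIB, /proc files, optional xfrm hooks, the
 * RTM_GETROUTE handler and the per-netns sysctl, genid and inetpeer
 * subsystems.
 */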
2708 int __init ip_rt_init(void)
2709 {
2710 	int rc = 0;
2711 
2712 #ifdef CONFIG_IP_ROUTE_CLASSID
2713 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2714 	if (!ip_rt_acct)
2715 		panic("IP: failed to allocate ip_rt_acct\n");
2716 #endif
2717 
2718 	ipv4_dst_ops.kmem_cachep =
2719 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2720 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2721 
2722 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2723 
2724 	if (dst_entries_init(&ipv4_dst_ops) < 0)
2725 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
2726 
2727 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2728 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2729 
2730 	ipv4_dst_ops.gc_thresh = ~0;
2731 	ip_rt_max_size = INT_MAX;
2732 
2733 	devinet_init();
2734 	ip_fib_init();
2735 
2736 	if (ip_rt_proc_init())
2737 		pr_err("Unable to create route proc files\n");
2738 #ifdef CONFIG_XFRM
2739 	xfrm_init();
2740 	xfrm4_init();
2741 #endif
2742 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2743 
2744 #ifdef CONFIG_SYSCTL
2745 	register_pernet_subsys(&sysctl_route_ops);
2746 #endif
2747 	register_pernet_subsys(&rt_genid_ops);
2748 	register_pernet_subsys(&ipv4_inetpeer_ops);
2749 	return rc;
2750 }
2751 
2752 #ifdef CONFIG_SYSCTL
2753 /*
2754  * We really need to sanitize the damn ipv4 init order, then all
2755  * this nonsense will go away.
2756  */
2757 void __init ip_static_sysctl_init(void)
2758 {
2759 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2760 }
2761 #endif
2762