xref: /linux/net/ipv4/route.c (revision 44b111b519160e33fdc41eadb39af86a24707edf)
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/atmclip.h>
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
    ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;
static int redirect_genid;
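
/*
 * A derivation worth spelling out (observation, not new behaviour): the
 * initializers above make ip_rt_redirect_silence equal to
 * ip_rt_redirect_load << (ip_rt_redirect_number + 1), i.e. the silence
 * window covers the full span the exponential backoff in
 * ip_rt_send_redirect() can reach before we give up on a host.
 */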

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
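
/*
 * Illustrative sketch (kept out of the build): the TOS bits index this
 * table after masking and dropping the low bit, in the style of the
 * rt_tos2priority() helper.  For example, IPTOS_LOWDELAY (0x10) indexes
 * slot 8 and maps to TC_PRIO_INTERACTIVE.
 */
#if 0
static inline char example_tos2prio(u8 tos)
{
	return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}
#endif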


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
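
/*
 * A minimal lookup sketch (not compiled), following the pattern of the
 * seq_file iterator below and of callers elsewhere in this file: hash the
 * flow keys together with the current genid, then walk the bucket under
 * the BH-RCU lock.  Mixing the genid into the hash is what lets
 * rt_cache_invalidate() retire all cached entries without walking them.
 */
#if 0
static struct rtable *example_cache_walk(struct net *net, __be32 daddr,
					 __be32 saddr, int iif)
{
	unsigned int hash = rt_hash(daddr, saddr, iif, rt_genid(net));
	struct rtable *rth;

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		/* a real lookup also checks keys and genid, cf. compare_keys() */
	}
	rcu_read_unlock_bh();
	return NULL;
}
#endif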

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		rcu_read_lock();
		n = dst_get_neighbour(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,
			HHUptod,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in the hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
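
/*
 * Example with hypothetical numbers: an unreferenced output route last
 * used 100 jiffies ago scores (~100 & ~(3 << 30)) | (1 << 30).  Because
 * the age is bit-inverted, fresher entries score higher, so the eviction
 * scan in rt_intern_hash() below picks the oldest, least valuable
 * candidate (the minimum score) first.
 */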

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}
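
/*
 * The XOR/OR idiom above is a branch-free multi-field compare: each XOR
 * term is zero only when its pair of fields matches, so the OR of all
 * terms is zero iff every field matches.
 */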

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
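
/*
 * Worked example of the fixed-point scheme, assuming FRACT_BITS == 3
 * (so ONE == 8): an average chain length of 2.5 is stored as 20; with a
 * standard deviation of 0.5 (stored as 4), AVG + 4*SD is 20 + 16 == 36,
 * which shifts back down by FRACT_BITS to an integer candidate of 4.
 */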

/*
 * Given a hash chain and an item in this hash chain,
 * determine whether a previous entry has the same hash inputs
 * (but differs on tos, mark or oif).
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without reusing a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	redirect_genid++;
}
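
/*
 * Usage sketch (fragment, not compiled), following the callers below: a
 * negative delay only bumps the genid, so stale entries are caught lazily
 * by rt_is_expired(); a non-negative delay also walks and frees the chains.
 */
#if 0
	rt_cache_flush(net, -1);	/* lazy: entries die on next lookup */
	rt_cache_flush(net, 0);		/* eager: rt_do_flush() frees them now */
#endif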

/*
 * delay < 0  : invalidate cache (fast: entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previously invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   stays approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when networking
   is idle, expire is large enough to keep enough warm entries,
   and when load increases, it shrinks to limit the cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not run it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate the number of entries we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* The goal is not achieved. We stop the process if:

		   - expire has been reduced to zero; otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt context.
		   - the jiffies check is just a fallback/debug loop breaker;
		     we will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}

/*
 * Returns the number of entries in a hash chain that have distinct hash inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	struct neigh_table *tbl = &arp_tbl;
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	struct neighbour *n;

#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
	if (dev->type == ARPHRD_ATM)
		tbl = clip_tbl_hook;
#endif
	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;

	n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(tbl, pkey, dev);
}

static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}

static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just hand rt back, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey).
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting for a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when exceeded, GC becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind the route to ARP only if it is an output
	   route or on the unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* The neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   it most likely holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUs.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
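
/*
 * Caller-side sketch (fragment, not compiled; mirrors ip_route_input_mc()
 * below): rt_intern_hash() returns the inserted route, an existing
 * matching cache entry (our copy is dropped), or an ERR_PTR() on failure,
 * so callers unwrap it like this:
 */
#if 0
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
#endif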

static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However,
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no way to
 * select an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!n || !(n->nud_state & NUD_VALID)) {
		if (n)
			neigh_event_send(n, NULL);
		rt->rt_gateway = orig_gw;
		return -EAGAIN;
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
	return 0;
}

/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw ||
					    peer->redirect_genid != redirect_genid) {
						peer->redirect_learned.a4 = new_gw;
						peer->redirect_genid = redirect_genid;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}

static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
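
/*
 * Worked example, assuming HZ == 100: ip_rt_redirect_load is 2 jiffies,
 * so successive redirects back off by 2 << 0, 2 << 1, ... 2 << 8 jiffies;
 * after ip_rt_redirect_number (9) unanswered redirects we go silent until
 * ip_rt_redirect_silence (2 << 10 == 2048 jiffies, ~20s) has passed.
 */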

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
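
/*
 * Worked example: a broken router reporting new_mtu == 0 for a packet
 * with tot_len 1500 makes guess_mtu(1500) walk the plateau table and
 * return 1492, the first value strictly below the old MTU; a second
 * failure at 1492 would step down to 576.
 */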

unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}

static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}


static struct rtable *ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			if (peer->redirect_genid != redirect_genid)
				peer->redirect_learned.a4 = 0;
			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway) {
				if (check_peer_redir(&rt->dst, peer))
					return NULL;
			}
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
	return rt;
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	dst = (struct dst_entry *) ipv4_validate_peer(rt);
	return dst;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}


static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
1809 
1810 #ifdef CONFIG_IP_ROUTE_CLASSID
1811 static void set_class_tag(struct rtable *rt, u32 tag)
1812 {
1813 	if (!(rt->dst.tclassid & 0xFFFF))
1814 		rt->dst.tclassid |= tag & 0xFFFF;
1815 	if (!(rt->dst.tclassid & 0xFFFF0000))
1816 		rt->dst.tclassid |= tag & 0xFFFF0000;
1817 }
1818 #endif
1819 
1820 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1821 {
1822 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1823 
1824 	if (advmss == 0) {
1825 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1826 			       ip_rt_min_advmss);
1827 		if (advmss > 65535 - 40)
1828 			advmss = 65535 - 40;
1829 	}
1830 	return advmss;
1831 }
1832 
1833 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1834 {
1835 	const struct rtable *rt = (const struct rtable *) dst;
1836 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1837 
1838 	if (mtu && rt_is_output_route(rt))
1839 		return mtu;
1840 
1841 	mtu = dst->dev->mtu;
1842 
1843 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1844 
1845 		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1846 			mtu = 576;
1847 	}
1848 
1849 	if (mtu > IP_MAX_MTU)
1850 		mtu = IP_MAX_MTU;
1851 
1852 	return mtu;
1853 }
1854 
1855 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1856 			    struct fib_info *fi)
1857 {
1858 	struct inet_peer *peer;
1859 	int create = 0;
1860 
1861 	/* If a peer entry exists for this destination, we must hook
1862 	 * it up in order to get at cached metrics.
1863 	 */
1864 	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1865 		create = 1;
1866 
1867 	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1868 	if (peer) {
1869 		rt->rt_peer_genid = rt_peer_genid();
1870 		if (inet_metrics_new(peer))
1871 			memcpy(peer->metrics, fi->fib_metrics,
1872 			       sizeof(u32) * RTAX_MAX);
1873 		dst_init_metrics(&rt->dst, peer->metrics, false);
1874 
1875 		check_peer_pmtu(&rt->dst, peer);
1876 		if (peer->redirect_genid != redirect_genid)
1877 			peer->redirect_learned.a4 = 0;
1878 		if (peer->redirect_learned.a4 &&
1879 		    peer->redirect_learned.a4 != rt->rt_gateway) {
1880 			rt->rt_gateway = peer->redirect_learned.a4;
1881 			rt->rt_flags |= RTCF_REDIRECTED;
1882 		}
1883 	} else {
1884 		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1885 			rt->fi = fi;
1886 			atomic_inc(&fi->fib_clntref);
1887 		}
1888 		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1889 	}
1890 }
1891 
1892 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1893 			   const struct fib_result *res,
1894 			   struct fib_info *fi, u16 type, u32 itag)
1895 {
1896 	struct dst_entry *dst = &rt->dst;
1897 
1898 	if (fi) {
1899 		if (FIB_RES_GW(*res) &&
1900 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1901 			rt->rt_gateway = FIB_RES_GW(*res);
1902 		rt_init_metrics(rt, fl4, fi);
1903 #ifdef CONFIG_IP_ROUTE_CLASSID
1904 		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1905 #endif
1906 	}
1907 
1908 	if (dst_mtu(dst) > IP_MAX_MTU)
1909 		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1910 	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1911 		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1912 
1913 #ifdef CONFIG_IP_ROUTE_CLASSID
1914 #ifdef CONFIG_IP_MULTIPLE_TABLES
1915 	set_class_tag(rt, fib_rules_tclass(res));
1916 #endif
1917 	set_class_tag(rt, itag);
1918 #endif
1919 }
1920 
1921 static struct rtable *rt_dst_alloc(struct net_device *dev,
1922 				   bool nopolicy, bool noxfrm)
1923 {
1924 	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1925 			 DST_HOST |
1926 			 (nopolicy ? DST_NOPOLICY : 0) |
1927 			 (noxfrm ? DST_NOXFRM : 0));
1928 }
1929 
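/*
 * Every cache entry below is allocated through this helper, e.g.
 * (illustrative, error handling elided):
 *
 *	rth = rt_dst_alloc(out_dev->dev,
 *			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
 *			   IN_DEV_CONF_GET(out_dev, NOXFRM));
 *
 * DST_HOST is always set because a routing cache entry describes a
 * single host (a full 32-bit key), never a prefix.
 */
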
1930 /* called in rcu_read_lock() section */
1931 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1932 				u8 tos, struct net_device *dev, int our)
1933 {
1934 	unsigned int hash;
1935 	struct rtable *rth;
1936 	__be32 spec_dst;
1937 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1938 	u32 itag = 0;
1939 	int err;
1940 
1941 	/* Primary sanity checks. */
1942 
1943 	if (in_dev == NULL)
1944 		return -EINVAL;
1945 
1946 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1947 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1948 		goto e_inval;
1949 
1950 	if (ipv4_is_zeronet(saddr)) {
1951 		if (!ipv4_is_local_multicast(daddr))
1952 			goto e_inval;
1953 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1954 	} else {
1955 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1956 					  &itag);
1957 		if (err < 0)
1958 			goto e_err;
1959 	}
1960 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1961 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1962 	if (!rth)
1963 		goto e_nobufs;
1964 
1965 #ifdef CONFIG_IP_ROUTE_CLASSID
1966 	rth->dst.tclassid = itag;
1967 #endif
1968 	rth->dst.output = ip_rt_bug;
1969 
1970 	rth->rt_key_dst	= daddr;
1971 	rth->rt_key_src	= saddr;
1972 	rth->rt_genid	= rt_genid(dev_net(dev));
1973 	rth->rt_flags	= RTCF_MULTICAST;
1974 	rth->rt_type	= RTN_MULTICAST;
1975 	rth->rt_key_tos	= tos;
1976 	rth->rt_dst	= daddr;
1977 	rth->rt_src	= saddr;
1978 	rth->rt_route_iif = dev->ifindex;
1979 	rth->rt_iif	= dev->ifindex;
1980 	rth->rt_oif	= 0;
1981 	rth->rt_mark    = skb->mark;
1982 	rth->rt_gateway	= daddr;
1983 	rth->rt_spec_dst= spec_dst;
1984 	rth->rt_peer_genid = 0;
1985 	rth->peer = NULL;
1986 	rth->fi = NULL;
1987 	if (our) {
1988 		rth->dst.input= ip_local_deliver;
1989 		rth->rt_flags |= RTCF_LOCAL;
1990 	}
1991 
1992 #ifdef CONFIG_IP_MROUTE
1993 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1994 		rth->dst.input = ip_mr_input;
1995 #endif
1996 	RT_CACHE_STAT_INC(in_slow_mc);
1997 
1998 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1999 	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2000 	return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2001 
2002 e_nobufs:
2003 	return -ENOBUFS;
2004 e_inval:
2005 	return -EINVAL;
2006 e_err:
2007 	return err;
2008 }
2009 
2010 
2011 static void ip_handle_martian_source(struct net_device *dev,
2012 				     struct in_device *in_dev,
2013 				     struct sk_buff *skb,
2014 				     __be32 daddr,
2015 				     __be32 saddr)
2016 {
2017 	RT_CACHE_STAT_INC(in_martian_src);
2018 #ifdef CONFIG_IP_ROUTE_VERBOSE
2019 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2020 		/*
2021 		 *	RFC 1812 recommendation: if the source is martian,
2022 		 *	the only hint we can give is the MAC header.
2023 		 */
2024 		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2025 			&daddr, &saddr, dev->name);
2026 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2027 			int i;
2028 			const unsigned char *p = skb_mac_header(skb);
2029 			printk(KERN_WARNING "ll header: ");
2030 			for (i = 0; i < dev->hard_header_len; i++, p++) {
2031 				printk("%02x", *p);
2032 				if (i < (dev->hard_header_len - 1))
2033 					printk(":");
2034 			}
2035 			printk("\n");
2036 		}
2037 	}
2038 #endif
2039 }
2040 
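/*
 * With log_martians enabled the block above emits lines of the form
 * below (addresses, device and MAC bytes are illustrative):
 *
 *	martian source 192.168.1.255 from 127.0.0.1, on dev eth0
 *	ll header: ff:ff:ff:ff:ff:ff:00:16:3e:12:34:56:08:00
 *
 * where the ll header dump shows the dev->hard_header_len bytes of the
 * link-level header; for Ethernet: destination MAC, source MAC and
 * ethertype.
 */
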
2041 /* called in rcu_read_lock() section */
2042 static int __mkroute_input(struct sk_buff *skb,
2043 			   const struct fib_result *res,
2044 			   struct in_device *in_dev,
2045 			   __be32 daddr, __be32 saddr, u32 tos,
2046 			   struct rtable **result)
2047 {
2048 	struct rtable *rth;
2049 	int err;
2050 	struct in_device *out_dev;
2051 	unsigned int flags = 0;
2052 	__be32 spec_dst;
2053 	u32 itag;
2054 
2055 	/* get a working reference to the output device */
2056 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2057 	if (out_dev == NULL) {
2058 		if (net_ratelimit())
2059 			printk(KERN_CRIT "Bug in ip_route_input"
2060 			       "_slow(). Please report.\n");
2061 		return -EINVAL;
2062 	}
2063 
2064 
2065 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2066 				  in_dev->dev, &spec_dst, &itag);
2067 	if (err < 0) {
2068 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2069 					 saddr);
2070 
2071 		goto cleanup;
2072 	}
2073 
2074 	if (err)
2075 		flags |= RTCF_DIRECTSRC;
2076 
2077 	if (out_dev == in_dev && err &&
2078 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
2079 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2080 		flags |= RTCF_DOREDIRECT;
2081 
2082 	if (skb->protocol != htons(ETH_P_IP)) {
2083 		/* Not IP (i.e. ARP). Do not create a route if it is
2084 		 * invalid for proxy arp. DNAT routes are always valid.
2085 		 *
2086 		 * The proxy arp feature has been extended to allow ARP
2087 		 * replies back on the same interface, to support
2088 		 * Private VLAN switch technologies. See arp.c.
2089 		 */
2090 		if (out_dev == in_dev &&
2091 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2092 			err = -EINVAL;
2093 			goto cleanup;
2094 		}
2095 	}
2096 
2097 	rth = rt_dst_alloc(out_dev->dev,
2098 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2099 			   IN_DEV_CONF_GET(out_dev, NOXFRM));
2100 	if (!rth) {
2101 		err = -ENOBUFS;
2102 		goto cleanup;
2103 	}
2104 
2105 	rth->rt_key_dst	= daddr;
2106 	rth->rt_key_src	= saddr;
2107 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2108 	rth->rt_flags = flags;
2109 	rth->rt_type = res->type;
2110 	rth->rt_key_tos	= tos;
2111 	rth->rt_dst	= daddr;
2112 	rth->rt_src	= saddr;
2113 	rth->rt_route_iif = in_dev->dev->ifindex;
2114 	rth->rt_iif 	= in_dev->dev->ifindex;
2115 	rth->rt_oif 	= 0;
2116 	rth->rt_mark    = skb->mark;
2117 	rth->rt_gateway	= daddr;
2118 	rth->rt_spec_dst= spec_dst;
2119 	rth->rt_peer_genid = 0;
2120 	rth->peer = NULL;
2121 	rth->fi = NULL;
2122 
2123 	rth->dst.input = ip_forward;
2124 	rth->dst.output = ip_output;
2125 
2126 	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2127 
2128 	*result = rth;
2129 	err = 0;
2130  cleanup:
2131 	return err;
2132 }
2133 
2134 static int ip_mkroute_input(struct sk_buff *skb,
2135 			    struct fib_result *res,
2136 			    const struct flowi4 *fl4,
2137 			    struct in_device *in_dev,
2138 			    __be32 daddr, __be32 saddr, u32 tos)
2139 {
2140 	struct rtable* rth = NULL;
2141 	int err;
2142 	unsigned hash;
2143 
2144 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2145 	if (res->fi && res->fi->fib_nhs > 1)
2146 		fib_select_multipath(res);
2147 #endif
2148 
2149 	/* create a routing cache entry */
2150 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2151 	if (err)
2152 		return err;
2153 
2154 	/* put it into the cache */
2155 	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2156 		       rt_genid(dev_net(rth->dst.dev)));
2157 	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2158 	if (IS_ERR(rth))
2159 		return PTR_ERR(rth);
2160 	return 0;
2161 }
2162 
2163 /*
2164  *	NOTE. We drop all packets that have a local source
2165  *	address, because every properly looped back packet
2166  *	must already have the correct destination attached by the output routine.
2167  *
2168  *	This approach solves two big problems:
2169  *	1. Non-simplex devices are handled properly.
2170  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2171  *	Called with rcu_read_lock().
2172  */
2173 
2174 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2175 			       u8 tos, struct net_device *dev)
2176 {
2177 	struct fib_result res;
2178 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2179 	struct flowi4	fl4;
2180 	unsigned	flags = 0;
2181 	u32		itag = 0;
2182 	struct rtable * rth;
2183 	unsigned	hash;
2184 	__be32		spec_dst;
2185 	int		err = -EINVAL;
2186 	struct net    * net = dev_net(dev);
2187 
2188 	/* IP on this device is disabled. */
2189 
2190 	if (!in_dev)
2191 		goto out;
2192 
2193 	/* Check for the weirdest martians, which cannot be detected
2194 	   by fib_lookup.
2195 	 */
2196 
2197 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2198 	    ipv4_is_loopback(saddr))
2199 		goto martian_source;
2200 
2201 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2202 		goto brd_input;
2203 
2204 	/* Accept zero addresses only for limited broadcast;
2205 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2206 	 */
2207 	if (ipv4_is_zeronet(saddr))
2208 		goto martian_source;
2209 
2210 	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2211 		goto martian_destination;
2212 
2213 	/*
2214 	 *	Now we are ready to route the packet.
2215 	 */
2216 	fl4.flowi4_oif = 0;
2217 	fl4.flowi4_iif = dev->ifindex;
2218 	fl4.flowi4_mark = skb->mark;
2219 	fl4.flowi4_tos = tos;
2220 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2221 	fl4.daddr = daddr;
2222 	fl4.saddr = saddr;
2223 	err = fib_lookup(net, &fl4, &res);
2224 	if (err != 0) {
2225 		if (!IN_DEV_FORWARD(in_dev))
2226 			goto e_hostunreach;
2227 		goto no_route;
2228 	}
2229 
2230 	RT_CACHE_STAT_INC(in_slow_tot);
2231 
2232 	if (res.type == RTN_BROADCAST)
2233 		goto brd_input;
2234 
2235 	if (res.type == RTN_LOCAL) {
2236 		err = fib_validate_source(skb, saddr, daddr, tos,
2237 					  net->loopback_dev->ifindex,
2238 					  dev, &spec_dst, &itag);
2239 		if (err < 0)
2240 			goto martian_source_keep_err;
2241 		if (err)
2242 			flags |= RTCF_DIRECTSRC;
2243 		spec_dst = daddr;
2244 		goto local_input;
2245 	}
2246 
2247 	if (!IN_DEV_FORWARD(in_dev))
2248 		goto e_hostunreach;
2249 	if (res.type != RTN_UNICAST)
2250 		goto martian_destination;
2251 
2252 	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2253 out:	return err;
2254 
2255 brd_input:
2256 	if (skb->protocol != htons(ETH_P_IP))
2257 		goto e_inval;
2258 
2259 	if (ipv4_is_zeronet(saddr))
2260 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2261 	else {
2262 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2263 					  &itag);
2264 		if (err < 0)
2265 			goto martian_source_keep_err;
2266 		if (err)
2267 			flags |= RTCF_DIRECTSRC;
2268 	}
2269 	flags |= RTCF_BROADCAST;
2270 	res.type = RTN_BROADCAST;
2271 	RT_CACHE_STAT_INC(in_brd);
2272 
2273 local_input:
2274 	rth = rt_dst_alloc(net->loopback_dev,
2275 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2276 	if (!rth)
2277 		goto e_nobufs;
2278 
2279 	rth->dst.input= ip_local_deliver;
2280 	rth->dst.output= ip_rt_bug;
2281 #ifdef CONFIG_IP_ROUTE_CLASSID
2282 	rth->dst.tclassid = itag;
2283 #endif
2284 
2285 	rth->rt_key_dst	= daddr;
2286 	rth->rt_key_src	= saddr;
2287 	rth->rt_genid = rt_genid(net);
2288 	rth->rt_flags 	= flags|RTCF_LOCAL;
2289 	rth->rt_type	= res.type;
2290 	rth->rt_key_tos	= tos;
2291 	rth->rt_dst	= daddr;
2292 	rth->rt_src	= saddr;
2296 	rth->rt_route_iif = dev->ifindex;
2297 	rth->rt_iif	= dev->ifindex;
2298 	rth->rt_oif	= 0;
2299 	rth->rt_mark    = skb->mark;
2300 	rth->rt_gateway	= daddr;
2301 	rth->rt_spec_dst= spec_dst;
2302 	rth->rt_peer_genid = 0;
2303 	rth->peer = NULL;
2304 	rth->fi = NULL;
2305 	if (res.type == RTN_UNREACHABLE) {
2306 		rth->dst.input= ip_error;
2307 		rth->dst.error= -err;
2308 		rth->rt_flags 	&= ~RTCF_LOCAL;
2309 	}
2310 	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2311 	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2312 	err = 0;
2313 	if (IS_ERR(rth))
2314 		err = PTR_ERR(rth);
2315 	goto out;
2316 
2317 no_route:
2318 	RT_CACHE_STAT_INC(in_no_route);
2319 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2320 	res.type = RTN_UNREACHABLE;
2321 	if (err == -ESRCH)
2322 		err = -ENETUNREACH;
2323 	goto local_input;
2324 
2325 	/*
2326 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2327 	 */
2328 martian_destination:
2329 	RT_CACHE_STAT_INC(in_martian_dst);
2330 #ifdef CONFIG_IP_ROUTE_VERBOSE
2331 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2332 		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2333 			&daddr, &saddr, dev->name);
2334 #endif
2335 
2336 e_hostunreach:
2337 	err = -EHOSTUNREACH;
2338 	goto out;
2339 
2340 e_inval:
2341 	err = -EINVAL;
2342 	goto out;
2343 
2344 e_nobufs:
2345 	err = -ENOBUFS;
2346 	goto out;
2347 
2348 martian_source:
2349 	err = -EINVAL;
2350 martian_source_keep_err:
2351 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2352 	goto out;
2353 }
2354 
2355 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2356 			   u8 tos, struct net_device *dev, bool noref)
2357 {
2358 	struct rtable * rth;
2359 	unsigned	hash;
2360 	int iif = dev->ifindex;
2361 	struct net *net;
2362 	int res;
2363 
2364 	net = dev_net(dev);
2365 
2366 	rcu_read_lock();
2367 
2368 	if (!rt_caching(net))
2369 		goto skip_cache;
2370 
2371 	tos &= IPTOS_RT_MASK;
2372 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2373 
2374 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2375 	     rth = rcu_dereference(rth->dst.rt_next)) {
2376 		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2377 		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2378 		     (rth->rt_route_iif ^ iif) |
2379 		     (rth->rt_key_tos ^ tos)) == 0 &&
2380 		    rth->rt_mark == skb->mark &&
2381 		    net_eq(dev_net(rth->dst.dev), net) &&
2382 		    !rt_is_expired(rth)) {
2383 			rth = ipv4_validate_peer(rth);
2384 			if (!rth)
2385 				continue;
2386 			if (noref) {
2387 				dst_use_noref(&rth->dst, jiffies);
2388 				skb_dst_set_noref(skb, &rth->dst);
2389 			} else {
2390 				dst_use(&rth->dst, jiffies);
2391 				skb_dst_set(skb, &rth->dst);
2392 			}
2393 			RT_CACHE_STAT_INC(in_hit);
2394 			rcu_read_unlock();
2395 			return 0;
2396 		}
2397 		RT_CACHE_STAT_INC(in_hlist_search);
2398 	}
2399 
2400 skip_cache:
2401 	/* Multicast recognition logic was moved from the route cache to here.
2402 	   The problem was that too many Ethernet cards have broken/missing
2403 	   hardware multicast filters :-( As a result, a host on a multicast
2404 	   network acquires a lot of useless route cache entries, e.g. for
2405 	   SDR messages from all over the world. Now we try to get rid of them.
2406 	   Really, provided the software IP multicast filter is organized
2407 	   reasonably (at least, hashed), this does not result in a slowdown
2408 	   compared with route cache reject entries.
2409 	   Note that multicast routers are not affected, because a
2410 	   route cache entry is created eventually.
2411 	 */
2412 	if (ipv4_is_multicast(daddr)) {
2413 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2414 
2415 		if (in_dev) {
2416 			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2417 						  ip_hdr(skb)->protocol);
2418 			if (our
2419 #ifdef CONFIG_IP_MROUTE
2420 				||
2421 			    (!ipv4_is_local_multicast(daddr) &&
2422 			     IN_DEV_MFORWARD(in_dev))
2423 #endif
2424 			   ) {
2425 				int res = ip_route_input_mc(skb, daddr, saddr,
2426 							    tos, dev, our);
2427 				rcu_read_unlock();
2428 				return res;
2429 			}
2430 		}
2431 		rcu_read_unlock();
2432 		return -EINVAL;
2433 	}
2434 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2435 	rcu_read_unlock();
2436 	return res;
2437 }
2438 EXPORT_SYMBOL(ip_route_input_common);
2439 
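/*
 * Typical call site (illustrative): the receive path resolves the input
 * route once per packet, e.g. as ip_rcv_finish() does:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *
 * ip_route_input() and ip_route_input_noref() are thin inline wrappers
 * that call ip_route_input_common() with noref false or true.
 */
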
2440 /* called with rcu_read_lock() */
2441 static struct rtable *__mkroute_output(const struct fib_result *res,
2442 				       const struct flowi4 *fl4,
2443 				       __be32 orig_daddr, __be32 orig_saddr,
2444 				       int orig_oif, struct net_device *dev_out,
2445 				       unsigned int flags)
2446 {
2447 	struct fib_info *fi = res->fi;
2448 	u32 tos = RT_FL_TOS(fl4);
2449 	struct in_device *in_dev;
2450 	u16 type = res->type;
2451 	struct rtable *rth;
2452 
2453 	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2454 		return ERR_PTR(-EINVAL);
2455 
2456 	if (ipv4_is_lbcast(fl4->daddr))
2457 		type = RTN_BROADCAST;
2458 	else if (ipv4_is_multicast(fl4->daddr))
2459 		type = RTN_MULTICAST;
2460 	else if (ipv4_is_zeronet(fl4->daddr))
2461 		return ERR_PTR(-EINVAL);
2462 
2463 	if (dev_out->flags & IFF_LOOPBACK)
2464 		flags |= RTCF_LOCAL;
2465 
2466 	in_dev = __in_dev_get_rcu(dev_out);
2467 	if (!in_dev)
2468 		return ERR_PTR(-EINVAL);
2469 
2470 	if (type == RTN_BROADCAST) {
2471 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2472 		fi = NULL;
2473 	} else if (type == RTN_MULTICAST) {
2474 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2475 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2476 				     fl4->flowi4_proto))
2477 			flags &= ~RTCF_LOCAL;
2478 		/* If a multicast route does not exist, use the
2479 		 * default one, but do not gateway in this case.
2480 		 * Yes, it is a hack.
2481 		 */
2482 		if (fi && res->prefixlen < 4)
2483 			fi = NULL;
2484 	}
2485 
2486 	rth = rt_dst_alloc(dev_out,
2487 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2488 			   IN_DEV_CONF_GET(in_dev, NOXFRM));
2489 	if (!rth)
2490 		return ERR_PTR(-ENOBUFS);
2491 
2492 	rth->dst.output = ip_output;
2493 
2494 	rth->rt_key_dst	= orig_daddr;
2495 	rth->rt_key_src	= orig_saddr;
2496 	rth->rt_genid = rt_genid(dev_net(dev_out));
2497 	rth->rt_flags	= flags;
2498 	rth->rt_type	= type;
2499 	rth->rt_key_tos	= tos;
2500 	rth->rt_dst	= fl4->daddr;
2501 	rth->rt_src	= fl4->saddr;
2502 	rth->rt_route_iif = 0;
2503 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
2504 	rth->rt_oif	= orig_oif;
2505 	rth->rt_mark    = fl4->flowi4_mark;
2506 	rth->rt_gateway = fl4->daddr;
2507 	rth->rt_spec_dst= fl4->saddr;
2508 	rth->rt_peer_genid = 0;
2509 	rth->peer = NULL;
2510 	rth->fi = NULL;
2511 
2512 	RT_CACHE_STAT_INC(out_slow_tot);
2513 
2514 	if (flags & RTCF_LOCAL) {
2515 		rth->dst.input = ip_local_deliver;
2516 		rth->rt_spec_dst = fl4->daddr;
2517 	}
2518 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2519 		rth->rt_spec_dst = fl4->saddr;
2520 		if (flags & RTCF_LOCAL &&
2521 		    !(dev_out->flags & IFF_LOOPBACK)) {
2522 			rth->dst.output = ip_mc_output;
2523 			RT_CACHE_STAT_INC(out_slow_mc);
2524 		}
2525 #ifdef CONFIG_IP_MROUTE
2526 		if (type == RTN_MULTICAST) {
2527 			if (IN_DEV_MFORWARD(in_dev) &&
2528 			    !ipv4_is_local_multicast(fl4->daddr)) {
2529 				rth->dst.input = ip_mr_input;
2530 				rth->dst.output = ip_mc_output;
2531 			}
2532 		}
2533 #endif
2534 	}
2535 
2536 	rt_set_nexthop(rth, fl4, res, fi, type, 0);
2537 
2538 	return rth;
2539 }
2540 
2541 /*
2542  * Major route resolver routine.
2543  * Called with rcu_read_lock().
2544  */
2545 
2546 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2547 {
2548 	struct net_device *dev_out = NULL;
2549 	u32 tos	= RT_FL_TOS(fl4);
2550 	unsigned int flags = 0;
2551 	struct fib_result res;
2552 	struct rtable *rth;
2553 	__be32 orig_daddr;
2554 	__be32 orig_saddr;
2555 	int orig_oif;
2556 
2557 	res.fi		= NULL;
2558 #ifdef CONFIG_IP_MULTIPLE_TABLES
2559 	res.r		= NULL;
2560 #endif
2561 
2562 	orig_daddr = fl4->daddr;
2563 	orig_saddr = fl4->saddr;
2564 	orig_oif = fl4->flowi4_oif;
2565 
2566 	fl4->flowi4_iif = net->loopback_dev->ifindex;
2567 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2568 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2569 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2570 
2571 	rcu_read_lock();
2572 	if (fl4->saddr) {
2573 		rth = ERR_PTR(-EINVAL);
2574 		if (ipv4_is_multicast(fl4->saddr) ||
2575 		    ipv4_is_lbcast(fl4->saddr) ||
2576 		    ipv4_is_zeronet(fl4->saddr))
2577 			goto out;
2578 
2579 		/* I removed the check for oif == dev_out->oif here.
2580 		   It was wrong for two reasons:
2581 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2582 		      is assigned to multiple interfaces.
2583 		   2. Moreover, we are allowed to send packets with the saddr
2584 		      of another iface. --ANK
2585 		 */
2586 
2587 		if (fl4->flowi4_oif == 0 &&
2588 		    (ipv4_is_multicast(fl4->daddr) ||
2589 		     ipv4_is_lbcast(fl4->daddr))) {
2590 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2591 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2592 			if (dev_out == NULL)
2593 				goto out;
2594 
2595 			/* Special hack: the user can direct multicasts
2596 			   and limited broadcast via the necessary interface
2597 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2598 			   This hack is not just for fun, it allows
2599 			   vic, vat and friends to work.
2600 			   They bind the socket to loopback, set ttl to zero
2601 			   and expect that it will work.
2602 			   From the viewpoint of the routing cache they are broken,
2603 			   because we are not allowed to build a multicast path
2604 			   with a loopback source addr (look, the routing cache
2605 			   cannot know that ttl is zero, so the packet
2606 			   will not leave this host and the route is valid).
2607 			   Luckily, this hack is a good workaround.
2608 			 */
2609 
2610 			fl4->flowi4_oif = dev_out->ifindex;
2611 			goto make_route;
2612 		}
2613 
2614 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2615 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2616 			if (!__ip_dev_find(net, fl4->saddr, false))
2617 				goto out;
2618 		}
2619 	}
2620 
2621 
2622 	if (fl4->flowi4_oif) {
2623 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2624 		rth = ERR_PTR(-ENODEV);
2625 		if (dev_out == NULL)
2626 			goto out;
2627 
2628 		/* RACE: Check return value of inet_select_addr instead. */
2629 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2630 			rth = ERR_PTR(-ENETUNREACH);
2631 			goto out;
2632 		}
2633 		if (ipv4_is_local_multicast(fl4->daddr) ||
2634 		    ipv4_is_lbcast(fl4->daddr)) {
2635 			if (!fl4->saddr)
2636 				fl4->saddr = inet_select_addr(dev_out, 0,
2637 							      RT_SCOPE_LINK);
2638 			goto make_route;
2639 		}
2640 		if (fl4->saddr) {
2641 			if (ipv4_is_multicast(fl4->daddr))
2642 				fl4->saddr = inet_select_addr(dev_out, 0,
2643 							      fl4->flowi4_scope);
2644 			else if (!fl4->daddr)
2645 				fl4->saddr = inet_select_addr(dev_out, 0,
2646 							      RT_SCOPE_HOST);
2647 		}
2648 	}
2649 
2650 	if (!fl4->daddr) {
2651 		fl4->daddr = fl4->saddr;
2652 		if (!fl4->daddr)
2653 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2654 		dev_out = net->loopback_dev;
2655 		fl4->flowi4_oif = net->loopback_dev->ifindex;
2656 		res.type = RTN_LOCAL;
2657 		flags |= RTCF_LOCAL;
2658 		goto make_route;
2659 	}
2660 
2661 	if (fib_lookup(net, fl4, &res)) {
2662 		res.fi = NULL;
2663 		if (fl4->flowi4_oif) {
2664 			/* Apparently, the routing tables are wrong. Assume
2665 			   that the destination is on link.
2666 
2667 			   WHY? DW.
2668 			   Because we are allowed to send to an iface
2669 			   even if it has NO routes and NO assigned
2670 			   addresses. When oif is specified, the routing
2671 			   tables are looked up with only one purpose:
2672 			   to catch whether the destination is gatewayed rather
2673 			   than direct. Moreover, if MSG_DONTROUTE is set,
2674 			   we send the packet, ignoring both the routing tables
2675 			   and the ifaddr state. --ANK
2676 
2677 
2678 			   We could do the same even if oif is unknown,
2679 			   like IPv6 does, but we do not.
2680 			 */
2681 
2682 			if (fl4->saddr == 0)
2683 				fl4->saddr = inet_select_addr(dev_out, 0,
2684 							      RT_SCOPE_LINK);
2685 			res.type = RTN_UNICAST;
2686 			goto make_route;
2687 		}
2688 		rth = ERR_PTR(-ENETUNREACH);
2689 		goto out;
2690 	}
2691 
2692 	if (res.type == RTN_LOCAL) {
2693 		if (!fl4->saddr) {
2694 			if (res.fi->fib_prefsrc)
2695 				fl4->saddr = res.fi->fib_prefsrc;
2696 			else
2697 				fl4->saddr = fl4->daddr;
2698 		}
2699 		dev_out = net->loopback_dev;
2700 		fl4->flowi4_oif = dev_out->ifindex;
2701 		res.fi = NULL;
2702 		flags |= RTCF_LOCAL;
2703 		goto make_route;
2704 	}
2705 
2706 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2707 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2708 		fib_select_multipath(&res);
2709 	else
2710 #endif
2711 	if (!res.prefixlen &&
2712 	    res.table->tb_num_default > 1 &&
2713 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2714 		fib_select_default(&res);
2715 
2716 	if (!fl4->saddr)
2717 		fl4->saddr = FIB_RES_PREFSRC(net, res);
2718 
2719 	dev_out = FIB_RES_DEV(res);
2720 	fl4->flowi4_oif = dev_out->ifindex;
2721 
2722 
2723 make_route:
2724 	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2725 			       dev_out, flags);
2726 	if (!IS_ERR(rth)) {
2727 		unsigned int hash;
2728 
2729 		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2730 			       rt_genid(dev_net(dev_out)));
2731 		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2732 	}
2733 
2734 out:
2735 	rcu_read_unlock();
2736 	return rth;
2737 }
2738 
2739 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2740 {
2741 	struct rtable *rth;
2742 	unsigned int hash;
2743 
2744 	if (!rt_caching(net))
2745 		goto slow_output;
2746 
2747 	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2748 
2749 	rcu_read_lock_bh();
2750 	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2751 		rth = rcu_dereference_bh(rth->dst.rt_next)) {
2752 		if (rth->rt_key_dst == flp4->daddr &&
2753 		    rth->rt_key_src == flp4->saddr &&
2754 		    rt_is_output_route(rth) &&
2755 		    rth->rt_oif == flp4->flowi4_oif &&
2756 		    rth->rt_mark == flp4->flowi4_mark &&
2757 		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2758 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2759 		    net_eq(dev_net(rth->dst.dev), net) &&
2760 		    !rt_is_expired(rth)) {
2761 			rth = ipv4_validate_peer(rth);
2762 			if (!rth)
2763 				continue;
2764 			dst_use(&rth->dst, jiffies);
2765 			RT_CACHE_STAT_INC(out_hit);
2766 			rcu_read_unlock_bh();
2767 			if (!flp4->saddr)
2768 				flp4->saddr = rth->rt_src;
2769 			if (!flp4->daddr)
2770 				flp4->daddr = rth->rt_dst;
2771 			return rth;
2772 		}
2773 		RT_CACHE_STAT_INC(out_hlist_search);
2774 	}
2775 	rcu_read_unlock_bh();
2776 
2777 slow_output:
2778 	return ip_route_output_slow(net, flp4);
2779 }
2780 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2781 
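/*
 * Typical lookup (illustrative): build a flow key and let the cache or
 * ip_route_output_slow() resolve it; the caller owns the returned
 * reference:
 *
 *	struct flowi4 fl4 = {
 *		.daddr = daddr,
 *		.saddr = saddr,
 *		.flowi4_tos = RT_TOS(tos),
 *		.flowi4_oif = oif,
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 *
 * inet_rtm_getroute() below does exactly this for RTM_GETROUTE requests
 * that carry no input interface.
 */
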
2782 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2783 {
2784 	return NULL;
2785 }
2786 
2787 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2788 {
2789 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2790 
2791 	return mtu ? : dst->dev->mtu;
2792 }
2793 
2794 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2795 {
2796 }
2797 
2798 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2799 					  unsigned long old)
2800 {
2801 	return NULL;
2802 }
2803 
2804 static struct dst_ops ipv4_dst_blackhole_ops = {
2805 	.family			=	AF_INET,
2806 	.protocol		=	cpu_to_be16(ETH_P_IP),
2807 	.destroy		=	ipv4_dst_destroy,
2808 	.check			=	ipv4_blackhole_dst_check,
2809 	.mtu			=	ipv4_blackhole_mtu,
2810 	.default_advmss		=	ipv4_default_advmss,
2811 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2812 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2813 	.neigh_lookup		=	ipv4_neigh_lookup,
2814 };
2815 
2816 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2817 {
2818 	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2819 	struct rtable *ort = (struct rtable *) dst_orig;
2820 
2821 	if (rt) {
2822 		struct dst_entry *new = &rt->dst;
2823 
2824 		new->__use = 1;
2825 		new->input = dst_discard;
2826 		new->output = dst_discard;
2827 		dst_copy_metrics(new, &ort->dst);
2828 
2829 		new->dev = ort->dst.dev;
2830 		if (new->dev)
2831 			dev_hold(new->dev);
2832 
2833 		rt->rt_key_dst = ort->rt_key_dst;
2834 		rt->rt_key_src = ort->rt_key_src;
2835 		rt->rt_key_tos = ort->rt_key_tos;
2836 		rt->rt_route_iif = ort->rt_route_iif;
2837 		rt->rt_iif = ort->rt_iif;
2838 		rt->rt_oif = ort->rt_oif;
2839 		rt->rt_mark = ort->rt_mark;
2840 
2841 		rt->rt_genid = rt_genid(net);
2842 		rt->rt_flags = ort->rt_flags;
2843 		rt->rt_type = ort->rt_type;
2844 		rt->rt_dst = ort->rt_dst;
2845 		rt->rt_src = ort->rt_src;
2846 		rt->rt_gateway = ort->rt_gateway;
2847 		rt->rt_spec_dst = ort->rt_spec_dst;
2848 		rt->peer = ort->peer;
2849 		if (rt->peer)
2850 			atomic_inc(&rt->peer->refcnt);
2851 		rt->fi = ort->fi;
2852 		if (rt->fi)
2853 			atomic_inc(&rt->fi->fib_clntref);
2854 
2855 		dst_free(new);
2856 	}
2857 
2858 	dst_release(dst_orig);
2859 
2860 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2861 }
2862 
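/*
 * The blackhole dst built above silently discards traffic: input and
 * output are dst_discard, ->check always fails and metrics are never
 * COWed.  xfrm uses it to stand in for a route whose IPsec states are
 * still being negotiated, so packets are dropped rather than sent in
 * the clear.
 */
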
2863 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2864 				    struct sock *sk)
2865 {
2866 	struct rtable *rt = __ip_route_output_key(net, flp4);
2867 
2868 	if (IS_ERR(rt))
2869 		return rt;
2870 
2871 	if (flp4->flowi4_proto)
2872 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2873 						   flowi4_to_flowi(flp4),
2874 						   sk, 0);
2875 
2876 	return rt;
2877 }
2878 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2879 
2880 static int rt_fill_info(struct net *net,
2881 			struct sk_buff *skb, u32 pid, u32 seq, int event,
2882 			int nowait, unsigned int flags)
2883 {
2884 	struct rtable *rt = skb_rtable(skb);
2885 	struct rtmsg *r;
2886 	struct nlmsghdr *nlh;
2887 	unsigned long expires = 0;
2888 	const struct inet_peer *peer = rt->peer;
2889 	u32 id = 0, ts = 0, tsage = 0, error;
2890 
2891 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2892 	if (nlh == NULL)
2893 		return -EMSGSIZE;
2894 
2895 	r = nlmsg_data(nlh);
2896 	r->rtm_family	 = AF_INET;
2897 	r->rtm_dst_len	= 32;
2898 	r->rtm_src_len	= 0;
2899 	r->rtm_tos	= rt->rt_key_tos;
2900 	r->rtm_table	= RT_TABLE_MAIN;
2901 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2902 	r->rtm_type	= rt->rt_type;
2903 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2904 	r->rtm_protocol = RTPROT_UNSPEC;
2905 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2906 	if (rt->rt_flags & RTCF_NOTIFY)
2907 		r->rtm_flags |= RTM_F_NOTIFY;
2908 
2909 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2910 
2911 	if (rt->rt_key_src) {
2912 		r->rtm_src_len = 32;
2913 		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2914 	}
2915 	if (rt->dst.dev)
2916 		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2917 #ifdef CONFIG_IP_ROUTE_CLASSID
2918 	if (rt->dst.tclassid)
2919 		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2920 #endif
2921 	if (rt_is_input_route(rt))
2922 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2923 	else if (rt->rt_src != rt->rt_key_src)
2924 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2925 
2926 	if (rt->rt_dst != rt->rt_gateway)
2927 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2928 
2929 	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2930 		goto nla_put_failure;
2931 
2932 	if (rt->rt_mark)
2933 		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2934 
2935 	error = rt->dst.error;
2936 	if (peer) {
2937 		inet_peer_refcheck(rt->peer);
2938 		id = atomic_read(&peer->ip_id_count) & 0xffff;
2939 		if (peer->tcp_ts_stamp) {
2940 			ts = peer->tcp_ts;
2941 			tsage = get_seconds() - peer->tcp_ts_stamp;
2942 		}
2943 		expires = ACCESS_ONCE(peer->pmtu_expires);
2944 		if (expires) {
2945 			if (time_before(jiffies, expires))
2946 				expires -= jiffies;
2947 			else
2948 				expires = 0;
2949 		}
2950 	}
2951 
2952 	if (rt_is_input_route(rt)) {
2953 #ifdef CONFIG_IP_MROUTE
2954 		__be32 dst = rt->rt_dst;
2955 
2956 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2957 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2958 			int err = ipmr_get_route(net, skb,
2959 						 rt->rt_src, rt->rt_dst,
2960 						 r, nowait);
2961 			if (err <= 0) {
2962 				if (!nowait) {
2963 					if (err == 0)
2964 						return 0;
2965 					goto nla_put_failure;
2966 				} else {
2967 					if (err == -EMSGSIZE)
2968 						goto nla_put_failure;
2969 					error = err;
2970 				}
2971 			}
2972 		} else
2973 #endif
2974 			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2975 	}
2976 
2977 	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2978 			       expires, error) < 0)
2979 		goto nla_put_failure;
2980 
2981 	return nlmsg_end(skb, nlh);
2982 
2983 nla_put_failure:
2984 	nlmsg_cancel(skb, nlh);
2985 	return -EMSGSIZE;
2986 }
2987 
2988 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2989 {
2990 	struct net *net = sock_net(in_skb->sk);
2991 	struct rtmsg *rtm;
2992 	struct nlattr *tb[RTA_MAX+1];
2993 	struct rtable *rt = NULL;
2994 	__be32 dst = 0;
2995 	__be32 src = 0;
2996 	u32 iif;
2997 	int err;
2998 	int mark;
2999 	struct sk_buff *skb;
3000 
3001 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3002 	if (err < 0)
3003 		goto errout;
3004 
3005 	rtm = nlmsg_data(nlh);
3006 
3007 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3008 	if (skb == NULL) {
3009 		err = -ENOBUFS;
3010 		goto errout;
3011 	}
3012 
3013 	/* Reserve room for dummy headers; this skb can pass
3014 	   through a good chunk of the routing engine.
3015 	 */
3016 	skb_reset_mac_header(skb);
3017 	skb_reset_network_header(skb);
3018 
3019 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3020 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
3021 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3022 
3023 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3024 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3025 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3026 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3027 
3028 	if (iif) {
3029 		struct net_device *dev;
3030 
3031 		dev = __dev_get_by_index(net, iif);
3032 		if (dev == NULL) {
3033 			err = -ENODEV;
3034 			goto errout_free;
3035 		}
3036 
3037 		skb->protocol	= htons(ETH_P_IP);
3038 		skb->dev	= dev;
3039 		skb->mark	= mark;
3040 		local_bh_disable();
3041 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3042 		local_bh_enable();
3043 
3044 		rt = skb_rtable(skb);
3045 		if (err == 0 && rt->dst.error)
3046 			err = -rt->dst.error;
3047 	} else {
3048 		struct flowi4 fl4 = {
3049 			.daddr = dst,
3050 			.saddr = src,
3051 			.flowi4_tos = rtm->rtm_tos,
3052 			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3053 			.flowi4_mark = mark,
3054 		};
3055 		rt = ip_route_output_key(net, &fl4);
3056 
3057 		err = 0;
3058 		if (IS_ERR(rt))
3059 			err = PTR_ERR(rt);
3060 	}
3061 
3062 	if (err)
3063 		goto errout_free;
3064 
3065 	skb_dst_set(skb, &rt->dst);
3066 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3067 		rt->rt_flags |= RTCF_NOTIFY;
3068 
3069 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3070 			   RTM_NEWROUTE, 0, 0);
3071 	if (err <= 0)
3072 		goto errout_free;
3073 
3074 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3075 errout:
3076 	return err;
3077 
3078 errout_free:
3079 	kfree_skb(skb);
3080 	goto errout;
3081 }
3082 
3083 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3084 {
3085 	struct rtable *rt;
3086 	int h, s_h;
3087 	int idx, s_idx;
3088 	struct net *net;
3089 
3090 	net = sock_net(skb->sk);
3091 
3092 	s_h = cb->args[0];
3093 	if (s_h < 0)
3094 		s_h = 0;
3095 	s_idx = idx = cb->args[1];
3096 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3097 		if (!rt_hash_table[h].chain)
3098 			continue;
3099 		rcu_read_lock_bh();
3100 		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3101 		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3102 			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3103 				continue;
3104 			if (rt_is_expired(rt))
3105 				continue;
3106 			skb_dst_set_noref(skb, &rt->dst);
3107 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3108 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3109 					 1, NLM_F_MULTI) <= 0) {
3110 				skb_dst_drop(skb);
3111 				rcu_read_unlock_bh();
3112 				goto done;
3113 			}
3114 			skb_dst_drop(skb);
3115 		}
3116 		rcu_read_unlock_bh();
3117 	}
3118 
3119 done:
3120 	cb->args[0] = h;
3121 	cb->args[1] = idx;
3122 	return skb->len;
3123 }
3124 
3125 void ip_rt_multicast_event(struct in_device *in_dev)
3126 {
3127 	rt_cache_flush(dev_net(in_dev->dev), 0);
3128 }
3129 
3130 #ifdef CONFIG_SYSCTL
3131 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3132 					void __user *buffer,
3133 					size_t *lenp, loff_t *ppos)
3134 {
3135 	if (write) {
3136 		int flush_delay;
3137 		ctl_table ctl;
3138 		struct net *net;
3139 
3140 		memcpy(&ctl, __ctl, sizeof(ctl));
3141 		ctl.data = &flush_delay;
3142 		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3143 
3144 		net = (struct net *)__ctl->extra1;
3145 		rt_cache_flush(net, flush_delay);
3146 		return 0;
3147 	}
3148 
3149 	return -EINVAL;
3150 }
3151 
3152 static ctl_table ipv4_route_table[] = {
3153 	{
3154 		.procname	= "gc_thresh",
3155 		.data		= &ipv4_dst_ops.gc_thresh,
3156 		.maxlen		= sizeof(int),
3157 		.mode		= 0644,
3158 		.proc_handler	= proc_dointvec,
3159 	},
3160 	{
3161 		.procname	= "max_size",
3162 		.data		= &ip_rt_max_size,
3163 		.maxlen		= sizeof(int),
3164 		.mode		= 0644,
3165 		.proc_handler	= proc_dointvec,
3166 	},
3167 	{
3168 		/* Deprecated. Use gc_min_interval_ms. */
3169 
3170 		.procname	= "gc_min_interval",
3171 		.data		= &ip_rt_gc_min_interval,
3172 		.maxlen		= sizeof(int),
3173 		.mode		= 0644,
3174 		.proc_handler	= proc_dointvec_jiffies,
3175 	},
3176 	{
3177 		.procname	= "gc_min_interval_ms",
3178 		.data		= &ip_rt_gc_min_interval,
3179 		.maxlen		= sizeof(int),
3180 		.mode		= 0644,
3181 		.proc_handler	= proc_dointvec_ms_jiffies,
3182 	},
3183 	{
3184 		.procname	= "gc_timeout",
3185 		.data		= &ip_rt_gc_timeout,
3186 		.maxlen		= sizeof(int),
3187 		.mode		= 0644,
3188 		.proc_handler	= proc_dointvec_jiffies,
3189 	},
3190 	{
3191 		.procname	= "redirect_load",
3192 		.data		= &ip_rt_redirect_load,
3193 		.maxlen		= sizeof(int),
3194 		.mode		= 0644,
3195 		.proc_handler	= proc_dointvec,
3196 	},
3197 	{
3198 		.procname	= "redirect_number",
3199 		.data		= &ip_rt_redirect_number,
3200 		.maxlen		= sizeof(int),
3201 		.mode		= 0644,
3202 		.proc_handler	= proc_dointvec,
3203 	},
3204 	{
3205 		.procname	= "redirect_silence",
3206 		.data		= &ip_rt_redirect_silence,
3207 		.maxlen		= sizeof(int),
3208 		.mode		= 0644,
3209 		.proc_handler	= proc_dointvec,
3210 	},
3211 	{
3212 		.procname	= "error_cost",
3213 		.data		= &ip_rt_error_cost,
3214 		.maxlen		= sizeof(int),
3215 		.mode		= 0644,
3216 		.proc_handler	= proc_dointvec,
3217 	},
3218 	{
3219 		.procname	= "error_burst",
3220 		.data		= &ip_rt_error_burst,
3221 		.maxlen		= sizeof(int),
3222 		.mode		= 0644,
3223 		.proc_handler	= proc_dointvec,
3224 	},
3225 	{
3226 		.procname	= "gc_elasticity",
3227 		.data		= &ip_rt_gc_elasticity,
3228 		.maxlen		= sizeof(int),
3229 		.mode		= 0644,
3230 		.proc_handler	= proc_dointvec,
3231 	},
3232 	{
3233 		.procname	= "mtu_expires",
3234 		.data		= &ip_rt_mtu_expires,
3235 		.maxlen		= sizeof(int),
3236 		.mode		= 0644,
3237 		.proc_handler	= proc_dointvec_jiffies,
3238 	},
3239 	{
3240 		.procname	= "min_pmtu",
3241 		.data		= &ip_rt_min_pmtu,
3242 		.maxlen		= sizeof(int),
3243 		.mode		= 0644,
3244 		.proc_handler	= proc_dointvec,
3245 	},
3246 	{
3247 		.procname	= "min_adv_mss",
3248 		.data		= &ip_rt_min_advmss,
3249 		.maxlen		= sizeof(int),
3250 		.mode		= 0644,
3251 		.proc_handler	= proc_dointvec,
3252 	},
3253 	{ }
3254 };
3255 
3256 static struct ctl_table empty[1];
3257 
3258 static struct ctl_table ipv4_skeleton[] =
3259 {
3260 	{ .procname = "route",
3261 	  .mode = 0555, .child = ipv4_route_table},
3262 	{ .procname = "neigh",
3263 	  .mode = 0555, .child = empty},
3264 	{ }
3265 };
3266 
3267 static __net_initdata struct ctl_path ipv4_path[] = {
3268 	{ .procname = "net", },
3269 	{ .procname = "ipv4", },
3270 	{ },
3271 };
3272 
3273 static struct ctl_table ipv4_route_flush_table[] = {
3274 	{
3275 		.procname	= "flush",
3276 		.maxlen		= sizeof(int),
3277 		.mode		= 0200,
3278 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3279 	},
3280 	{ },
3281 };
3282 
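/*
 * Writing to this file flushes the route cache of the owning netns,
 * e.g.:
 *
 *	echo -1 > /proc/sys/net/ipv4/route/flush
 *
 * The written value is parsed by ipv4_sysctl_rtcache_flush() above and
 * handed to rt_cache_flush() as the flush delay.
 */
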
3283 static __net_initdata struct ctl_path ipv4_route_path[] = {
3284 	{ .procname = "net", },
3285 	{ .procname = "ipv4", },
3286 	{ .procname = "route", },
3287 	{ },
3288 };
3289 
3290 static __net_init int sysctl_route_net_init(struct net *net)
3291 {
3292 	struct ctl_table *tbl;
3293 
3294 	tbl = ipv4_route_flush_table;
3295 	if (!net_eq(net, &init_net)) {
3296 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3297 		if (tbl == NULL)
3298 			goto err_dup;
3299 	}
3300 	tbl[0].extra1 = net;
3301 
3302 	net->ipv4.route_hdr =
3303 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3304 	if (net->ipv4.route_hdr == NULL)
3305 		goto err_reg;
3306 	return 0;
3307 
3308 err_reg:
3309 	if (tbl != ipv4_route_flush_table)
3310 		kfree(tbl);
3311 err_dup:
3312 	return -ENOMEM;
3313 }
3314 
3315 static __net_exit void sysctl_route_net_exit(struct net *net)
3316 {
3317 	struct ctl_table *tbl;
3318 
3319 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3320 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3321 	BUG_ON(tbl == ipv4_route_flush_table);
3322 	kfree(tbl);
3323 }
3324 
3325 static __net_initdata struct pernet_operations sysctl_route_ops = {
3326 	.init = sysctl_route_net_init,
3327 	.exit = sysctl_route_net_exit,
3328 };
3329 #endif
3330 
3331 static __net_init int rt_genid_init(struct net *net)
3332 {
3333 	get_random_bytes(&net->ipv4.rt_genid,
3334 			 sizeof(net->ipv4.rt_genid));
3335 	get_random_bytes(&net->ipv4.dev_addr_genid,
3336 			 sizeof(net->ipv4.dev_addr_genid));
3337 	return 0;
3338 }
3339 
3340 static __net_initdata struct pernet_operations rt_genid_ops = {
3341 	.init = rt_genid_init,
3342 };
3343 
3344 
3345 #ifdef CONFIG_IP_ROUTE_CLASSID
3346 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3347 #endif /* CONFIG_IP_ROUTE_CLASSID */
3348 
3349 static __initdata unsigned long rhash_entries;
3350 static int __init set_rhash_entries(char *str)
3351 {
3352 	if (!str)
3353 		return 0;
3354 	rhash_entries = simple_strtoul(str, &str, 0);
3355 	return 1;
3356 }
3357 __setup("rhash_entries=", set_rhash_entries);
3358 
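/*
 * This lets the cache size be pinned from the kernel command line:
 * booting with e.g. rhash_entries=262144 sizes the hash to that many
 * entries in ip_rt_init() below instead of scaling it from the amount
 * of system memory.
 */
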
3359 int __init ip_rt_init(void)
3360 {
3361 	int rc = 0;
3362 
3363 #ifdef CONFIG_IP_ROUTE_CLASSID
3364 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3365 	if (!ip_rt_acct)
3366 		panic("IP: failed to allocate ip_rt_acct\n");
3367 #endif
3368 
3369 	ipv4_dst_ops.kmem_cachep =
3370 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3371 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3372 
3373 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3374 
3375 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3376 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3377 
3378 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3379 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3380 
3381 	rt_hash_table = (struct rt_hash_bucket *)
3382 		alloc_large_system_hash("IP route cache",
3383 					sizeof(struct rt_hash_bucket),
3384 					rhash_entries,
3385 					(totalram_pages >= 128 * 1024) ?
3386 					15 : 17,
3387 					0,
3388 					&rt_hash_log,
3389 					&rt_hash_mask,
3390 					rhash_entries ? 0 : 512 * 1024);
3391 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3392 	rt_hash_lock_init();
3393 
3394 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3395 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3396 
3397 	devinet_init();
3398 	ip_fib_init();
3399 
3400 	if (ip_rt_proc_init())
3401 		printk(KERN_ERR "Unable to create route proc files\n");
3402 #ifdef CONFIG_XFRM
3403 	xfrm_init();
3404 	xfrm4_init(ip_rt_max_size);
3405 #endif
3406 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3407 
3408 #ifdef CONFIG_SYSCTL
3409 	register_pernet_subsys(&sysctl_route_ops);
3410 #endif
3411 	register_pernet_subsys(&rt_genid_ops);
3412 	return rc;
3413 }
3414 
3415 #ifdef CONFIG_SYSCTL
3416 /*
3417  * We really need to sanitize the damn ipv4 init order, then all
3418  * this nonsense will go away.
3419  */
3420 void __init ip_static_sysctl_init(void)
3421 {
3422 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3423 }
3424 #endif
3425