xref: /linux/net/ipv6/route.c (revision a33f32244d8550da8b4a26e277ce07d5c6d158b5)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
/*
 * Debug verbosity for this file.  RDBG() and RT6_TRACE() only produce
 * printk output when RT6_DEBUG is 3 or more; at the default level they
 * expand to nothing.
 */
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
66 
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74 
/*
 * When non-zero, ip6_pol_route() makes a per-destination copy (via
 * rt6_alloc_clone()) of routes that already have a next hop instead of
 * returning the matched route itself.
 */
75 #define CLONE_OFFLINK_ROUTE 0
76 
/*
 * Forward declarations of file-local helpers that are referenced (by the
 * dst_ops tables, the template route entries and the lookup code below)
 * before the point where they are defined.
 */
77 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
78 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void		ip6_dst_destroy(struct dst_entry *);
81 static void		ip6_dst_ifdown(struct dst_entry *,
82 				       struct net_device *dev, int how);
83 static int		 ip6_dst_gc(struct dst_ops *ops);
84 
85 static int		ip6_pkt_discard(struct sk_buff *skb);
86 static int		ip6_pkt_discard_out(struct sk_buff *skb);
87 static void		ip6_link_failure(struct sk_buff *skb);
88 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89 
/* Helpers used by rt6_route_rcv(); only built with Route Information support. */
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92 					   struct in6_addr *prefix, int prefixlen,
93 					   struct in6_addr *gwaddr, int ifindex,
94 					   unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96 					   struct in6_addr *prefix, int prefixlen,
97 					   struct in6_addr *gwaddr, int ifindex);
98 #endif
99 
/*
 * Operations table for IPv6 routing dst entries: it plugs the callbacks
 * of this file (gc, validity check, destroy, ifdown, negative advice,
 * link failure, PMTU update) into the generic dst layer and sets a
 * default gc threshold of 1024 entries.
 *
 * NOTE(review): the code below works on net->ipv6.ip6_dst_ops; presumably
 * that per-namespace table is copied from this template elsewhere in the
 * file - confirm against the namespace init code.
 */
100 static struct dst_ops ip6_dst_ops_template = {
101 	.family			=	AF_INET6,
102 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
103 	.gc			=	ip6_dst_gc,
104 	.gc_thresh		=	1024,
105 	.check			=	ip6_dst_check,
106 	.destroy		=	ip6_dst_destroy,
107 	.ifdown			=	ip6_dst_ifdown,
108 	.negative_advice	=	ip6_negative_advice,
109 	.link_failure		=	ip6_link_failure,
110 	.update_pmtu		=	ip6_rt_update_pmtu,
111 	.local_out		=	__ip6_local_out,
112 	.entries		=	ATOMIC_INIT(0),
113 };
114 
115 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
116 {
117 }
118 
/*
 * Operations table used for the entries allocated by ip6_dst_blackhole().
 * Compared with ip6_dst_ops_template it sets no gc, ifdown,
 * negative_advice, link_failure or local_out handlers, and its PMTU
 * update callback is a no-op.
 */
119 static struct dst_ops ip6_dst_blackhole_ops = {
120 	.family			=	AF_INET6,
121 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
122 	.destroy		=	ip6_dst_destroy,
123 	.check			=	ip6_dst_check,
124 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
125 	.entries		=	ATOMIC_INIT(0),
126 };
127 
/*
 * Template for the "null" route: a reject route (RTF_REJECT |
 * RTF_NONEXTHOP) with error -ENETUNREACH, hop limit metric 255, the
 * largest possible metric, and input/output handlers that discard the
 * packet.
 *
 * NOTE(review): the lookup code returns net->ipv6.ip6_null_entry when
 * nothing matches; presumably that per-namespace entry is copied from
 * this template elsewhere - confirm.
 */
128 static struct rt6_info ip6_null_entry_template = {
129 	.u = {
130 		.dst = {
131 			.__refcnt	= ATOMIC_INIT(1),
132 			.__use		= 1,
133 			.obsolete	= -1,
134 			.error		= -ENETUNREACH,
135 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
136 			.input		= ip6_pkt_discard,
137 			.output		= ip6_pkt_discard_out,
138 		}
139 	},
140 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
141 	.rt6i_protocol  = RTPROT_KERNEL,
142 	.rt6i_metric	= ~(u32) 0,
143 	.rt6i_ref	= ATOMIC_INIT(1),
144 };
145 
/*
 * Two further template reject routes, only built when multiple routing
 * tables are configured.  Both mirror ip6_null_entry_template and differ
 * only in the error code and the packet handlers.
 */
146 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
147 
148 static int ip6_pkt_prohibit(struct sk_buff *skb);
149 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
150 
/* "Prohibit" route: error -EACCES, handled by ip6_pkt_prohibit{,_out}(). */
151 static struct rt6_info ip6_prohibit_entry_template = {
152 	.u = {
153 		.dst = {
154 			.__refcnt	= ATOMIC_INIT(1),
155 			.__use		= 1,
156 			.obsolete	= -1,
157 			.error		= -EACCES,
158 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
159 			.input		= ip6_pkt_prohibit,
160 			.output		= ip6_pkt_prohibit_out,
161 		}
162 	},
163 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
164 	.rt6i_protocol  = RTPROT_KERNEL,
165 	.rt6i_metric	= ~(u32) 0,
166 	.rt6i_ref	= ATOMIC_INIT(1),
167 };
168 
/* "Blackhole" route: error -EINVAL, both directions go to dst_discard(). */
169 static struct rt6_info ip6_blk_hole_entry_template = {
170 	.u = {
171 		.dst = {
172 			.__refcnt	= ATOMIC_INIT(1),
173 			.__use		= 1,
174 			.obsolete	= -1,
175 			.error		= -EINVAL,
176 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
177 			.input		= dst_discard,
178 			.output		= dst_discard,
179 		}
180 	},
181 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
182 	.rt6i_protocol  = RTPROT_KERNEL,
183 	.rt6i_metric	= ~(u32) 0,
184 	.rt6i_ref	= ATOMIC_INIT(1),
185 };
186 
187 #endif
188 
/*
 * Allocate a dst entry from @ops and hand it back viewed as the
 * rt6_info it belongs to.  The result of dst_alloc() is returned
 * unchanged apart from the cast.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct rt6_info *route;

	route = (struct rt6_info *)dst_alloc(ops);

	return route;
}
194 
195 static void ip6_dst_destroy(struct dst_entry *dst)
196 {
197 	struct rt6_info *rt = (struct rt6_info *)dst;
198 	struct inet6_dev *idev = rt->rt6i_idev;
199 
200 	if (idev != NULL) {
201 		rt->rt6i_idev = NULL;
202 		in6_dev_put(idev);
203 	}
204 }
205 
206 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
207 			   int how)
208 {
209 	struct rt6_info *rt = (struct rt6_info *)dst;
210 	struct inet6_dev *idev = rt->rt6i_idev;
211 	struct net_device *loopback_dev =
212 		dev_net(dev)->loopback_dev;
213 
214 	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
215 		struct inet6_dev *loopback_idev =
216 			in6_dev_get(loopback_dev);
217 		if (loopback_idev != NULL) {
218 			rt->rt6i_idev = loopback_idev;
219 			in6_dev_put(idev);
220 		}
221 	}
222 }
223 
224 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
225 {
226 	return (rt->rt6i_flags & RTF_EXPIRES &&
227 		time_after(jiffies, rt->rt6i_expires));
228 }
229 
230 static inline int rt6_need_strict(struct in6_addr *daddr)
231 {
232 	return (ipv6_addr_type(daddr) &
233 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK));
234 }
235 
236 /*
237  *	Route lookup. Any table->tb6_lock is implied.
238  */
239 
240 static inline struct rt6_info *rt6_device_match(struct net *net,
241 						    struct rt6_info *rt,
242 						    struct in6_addr *saddr,
243 						    int oif,
244 						    int flags)
245 {
246 	struct rt6_info *local = NULL;
247 	struct rt6_info *sprt;
248 
249 	if (!oif && ipv6_addr_any(saddr))
250 		goto out;
251 
252 	for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
253 		struct net_device *dev = sprt->rt6i_dev;
254 
255 		if (oif) {
256 			if (dev->ifindex == oif)
257 				return sprt;
258 			if (dev->flags & IFF_LOOPBACK) {
259 				if (sprt->rt6i_idev == NULL ||
260 				    sprt->rt6i_idev->dev->ifindex != oif) {
261 					if (flags & RT6_LOOKUP_F_IFACE && oif)
262 						continue;
263 					if (local && (!oif ||
264 						      local->rt6i_idev->dev->ifindex == oif))
265 						continue;
266 				}
267 				local = sprt;
268 			}
269 		} else {
270 			if (ipv6_chk_addr(net, saddr, dev,
271 					  flags & RT6_LOOKUP_F_IFACE))
272 				return sprt;
273 		}
274 	}
275 
276 	if (oif) {
277 		if (local)
278 			return local;
279 
280 		if (flags & RT6_LOOKUP_F_IFACE)
281 			return net->ipv6.ip6_null_entry;
282 	}
283 out:
284 	return rt;
285 }
286 
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the next hop of @rt is not in a
 * NUD_VALID state and the last update of the neighbour entry is older
 * than the interface's rtr_probe_interval, send a Neighbour Solicitation
 * to the next hop's solicited-node multicast address.  The interval is
 * what rate-limits the probes.  A NULL @rt or a route without next hop
 * is ignored.
 *
 * NOTE(review): neigh->updated is written while only the read side of
 * neigh->lock is held - confirm this is intended.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *nh;
	struct in6_addr solicited;
	struct in6_addr *target;
	int due;

	if (!rt)
		return;

	nh = rt->rt6i_nexthop;
	if (!nh)
		return;
	if (nh->nud_state & NUD_VALID)
		return;

	read_lock_bh(&nh->lock);
	/* Re-check the state under the lock before deciding to probe. */
	due = !(nh->nud_state & NUD_VALID) &&
	      time_after(jiffies,
			 nh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (due)
		nh->updated = jiffies;
	read_unlock_bh(&nh->lock);

	if (!due)
		return;

	target = (struct in6_addr *)&nh->primary_key;
	addrconf_addr_solict_mult(target, &solicited);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &solicited, NULL);
}
#else
/* Without router preference support probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
322 
323 /*
324  * Default Router Selection (RFC 2461 6.3.6)
325  */
326 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
327 {
328 	struct net_device *dev = rt->rt6i_dev;
329 	if (!oif || dev->ifindex == oif)
330 		return 2;
331 	if ((dev->flags & IFF_LOOPBACK) &&
332 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
333 		return 1;
334 	return 0;
335 }
336 
337 static inline int rt6_check_neigh(struct rt6_info *rt)
338 {
339 	struct neighbour *neigh = rt->rt6i_nexthop;
340 	int m;
341 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
342 	    !(rt->rt6i_flags & RTF_GATEWAY))
343 		m = 1;
344 	else if (neigh) {
345 		read_lock_bh(&neigh->lock);
346 		if (neigh->nud_state & NUD_VALID)
347 			m = 2;
348 #ifdef CONFIG_IPV6_ROUTER_PREF
349 		else if (neigh->nud_state & NUD_FAILED)
350 			m = 0;
351 #endif
352 		else
353 			m = 1;
354 		read_unlock_bh(&neigh->lock);
355 	} else
356 		m = 0;
357 	return m;
358 }
359 
360 static int rt6_score_route(struct rt6_info *rt, int oif,
361 			   int strict)
362 {
363 	int m, n;
364 
365 	m = rt6_check_dev(rt, oif);
366 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
367 		return -1;
368 #ifdef CONFIG_IPV6_ROUTER_PREF
369 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
370 #endif
371 	n = rt6_check_neigh(rt);
372 	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
373 		return -1;
374 	return m;
375 }
376 
377 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
378 				   int *mpri, struct rt6_info *match)
379 {
380 	int m;
381 
382 	if (rt6_check_expired(rt))
383 		goto out;
384 
385 	m = rt6_score_route(rt, oif, strict);
386 	if (m < 0)
387 		goto out;
388 
389 	if (m > *mpri) {
390 		if (strict & RT6_LOOKUP_F_REACHABLE)
391 			rt6_probe(match);
392 		*mpri = m;
393 		match = rt;
394 	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
395 		rt6_probe(rt);
396 	}
397 
398 out:
399 	return match;
400 }
401 
402 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
403 				     struct rt6_info *rr_head,
404 				     u32 metric, int oif, int strict)
405 {
406 	struct rt6_info *rt, *match;
407 	int mpri = -1;
408 
409 	match = NULL;
410 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
411 	     rt = rt->u.dst.rt6_next)
412 		match = find_match(rt, oif, strict, &mpri, match);
413 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
414 	     rt = rt->u.dst.rt6_next)
415 		match = find_match(rt, oif, strict, &mpri, match);
416 
417 	return match;
418 }
419 
420 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
421 {
422 	struct rt6_info *match, *rt0;
423 	struct net *net;
424 
425 	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
426 		  __func__, fn->leaf, oif);
427 
428 	rt0 = fn->rr_ptr;
429 	if (!rt0)
430 		fn->rr_ptr = rt0 = fn->leaf;
431 
432 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
433 
434 	if (!match &&
435 	    (strict & RT6_LOOKUP_F_REACHABLE)) {
436 		struct rt6_info *next = rt0->u.dst.rt6_next;
437 
438 		/* no entries matched; do round-robin */
439 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
440 			next = fn->leaf;
441 
442 		if (next != rt0)
443 			fn->rr_ptr = next;
444 	}
445 
446 	RT6_TRACE("%s() => %p\n",
447 		  __func__, match);
448 
449 	net = dev_net(rt0->rt6i_dev);
450 	return (match ? match : net->ipv6.ip6_null_entry);
451 }
452 
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process one Route Information option of a Router
 * Advertisement.
 * @dev:    device the advertisement was received on
 * @opt:    start of the option (struct route_info followed by the prefix)
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address to install the route with
 *
 * Looks up the matching route-info route; deletes it when the advertised
 * lifetime is zero, creates it when it does not exist yet and the
 * lifetime is non-zero, and otherwise refreshes its preference.  The
 * expiry of a surviving route is then set from the lifetime.
 *
 * Returns 0, or -EINVAL when the option is malformed or carries the
 * invalid preference value.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	/* Compare as unsigned only after ruling out a negative length. */
	if (len < 0 || (size_t)len < sizeof(struct route_info))
		return -EINVAL;

	/*
	 * Sanity check for prefix_len and length.  The option length is
	 * counted in units of 8 octets including the fixed header, so a
	 * length of 1, 2 or 3 leaves room for 0, 8 or 16 prefix bytes
	 * (RFC 4191, section 2.3).  Anything shorter than the prefix length
	 * requires would make the prefix copy below read past the option.
	 */
	if (rinfo->length > 3)
		return -EINVAL;
	if (rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 3)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 2)
		return -EINVAL;

	/* The bytes the option claims to have must really be available. */
	if (len < (rinfo->length << 3))
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/*
		 * Fewer than 16 prefix bytes are present: build a full
		 * address in a local buffer from the first prefix_len bits.
		 */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime)) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		/* Drop the reference obtained from the get/add helper. */
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
526 
/*
 * BACKTRACK() - continue the lookup further up the tree when the current
 * node only produced the null entry.
 *
 * This macro is not self-contained: it reads the caller's local `rt`,
 * reads and updates the caller's local `fn`, and jumps to the caller's
 * labels `out` and `restart`.
 *
 * If `rt` is __net's null entry it climbs from `fn` towards the root:
 *  - reaching a node flagged RTN_TL_ROOT gives up (goto out);
 *  - if the parent has a subtree that is not the node we came from, the
 *    subtree is searched with fib6_lookup() using @saddr; otherwise the
 *    parent itself becomes the current node;
 *  - as soon as the current node carries RTN_RTINFO the caller's
 *    selection is re-run on it (goto restart).
 * If `rt` is a real route the macro does nothing.
 */
527 #define BACKTRACK(__net, saddr)			\
528 do { \
529 	if (rt == __net->ipv6.ip6_null_entry) {	\
530 		struct fib6_node *pn; \
531 		while (1) { \
532 			if (fn->fn_flags & RTN_TL_ROOT) \
533 				goto out; \
534 			pn = fn->parent; \
535 			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
536 				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
537 			else \
538 				fn = pn; \
539 			if (fn->fn_flags & RTN_RTINFO) \
540 				goto restart; \
541 		} \
542 	} \
543 } while(0)
544 
545 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
546 					     struct fib6_table *table,
547 					     struct flowi *fl, int flags)
548 {
549 	struct fib6_node *fn;
550 	struct rt6_info *rt;
551 
552 	read_lock_bh(&table->tb6_lock);
553 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
554 restart:
555 	rt = fn->leaf;
556 	rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
557 	BACKTRACK(net, &fl->fl6_src);
558 out:
559 	dst_use(&rt->u.dst, jiffies);
560 	read_unlock_bh(&table->tb6_lock);
561 	return rt;
562 
563 }
564 
565 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
566 			    const struct in6_addr *saddr, int oif, int strict)
567 {
568 	struct flowi fl = {
569 		.oif = oif,
570 		.nl_u = {
571 			.ip6_u = {
572 				.daddr = *daddr,
573 			},
574 		},
575 	};
576 	struct dst_entry *dst;
577 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
578 
579 	if (saddr) {
580 		memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
581 		flags |= RT6_LOOKUP_F_HAS_SADDR;
582 	}
583 
584 	dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
585 	if (dst->error == 0)
586 		return (struct rt6_info *) dst;
587 
588 	dst_release(dst);
589 
590 	return NULL;
591 }
592 
593 EXPORT_SYMBOL(rt6_lookup);
594 
595 /* ip6_ins_rt is called with FREE table->tb6_lock.
596    It takes new route entry, the addition fails by any reason the
597    route is freed. In any case, if caller does not hold it, it may
598    be destroyed.
599  */
600 
601 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
602 {
603 	int err;
604 	struct fib6_table *table;
605 
606 	table = rt->rt6i_table;
607 	write_lock_bh(&table->tb6_lock);
608 	err = fib6_add(&table->tb6_root, rt, info);
609 	write_unlock_bh(&table->tb6_lock);
610 
611 	return err;
612 }
613 
614 int ip6_ins_rt(struct rt6_info *rt)
615 {
616 	struct nl_info info = {
617 		.nl_net = dev_net(rt->rt6i_dev),
618 	};
619 	return __ip6_ins_rt(rt, &info);
620 }
621 
/*
 * rt6_alloc_cow - create a per-destination cache copy of @ort.
 * @ort:   route to copy
 * @daddr: destination the copy is for
 * @saddr: source address, only used with CONFIG_IPV6_SUBTREES
 *
 * The copy becomes a /128 RTF_CACHE host route for @daddr.  For
 * non-gateway routes @daddr also becomes the gateway (the destination is
 * the next hop), and a neighbour entry is looked up for the gateway.
 *
 * Returns the new route, or NULL if the copy could not be allocated or
 * no neighbour entry could be obtained.
 */
622 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
623 				      struct in6_addr *saddr)
624 {
625 	struct rt6_info *rt;
626 
627 	/*
628 	 *	Clone the route.
629 	 */
630 
631 	rt = ip6_rt_copy(ort);
632 
633 	if (rt) {
634 		struct neighbour *neigh;
		/* One forced-gc retry is allowed, but only outside softirq. */
635 		int attempts = !in_softirq();
636 
637 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
			/*
			 * @daddr equal to the address of a non-host route:
			 * flag the copy as anycast.
			 */
638 			if (rt->rt6i_dst.plen != 128 &&
639 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
640 				rt->rt6i_flags |= RTF_ANYCAST;
641 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
642 		}
643 
644 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
645 		rt->rt6i_dst.plen = 128;
646 		rt->rt6i_flags |= RTF_CACHE;
647 		rt->u.dst.flags |= DST_HOST;
648 
649 #ifdef CONFIG_IPV6_SUBTREES
		/* Narrow a source-specific route to the exact source address. */
650 		if (rt->rt6i_src.plen && saddr) {
651 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
652 			rt->rt6i_src.plen = 128;
653 		}
654 #endif
655 
656 	retry:
657 		neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
658 		if (IS_ERR(neigh)) {
659 			struct net *net = dev_net(rt->rt6i_dev);
660 			int saved_rt_min_interval =
661 				net->ipv6.sysctl.ip6_rt_gc_min_interval;
662 			int saved_rt_elasticity =
663 				net->ipv6.sysctl.ip6_rt_gc_elasticity;
664 
			/*
			 * Run one garbage collection pass with the sysctls
			 * temporarily set to elasticity 1 and no minimum
			 * interval, restore them, and try again.
			 */
665 			if (attempts-- > 0) {
666 				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
667 				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
668 
669 				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
670 
671 				net->ipv6.sysctl.ip6_rt_gc_elasticity =
672 					saved_rt_elasticity;
673 				net->ipv6.sysctl.ip6_rt_gc_min_interval =
674 					saved_rt_min_interval;
675 				goto retry;
676 			}
677 
			/* No retry left: warn (rate limited) and give the copy up. */
678 			if (net_ratelimit())
679 				printk(KERN_WARNING
680 				       "Neighbour table overflow.\n");
681 			dst_free(&rt->u.dst);
682 			return NULL;
683 		}
684 		rt->rt6i_nexthop = neigh;
685 
686 	}
687 
688 	return rt;
689 }
690 
691 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
692 {
693 	struct rt6_info *rt = ip6_rt_copy(ort);
694 	if (rt) {
695 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
696 		rt->rt6i_dst.plen = 128;
697 		rt->rt6i_flags |= RTF_CACHE;
698 		rt->u.dst.flags |= DST_HOST;
699 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
700 	}
701 	return rt;
702 }
703 
/*
 * ip6_pol_route - route resolution shared by the input and output
 * per-table lookup callbacks.
 * @net:   network namespace
 * @table: FIB table to search
 * @oif:   interface index handed to rt6_select() (the callers below pass
 *         fl->iif for input and fl->oif for output)
 * @fl:    flow; its destination and source addresses are the lookup keys
 * @flags: RT6_LOOKUP_F_* flags; only RT6_LOOKUP_F_IFACE is used here
 *
 * Selects a route under the table read lock.  A route that is neither
 * the null entry nor already a cache entry, and that has no next hop yet
 * (and is not RTF_NONEXTHOP), is copied per destination with
 * rt6_alloc_cow() and the copy is inserted into the table.  If that
 * insertion fails the whole lookup is repeated, at most 3 attempts.
 *
 * Never returns NULL: the namespace's null entry is used as fallback.
 * The returned route has a reference taken with dst_hold() and its
 * lastuse/__use fields updated.
 */
704 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
705 				      struct flowi *fl, int flags)
706 {
707 	struct fib6_node *fn;
708 	struct rt6_info *rt, *nrt;
709 	int strict = 0;
710 	int attempts = 3;
711 	int err;
	/* Require reachable routers on the first pass unless forwarding is on. */
712 	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
713 
714 	strict |= flags & RT6_LOOKUP_F_IFACE;
715 
716 relookup:
717 	read_lock_bh(&table->tb6_lock);
718 
719 restart_2:
720 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
721 
722 restart:
723 	rt = rt6_select(fn, oif, strict | reachable);
724 
	/* BACKTRACK() may jump to `restart` or `out`; it uses `rt` and `fn`. */
725 	BACKTRACK(net, &fl->fl6_src);
726 	if (rt == net->ipv6.ip6_null_entry ||
727 	    rt->rt6i_flags & RTF_CACHE)
728 		goto out;
729 
	/* Pin the route so it survives dropping the table lock. */
730 	dst_hold(&rt->u.dst);
731 	read_unlock_bh(&table->tb6_lock);
732 
733 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
734 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
735 	else {
736 #if CLONE_OFFLINK_ROUTE
737 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
738 #else
		/* Return the matched route itself; it is already held. */
739 		goto out2;
740 #endif
741 	}
742 
	/* Swap the reference from the original to the copy (or null entry). */
743 	dst_release(&rt->u.dst);
744 	rt = nrt ? : net->ipv6.ip6_null_entry;
745 
746 	dst_hold(&rt->u.dst);
747 	if (nrt) {
748 		err = ip6_ins_rt(nrt);
749 		if (!err)
750 			goto out2;
751 	}
752 
	/* Out of attempts: return what we hold (failed copy or null entry). */
753 	if (--attempts <= 0)
754 		goto out2;
755 
756 	/*
757 	 * Race condition! In the gap, when table->tb6_lock was
758 	 * released someone could insert this route.  Relookup.
759 	 */
760 	dst_release(&rt->u.dst);
761 	goto relookup;
762 
763 out:
	/*
	 * Reached with the table lock held.  If the reachability
	 * requirement is still set, clear it and repeat the lookup once
	 * without it.
	 */
764 	if (reachable) {
765 		reachable = 0;
766 		goto restart_2;
767 	}
768 	dst_hold(&rt->u.dst);
769 	read_unlock_bh(&table->tb6_lock);
770 out2:
771 	rt->u.dst.lastuse = jiffies;
772 	rt->u.dst.__use++;
773 
774 	return rt;
775 }
776 
777 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
778 					    struct flowi *fl, int flags)
779 {
780 	return ip6_pol_route(net, table, fl->iif, fl, flags);
781 }
782 
783 void ip6_route_input(struct sk_buff *skb)
784 {
785 	struct ipv6hdr *iph = ipv6_hdr(skb);
786 	struct net *net = dev_net(skb->dev);
787 	int flags = RT6_LOOKUP_F_HAS_SADDR;
788 	struct flowi fl = {
789 		.iif = skb->dev->ifindex,
790 		.nl_u = {
791 			.ip6_u = {
792 				.daddr = iph->daddr,
793 				.saddr = iph->saddr,
794 				.flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
795 			},
796 		},
797 		.mark = skb->mark,
798 		.proto = iph->nexthdr,
799 	};
800 
801 	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
802 		flags |= RT6_LOOKUP_F_IFACE;
803 
804 	skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
805 }
806 
807 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
808 					     struct flowi *fl, int flags)
809 {
810 	return ip6_pol_route(net, table, fl->oif, fl, flags);
811 }
812 
813 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
814 				    struct flowi *fl)
815 {
816 	int flags = 0;
817 
818 	if (rt6_need_strict(&fl->fl6_dst))
819 		flags |= RT6_LOOKUP_F_IFACE;
820 
821 	if (!ipv6_addr_any(&fl->fl6_src))
822 		flags |= RT6_LOOKUP_F_HAS_SADDR;
823 	else if (sk)
824 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
825 
826 	return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
827 }
828 
829 EXPORT_SYMBOL(ip6_route_output);
830 
831 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
832 {
833 	struct rt6_info *ort = (struct rt6_info *) *dstp;
834 	struct rt6_info *rt = (struct rt6_info *)
835 		dst_alloc(&ip6_dst_blackhole_ops);
836 	struct dst_entry *new = NULL;
837 
838 	if (rt) {
839 		new = &rt->u.dst;
840 
841 		atomic_set(&new->__refcnt, 1);
842 		new->__use = 1;
843 		new->input = dst_discard;
844 		new->output = dst_discard;
845 
846 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
847 		new->dev = ort->u.dst.dev;
848 		if (new->dev)
849 			dev_hold(new->dev);
850 		rt->rt6i_idev = ort->rt6i_idev;
851 		if (rt->rt6i_idev)
852 			in6_dev_hold(rt->rt6i_idev);
853 		rt->rt6i_expires = 0;
854 
855 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
856 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
857 		rt->rt6i_metric = 0;
858 
859 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
860 #ifdef CONFIG_IPV6_SUBTREES
861 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
862 #endif
863 
864 		dst_free(new);
865 	}
866 
867 	dst_release(*dstp);
868 	*dstp = new;
869 	return (new ? 0 : -ENOMEM);
870 }
871 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
872 
873 /*
874  *	Destination cache support functions
875  */
876 
877 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
878 {
879 	struct rt6_info *rt;
880 
881 	rt = (struct rt6_info *) dst;
882 
883 	if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
884 		return dst;
885 
886 	return NULL;
887 }
888 
889 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
890 {
891 	struct rt6_info *rt = (struct rt6_info *) dst;
892 
893 	if (rt) {
894 		if (rt->rt6i_flags & RTF_CACHE) {
895 			if (rt6_check_expired(rt)) {
896 				ip6_del_rt(rt);
897 				dst = NULL;
898 			}
899 		} else {
900 			dst_release(dst);
901 			dst = NULL;
902 		}
903 	}
904 	return dst;
905 }
906 
907 static void ip6_link_failure(struct sk_buff *skb)
908 {
909 	struct rt6_info *rt;
910 
911 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
912 
913 	rt = (struct rt6_info *) skb_dst(skb);
914 	if (rt) {
915 		if (rt->rt6i_flags&RTF_CACHE) {
916 			dst_set_expires(&rt->u.dst, 0);
917 			rt->rt6i_flags |= RTF_EXPIRES;
918 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
919 			rt->rt6i_node->fn_sernum = -1;
920 	}
921 }
922 
923 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
924 {
925 	struct rt6_info *rt6 = (struct rt6_info*)dst;
926 
927 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
928 		rt6->rt6i_flags |= RTF_MODIFIED;
929 		if (mtu < IPV6_MIN_MTU) {
930 			mtu = IPV6_MIN_MTU;
931 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
932 		}
933 		dst->metrics[RTAX_MTU-1] = mtu;
934 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
935 	}
936 }
937 
938 static int ipv6_get_mtu(struct net_device *dev);
939 
940 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
941 {
942 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
943 
944 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
945 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
946 
947 	/*
948 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
949 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
950 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
951 	 * rely only on pmtu discovery"
952 	 */
953 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
954 		mtu = IPV6_MAXPLEN;
955 	return mtu;
956 }
957 
/*
 * Singly linked list (through dst->next) of the dst entries created by
 * icmp6_dst_alloc().  Every access in this file is made with
 * icmp6_dst_lock held.
 */
958 static struct dst_entry *icmp6_dst_gc_list;
959 static DEFINE_SPINLOCK(icmp6_dst_lock);
960 
961 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
962 				  struct neighbour *neigh,
963 				  const struct in6_addr *addr)
964 {
965 	struct rt6_info *rt;
966 	struct inet6_dev *idev = in6_dev_get(dev);
967 	struct net *net = dev_net(dev);
968 
969 	if (unlikely(idev == NULL))
970 		return NULL;
971 
972 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
973 	if (unlikely(rt == NULL)) {
974 		in6_dev_put(idev);
975 		goto out;
976 	}
977 
978 	dev_hold(dev);
979 	if (neigh)
980 		neigh_hold(neigh);
981 	else {
982 		neigh = ndisc_get_neigh(dev, addr);
983 		if (IS_ERR(neigh))
984 			neigh = NULL;
985 	}
986 
987 	rt->rt6i_dev	  = dev;
988 	rt->rt6i_idev     = idev;
989 	rt->rt6i_nexthop  = neigh;
990 	atomic_set(&rt->u.dst.__refcnt, 1);
991 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
992 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
993 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
994 	rt->u.dst.output  = ip6_output;
995 
996 #if 0	/* there's no chance to use these for ndisc */
997 	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
998 				? DST_HOST
999 				: 0;
1000 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1001 	rt->rt6i_dst.plen = 128;
1002 #endif
1003 
1004 	spin_lock_bh(&icmp6_dst_lock);
1005 	rt->u.dst.next = icmp6_dst_gc_list;
1006 	icmp6_dst_gc_list = &rt->u.dst;
1007 	spin_unlock_bh(&icmp6_dst_lock);
1008 
1009 	fib6_force_start_gc(net);
1010 
1011 out:
1012 	return &rt->u.dst;
1013 }
1014 
1015 int icmp6_dst_gc(void)
1016 {
1017 	struct dst_entry *dst, *next, **pprev;
1018 	int more = 0;
1019 
1020 	next = NULL;
1021 
1022 	spin_lock_bh(&icmp6_dst_lock);
1023 	pprev = &icmp6_dst_gc_list;
1024 
1025 	while ((dst = *pprev) != NULL) {
1026 		if (!atomic_read(&dst->__refcnt)) {
1027 			*pprev = dst->next;
1028 			dst_free(dst);
1029 		} else {
1030 			pprev = &dst->next;
1031 			++more;
1032 		}
1033 	}
1034 
1035 	spin_unlock_bh(&icmp6_dst_lock);
1036 
1037 	return more;
1038 }
1039 
1040 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1041 			    void *arg)
1042 {
1043 	struct dst_entry *dst, **pprev;
1044 
1045 	spin_lock_bh(&icmp6_dst_lock);
1046 	pprev = &icmp6_dst_gc_list;
1047 	while ((dst = *pprev) != NULL) {
1048 		struct rt6_info *rt = (struct rt6_info *) dst;
1049 		if (func(rt, arg)) {
1050 			*pprev = dst->next;
1051 			dst_free(dst);
1052 		} else {
1053 			pprev = &dst->next;
1054 		}
1055 	}
1056 	spin_unlock_bh(&icmp6_dst_lock);
1057 }
1058 
1059 static int ip6_dst_gc(struct dst_ops *ops)
1060 {
1061 	unsigned long now = jiffies;
1062 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1063 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1064 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1065 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1066 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1067 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1068 
1069 	if (time_after(rt_last_gc + rt_min_interval, now) &&
1070 	    atomic_read(&ops->entries) <= rt_max_size)
1071 		goto out;
1072 
1073 	net->ipv6.ip6_rt_gc_expire++;
1074 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1075 	net->ipv6.ip6_rt_last_gc = now;
1076 	if (atomic_read(&ops->entries) < ops->gc_thresh)
1077 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1078 out:
1079 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1080 	return (atomic_read(&ops->entries) > rt_max_size);
1081 }
1082 
1083 /* Clean host part of a prefix. Not necessary in radix tree,
1084    but results in cleaner routing tables.
1085 
1086    Remove it only when all the things will work!
1087  */
1088 
1089 static int ipv6_get_mtu(struct net_device *dev)
1090 {
1091 	int mtu = IPV6_MIN_MTU;
1092 	struct inet6_dev *idev;
1093 
1094 	idev = in6_dev_get(dev);
1095 	if (idev) {
1096 		mtu = idev->cnf.mtu6;
1097 		in6_dev_put(idev);
1098 	}
1099 	return mtu;
1100 }
1101 
1102 int ip6_dst_hoplimit(struct dst_entry *dst)
1103 {
1104 	int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1105 	if (hoplimit < 0) {
1106 		struct net_device *dev = dst->dev;
1107 		struct inet6_dev *idev = in6_dev_get(dev);
1108 		if (idev) {
1109 			hoplimit = idev->cnf.hop_limit;
1110 			in6_dev_put(idev);
1111 		} else
1112 			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1113 	}
1114 	return hoplimit;
1115 }
1116 
1117 /*
1118  *
1119  */
1120 
1121 int ip6_route_add(struct fib6_config *cfg)
1122 {
1123 	int err;
1124 	struct net *net = cfg->fc_nlinfo.nl_net;
1125 	struct rt6_info *rt = NULL;
1126 	struct net_device *dev = NULL;
1127 	struct inet6_dev *idev = NULL;
1128 	struct fib6_table *table;
1129 	int addr_type;
1130 
1131 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1132 		return -EINVAL;
1133 #ifndef CONFIG_IPV6_SUBTREES
1134 	if (cfg->fc_src_len)
1135 		return -EINVAL;
1136 #endif
1137 	if (cfg->fc_ifindex) {
1138 		err = -ENODEV;
1139 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1140 		if (!dev)
1141 			goto out;
1142 		idev = in6_dev_get(dev);
1143 		if (!idev)
1144 			goto out;
1145 	}
1146 
1147 	if (cfg->fc_metric == 0)
1148 		cfg->fc_metric = IP6_RT_PRIO_USER;
1149 
1150 	table = fib6_new_table(net, cfg->fc_table);
1151 	if (table == NULL) {
1152 		err = -ENOBUFS;
1153 		goto out;
1154 	}
1155 
1156 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1157 
1158 	if (rt == NULL) {
1159 		err = -ENOMEM;
1160 		goto out;
1161 	}
1162 
1163 	rt->u.dst.obsolete = -1;
1164 	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1165 				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1166 				0;
1167 
1168 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1169 		cfg->fc_protocol = RTPROT_BOOT;
1170 	rt->rt6i_protocol = cfg->fc_protocol;
1171 
1172 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1173 
1174 	if (addr_type & IPV6_ADDR_MULTICAST)
1175 		rt->u.dst.input = ip6_mc_input;
1176 	else
1177 		rt->u.dst.input = ip6_forward;
1178 
1179 	rt->u.dst.output = ip6_output;
1180 
1181 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1182 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1183 	if (rt->rt6i_dst.plen == 128)
1184 	       rt->u.dst.flags = DST_HOST;
1185 
1186 #ifdef CONFIG_IPV6_SUBTREES
1187 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1188 	rt->rt6i_src.plen = cfg->fc_src_len;
1189 #endif
1190 
1191 	rt->rt6i_metric = cfg->fc_metric;
1192 
1193 	/* We cannot add true routes via loopback here,
1194 	   they would result in kernel looping; promote them to reject routes
1195 	 */
1196 	if ((cfg->fc_flags & RTF_REJECT) ||
1197 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1198 		/* hold loopback dev/idev if we haven't done so. */
1199 		if (dev != net->loopback_dev) {
1200 			if (dev) {
1201 				dev_put(dev);
1202 				in6_dev_put(idev);
1203 			}
1204 			dev = net->loopback_dev;
1205 			dev_hold(dev);
1206 			idev = in6_dev_get(dev);
1207 			if (!idev) {
1208 				err = -ENODEV;
1209 				goto out;
1210 			}
1211 		}
1212 		rt->u.dst.output = ip6_pkt_discard_out;
1213 		rt->u.dst.input = ip6_pkt_discard;
1214 		rt->u.dst.error = -ENETUNREACH;
1215 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1216 		goto install_route;
1217 	}
1218 
1219 	if (cfg->fc_flags & RTF_GATEWAY) {
1220 		struct in6_addr *gw_addr;
1221 		int gwa_type;
1222 
1223 		gw_addr = &cfg->fc_gateway;
1224 		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1225 		gwa_type = ipv6_addr_type(gw_addr);
1226 
1227 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1228 			struct rt6_info *grt;
1229 
1230 			/* IPv6 strictly inhibits using not link-local
1231 			   addresses as nexthop address.
1232 			   Otherwise, router will not able to send redirects.
1233 			   It is very good, but in some (rare!) circumstances
1234 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1235 			   some exceptions. --ANK
1236 			 */
1237 			err = -EINVAL;
1238 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1239 				goto out;
1240 
1241 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1242 
1243 			err = -EHOSTUNREACH;
1244 			if (grt == NULL)
1245 				goto out;
1246 			if (dev) {
1247 				if (dev != grt->rt6i_dev) {
1248 					dst_release(&grt->u.dst);
1249 					goto out;
1250 				}
1251 			} else {
1252 				dev = grt->rt6i_dev;
1253 				idev = grt->rt6i_idev;
1254 				dev_hold(dev);
1255 				in6_dev_hold(grt->rt6i_idev);
1256 			}
1257 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1258 				err = 0;
1259 			dst_release(&grt->u.dst);
1260 
1261 			if (err)
1262 				goto out;
1263 		}
1264 		err = -EINVAL;
1265 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1266 			goto out;
1267 	}
1268 
1269 	err = -ENODEV;
1270 	if (dev == NULL)
1271 		goto out;
1272 
1273 	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1274 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1275 		if (IS_ERR(rt->rt6i_nexthop)) {
1276 			err = PTR_ERR(rt->rt6i_nexthop);
1277 			rt->rt6i_nexthop = NULL;
1278 			goto out;
1279 		}
1280 	}
1281 
1282 	rt->rt6i_flags = cfg->fc_flags;
1283 
1284 install_route:
1285 	if (cfg->fc_mx) {
1286 		struct nlattr *nla;
1287 		int remaining;
1288 
1289 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1290 			int type = nla_type(nla);
1291 
1292 			if (type) {
1293 				if (type > RTAX_MAX) {
1294 					err = -EINVAL;
1295 					goto out;
1296 				}
1297 
1298 				rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1299 			}
1300 		}
1301 	}
1302 
1303 	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1304 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1305 	if (!dst_mtu(&rt->u.dst))
1306 		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1307 	if (!dst_metric(&rt->u.dst, RTAX_ADVMSS))
1308 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1309 	rt->u.dst.dev = dev;
1310 	rt->rt6i_idev = idev;
1311 	rt->rt6i_table = table;
1312 
1313 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1314 
1315 	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1316 
1317 out:
1318 	if (dev)
1319 		dev_put(dev);
1320 	if (idev)
1321 		in6_dev_put(idev);
1322 	if (rt)
1323 		dst_free(&rt->u.dst);
1324 	return err;
1325 }
1326 
1327 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1328 {
1329 	int err;
1330 	struct fib6_table *table;
1331 	struct net *net = dev_net(rt->rt6i_dev);
1332 
1333 	if (rt == net->ipv6.ip6_null_entry)
1334 		return -ENOENT;
1335 
1336 	table = rt->rt6i_table;
1337 	write_lock_bh(&table->tb6_lock);
1338 
1339 	err = fib6_del(rt, info);
1340 	dst_release(&rt->u.dst);
1341 
1342 	write_unlock_bh(&table->tb6_lock);
1343 
1344 	return err;
1345 }
1346 
1347 int ip6_del_rt(struct rt6_info *rt)
1348 {
1349 	struct nl_info info = {
1350 		.nl_net = dev_net(rt->rt6i_dev),
1351 	};
1352 	return __ip6_del_rt(rt, &info);
1353 }
1354 
1355 static int ip6_route_del(struct fib6_config *cfg)
1356 {
1357 	struct fib6_table *table;
1358 	struct fib6_node *fn;
1359 	struct rt6_info *rt;
1360 	int err = -ESRCH;
1361 
1362 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1363 	if (table == NULL)
1364 		return err;
1365 
1366 	read_lock_bh(&table->tb6_lock);
1367 
1368 	fn = fib6_locate(&table->tb6_root,
1369 			 &cfg->fc_dst, cfg->fc_dst_len,
1370 			 &cfg->fc_src, cfg->fc_src_len);
1371 
1372 	if (fn) {
1373 		for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1374 			if (cfg->fc_ifindex &&
1375 			    (rt->rt6i_dev == NULL ||
1376 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1377 				continue;
1378 			if (cfg->fc_flags & RTF_GATEWAY &&
1379 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1380 				continue;
1381 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1382 				continue;
1383 			dst_hold(&rt->u.dst);
1384 			read_unlock_bh(&table->tb6_lock);
1385 
1386 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1387 		}
1388 	}
1389 	read_unlock_bh(&table->tb6_lock);
1390 
1391 	return err;
1392 }
1393 
1394 /*
1395  *	Handle redirects
1396  */
/*
 * Flow key used for redirect lookups: a regular flowi extended with the
 * address that __ip6_route_redirect() compares against each candidate
 * route's gateway.  'fl' has to stay the first member:
 * ip6_route_redirect() passes this structure to fib6_rule_lookup() as a
 * struct flowi pointer and __ip6_route_redirect() casts it back.
 */
struct ip6rd_flowi {
	struct flowi fl;		/* ordinary flow key (oif, daddr, saddr) */
	struct in6_addr gateway;	/* must equal the route's rt6i_gateway */
};
1401 
1402 static struct rt6_info *__ip6_route_redirect(struct net *net,
1403 					     struct fib6_table *table,
1404 					     struct flowi *fl,
1405 					     int flags)
1406 {
1407 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1408 	struct rt6_info *rt;
1409 	struct fib6_node *fn;
1410 
1411 	/*
1412 	 * Get the "current" route for this destination and
1413 	 * check if the redirect has come from approriate router.
1414 	 *
1415 	 * RFC 2461 specifies that redirects should only be
1416 	 * accepted if they come from the nexthop to the target.
1417 	 * Due to the way the routes are chosen, this notion
1418 	 * is a bit fuzzy and one might need to check all possible
1419 	 * routes.
1420 	 */
1421 
1422 	read_lock_bh(&table->tb6_lock);
1423 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1424 restart:
1425 	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1426 		/*
1427 		 * Current route is on-link; redirect is always invalid.
1428 		 *
1429 		 * Seems, previous statement is not true. It could
1430 		 * be node, which looks for us as on-link (f.e. proxy ndisc)
1431 		 * But then router serving it might decide, that we should
1432 		 * know truth 8)8) --ANK (980726).
1433 		 */
1434 		if (rt6_check_expired(rt))
1435 			continue;
1436 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1437 			continue;
1438 		if (fl->oif != rt->rt6i_dev->ifindex)
1439 			continue;
1440 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1441 			continue;
1442 		break;
1443 	}
1444 
1445 	if (!rt)
1446 		rt = net->ipv6.ip6_null_entry;
1447 	BACKTRACK(net, &fl->fl6_src);
1448 out:
1449 	dst_hold(&rt->u.dst);
1450 
1451 	read_unlock_bh(&table->tb6_lock);
1452 
1453 	return rt;
1454 };
1455 
1456 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1457 					   struct in6_addr *src,
1458 					   struct in6_addr *gateway,
1459 					   struct net_device *dev)
1460 {
1461 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1462 	struct net *net = dev_net(dev);
1463 	struct ip6rd_flowi rdfl = {
1464 		.fl = {
1465 			.oif = dev->ifindex,
1466 			.nl_u = {
1467 				.ip6_u = {
1468 					.daddr = *dest,
1469 					.saddr = *src,
1470 				},
1471 			},
1472 		},
1473 	};
1474 
1475 	ipv6_addr_copy(&rdfl.gateway, gateway);
1476 
1477 	if (rt6_need_strict(dest))
1478 		flags |= RT6_LOOKUP_F_IFACE;
1479 
1480 	return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1481 						   flags, __ip6_route_redirect);
1482 }
1483 
1484 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1485 		  struct in6_addr *saddr,
1486 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1487 {
1488 	struct rt6_info *rt, *nrt = NULL;
1489 	struct netevent_redirect netevent;
1490 	struct net *net = dev_net(neigh->dev);
1491 
1492 	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1493 
1494 	if (rt == net->ipv6.ip6_null_entry) {
1495 		if (net_ratelimit())
1496 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1497 			       "for redirect target\n");
1498 		goto out;
1499 	}
1500 
1501 	/*
1502 	 *	We have finally decided to accept it.
1503 	 */
1504 
1505 	neigh_update(neigh, lladdr, NUD_STALE,
1506 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1507 		     NEIGH_UPDATE_F_OVERRIDE|
1508 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1509 				     NEIGH_UPDATE_F_ISROUTER))
1510 		     );
1511 
1512 	/*
1513 	 * Redirect received -> path was valid.
1514 	 * Look, redirects are sent only in response to data packets,
1515 	 * so that this nexthop apparently is reachable. --ANK
1516 	 */
1517 	dst_confirm(&rt->u.dst);
1518 
1519 	/* Duplicate redirect: silently ignore. */
1520 	if (neigh == rt->u.dst.neighbour)
1521 		goto out;
1522 
1523 	nrt = ip6_rt_copy(rt);
1524 	if (nrt == NULL)
1525 		goto out;
1526 
1527 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1528 	if (on_link)
1529 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1530 
1531 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1532 	nrt->rt6i_dst.plen = 128;
1533 	nrt->u.dst.flags |= DST_HOST;
1534 
1535 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1536 	nrt->rt6i_nexthop = neigh_clone(neigh);
1537 	/* Reset pmtu, it may be better */
1538 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1539 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1540 							dst_mtu(&nrt->u.dst));
1541 
1542 	if (ip6_ins_rt(nrt))
1543 		goto out;
1544 
1545 	netevent.old = &rt->u.dst;
1546 	netevent.new = &nrt->u.dst;
1547 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1548 
1549 	if (rt->rt6i_flags&RTF_CACHE) {
1550 		ip6_del_rt(rt);
1551 		return;
1552 	}
1553 
1554 out:
1555 	dst_release(&rt->u.dst);
1556 	return;
1557 }
1558 
1559 /*
1560  *	Handle ICMP "packet too big" messages
1561  *	i.e. Path MTU discovery
1562  */
1563 
1564 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1565 			struct net_device *dev, u32 pmtu)
1566 {
1567 	struct rt6_info *rt, *nrt;
1568 	struct net *net = dev_net(dev);
1569 	int allfrag = 0;
1570 
1571 	rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
1572 	if (rt == NULL)
1573 		return;
1574 
1575 	if (pmtu >= dst_mtu(&rt->u.dst))
1576 		goto out;
1577 
1578 	if (pmtu < IPV6_MIN_MTU) {
1579 		/*
1580 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1581 		 * MTU (1280) and a fragment header should always be included
1582 		 * after a node receiving Too Big message reporting PMTU is
1583 		 * less than the IPv6 Minimum Link MTU.
1584 		 */
1585 		pmtu = IPV6_MIN_MTU;
1586 		allfrag = 1;
1587 	}
1588 
1589 	/* New mtu received -> path was valid.
1590 	   They are sent only in response to data packets,
1591 	   so that this nexthop apparently is reachable. --ANK
1592 	 */
1593 	dst_confirm(&rt->u.dst);
1594 
1595 	/* Host route. If it is static, it would be better
1596 	   not to override it, but add new one, so that
1597 	   when cache entry will expire old pmtu
1598 	   would return automatically.
1599 	 */
1600 	if (rt->rt6i_flags & RTF_CACHE) {
1601 		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1602 		if (allfrag)
1603 			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1604 		dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1605 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1606 		goto out;
1607 	}
1608 
1609 	/* Network route.
1610 	   Two cases are possible:
1611 	   1. It is connected route. Action: COW
1612 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1613 	 */
1614 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1615 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1616 	else
1617 		nrt = rt6_alloc_clone(rt, daddr);
1618 
1619 	if (nrt) {
1620 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1621 		if (allfrag)
1622 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1623 
1624 		/* According to RFC 1981, detecting PMTU increase shouldn't be
1625 		 * happened within 5 mins, the recommended timer is 10 mins.
1626 		 * Here this route expiration time is set to ip6_rt_mtu_expires
1627 		 * which is 10 mins. After 10 mins the decreased pmtu is expired
1628 		 * and detecting PMTU increase will be automatically happened.
1629 		 */
1630 		dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1631 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1632 
1633 		ip6_ins_rt(nrt);
1634 	}
1635 out:
1636 	dst_release(&rt->u.dst);
1637 }
1638 
1639 /*
1640  *	Misc support functions
1641  */
1642 
1643 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1644 {
1645 	struct net *net = dev_net(ort->rt6i_dev);
1646 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1647 
1648 	if (rt) {
1649 		rt->u.dst.input = ort->u.dst.input;
1650 		rt->u.dst.output = ort->u.dst.output;
1651 
1652 		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1653 		rt->u.dst.error = ort->u.dst.error;
1654 		rt->u.dst.dev = ort->u.dst.dev;
1655 		if (rt->u.dst.dev)
1656 			dev_hold(rt->u.dst.dev);
1657 		rt->rt6i_idev = ort->rt6i_idev;
1658 		if (rt->rt6i_idev)
1659 			in6_dev_hold(rt->rt6i_idev);
1660 		rt->u.dst.lastuse = jiffies;
1661 		rt->rt6i_expires = 0;
1662 
1663 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1664 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1665 		rt->rt6i_metric = 0;
1666 
1667 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1668 #ifdef CONFIG_IPV6_SUBTREES
1669 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1670 #endif
1671 		rt->rt6i_table = ort->rt6i_table;
1672 	}
1673 	return rt;
1674 }
1675 
1676 #ifdef CONFIG_IPV6_ROUTE_INFO
1677 static struct rt6_info *rt6_get_route_info(struct net *net,
1678 					   struct in6_addr *prefix, int prefixlen,
1679 					   struct in6_addr *gwaddr, int ifindex)
1680 {
1681 	struct fib6_node *fn;
1682 	struct rt6_info *rt = NULL;
1683 	struct fib6_table *table;
1684 
1685 	table = fib6_get_table(net, RT6_TABLE_INFO);
1686 	if (table == NULL)
1687 		return NULL;
1688 
1689 	write_lock_bh(&table->tb6_lock);
1690 	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1691 	if (!fn)
1692 		goto out;
1693 
1694 	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1695 		if (rt->rt6i_dev->ifindex != ifindex)
1696 			continue;
1697 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1698 			continue;
1699 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1700 			continue;
1701 		dst_hold(&rt->u.dst);
1702 		break;
1703 	}
1704 out:
1705 	write_unlock_bh(&table->tb6_lock);
1706 	return rt;
1707 }
1708 
1709 static struct rt6_info *rt6_add_route_info(struct net *net,
1710 					   struct in6_addr *prefix, int prefixlen,
1711 					   struct in6_addr *gwaddr, int ifindex,
1712 					   unsigned pref)
1713 {
1714 	struct fib6_config cfg = {
1715 		.fc_table	= RT6_TABLE_INFO,
1716 		.fc_metric	= IP6_RT_PRIO_USER,
1717 		.fc_ifindex	= ifindex,
1718 		.fc_dst_len	= prefixlen,
1719 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1720 				  RTF_UP | RTF_PREF(pref),
1721 		.fc_nlinfo.pid = 0,
1722 		.fc_nlinfo.nlh = NULL,
1723 		.fc_nlinfo.nl_net = net,
1724 	};
1725 
1726 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1727 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1728 
1729 	/* We should treat it as a default route if prefix length is 0. */
1730 	if (!prefixlen)
1731 		cfg.fc_flags |= RTF_DEFAULT;
1732 
1733 	ip6_route_add(&cfg);
1734 
1735 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1736 }
1737 #endif
1738 
1739 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1740 {
1741 	struct rt6_info *rt;
1742 	struct fib6_table *table;
1743 
1744 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1745 	if (table == NULL)
1746 		return NULL;
1747 
1748 	write_lock_bh(&table->tb6_lock);
1749 	for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1750 		if (dev == rt->rt6i_dev &&
1751 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1752 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1753 			break;
1754 	}
1755 	if (rt)
1756 		dst_hold(&rt->u.dst);
1757 	write_unlock_bh(&table->tb6_lock);
1758 	return rt;
1759 }
1760 
1761 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1762 				     struct net_device *dev,
1763 				     unsigned int pref)
1764 {
1765 	struct fib6_config cfg = {
1766 		.fc_table	= RT6_TABLE_DFLT,
1767 		.fc_metric	= IP6_RT_PRIO_USER,
1768 		.fc_ifindex	= dev->ifindex,
1769 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1770 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1771 		.fc_nlinfo.pid = 0,
1772 		.fc_nlinfo.nlh = NULL,
1773 		.fc_nlinfo.nl_net = dev_net(dev),
1774 	};
1775 
1776 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1777 
1778 	ip6_route_add(&cfg);
1779 
1780 	return rt6_get_dflt_router(gwaddr, dev);
1781 }
1782 
1783 void rt6_purge_dflt_routers(struct net *net)
1784 {
1785 	struct rt6_info *rt;
1786 	struct fib6_table *table;
1787 
1788 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1789 	table = fib6_get_table(net, RT6_TABLE_DFLT);
1790 	if (table == NULL)
1791 		return;
1792 
1793 restart:
1794 	read_lock_bh(&table->tb6_lock);
1795 	for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1796 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1797 			dst_hold(&rt->u.dst);
1798 			read_unlock_bh(&table->tb6_lock);
1799 			ip6_del_rt(rt);
1800 			goto restart;
1801 		}
1802 	}
1803 	read_unlock_bh(&table->tb6_lock);
1804 }
1805 
1806 static void rtmsg_to_fib6_config(struct net *net,
1807 				 struct in6_rtmsg *rtmsg,
1808 				 struct fib6_config *cfg)
1809 {
1810 	memset(cfg, 0, sizeof(*cfg));
1811 
1812 	cfg->fc_table = RT6_TABLE_MAIN;
1813 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1814 	cfg->fc_metric = rtmsg->rtmsg_metric;
1815 	cfg->fc_expires = rtmsg->rtmsg_info;
1816 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1817 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1818 	cfg->fc_flags = rtmsg->rtmsg_flags;
1819 
1820 	cfg->fc_nlinfo.nl_net = net;
1821 
1822 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1823 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1824 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1825 }
1826 
1827 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1828 {
1829 	struct fib6_config cfg;
1830 	struct in6_rtmsg rtmsg;
1831 	int err;
1832 
1833 	switch(cmd) {
1834 	case SIOCADDRT:		/* Add a route */
1835 	case SIOCDELRT:		/* Delete a route */
1836 		if (!capable(CAP_NET_ADMIN))
1837 			return -EPERM;
1838 		err = copy_from_user(&rtmsg, arg,
1839 				     sizeof(struct in6_rtmsg));
1840 		if (err)
1841 			return -EFAULT;
1842 
1843 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1844 
1845 		rtnl_lock();
1846 		switch (cmd) {
1847 		case SIOCADDRT:
1848 			err = ip6_route_add(&cfg);
1849 			break;
1850 		case SIOCDELRT:
1851 			err = ip6_route_del(&cfg);
1852 			break;
1853 		default:
1854 			err = -EINVAL;
1855 		}
1856 		rtnl_unlock();
1857 
1858 		return err;
1859 	}
1860 
1861 	return -EINVAL;
1862 }
1863 
1864 /*
1865  *	Drop the packet on the floor
1866  */
1867 
1868 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1869 {
1870 	int type;
1871 	struct dst_entry *dst = skb_dst(skb);
1872 	switch (ipstats_mib_noroutes) {
1873 	case IPSTATS_MIB_INNOROUTES:
1874 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1875 		if (type == IPV6_ADDR_ANY) {
1876 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1877 				      IPSTATS_MIB_INADDRERRORS);
1878 			break;
1879 		}
1880 		/* FALLTHROUGH */
1881 	case IPSTATS_MIB_OUTNOROUTES:
1882 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1883 			      ipstats_mib_noroutes);
1884 		break;
1885 	}
1886 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1887 	kfree_skb(skb);
1888 	return 0;
1889 }
1890 
1891 static int ip6_pkt_discard(struct sk_buff *skb)
1892 {
1893 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1894 }
1895 
1896 static int ip6_pkt_discard_out(struct sk_buff *skb)
1897 {
1898 	skb->dev = skb_dst(skb)->dev;
1899 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1900 }
1901 
1902 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1903 
1904 static int ip6_pkt_prohibit(struct sk_buff *skb)
1905 {
1906 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1907 }
1908 
1909 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1910 {
1911 	skb->dev = skb_dst(skb)->dev;
1912 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1913 }
1914 
1915 #endif
1916 
1917 /*
1918  *	Allocate a dst for local (unicast / anycast) address.
1919  */
1920 
1921 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1922 				    const struct in6_addr *addr,
1923 				    int anycast)
1924 {
1925 	struct net *net = dev_net(idev->dev);
1926 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1927 	struct neighbour *neigh;
1928 
1929 	if (rt == NULL)
1930 		return ERR_PTR(-ENOMEM);
1931 
1932 	dev_hold(net->loopback_dev);
1933 	in6_dev_hold(idev);
1934 
1935 	rt->u.dst.flags = DST_HOST;
1936 	rt->u.dst.input = ip6_input;
1937 	rt->u.dst.output = ip6_output;
1938 	rt->rt6i_dev = net->loopback_dev;
1939 	rt->rt6i_idev = idev;
1940 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1941 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1942 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1943 	rt->u.dst.obsolete = -1;
1944 
1945 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1946 	if (anycast)
1947 		rt->rt6i_flags |= RTF_ANYCAST;
1948 	else
1949 		rt->rt6i_flags |= RTF_LOCAL;
1950 	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1951 	if (IS_ERR(neigh)) {
1952 		dst_free(&rt->u.dst);
1953 
1954 		/* We are casting this because that is the return
1955 		 * value type.  But an errno encoded pointer is the
1956 		 * same regardless of the underlying pointer type,
1957 		 * and that's what we are returning.  So this is OK.
1958 		 */
1959 		return (struct rt6_info *) neigh;
1960 	}
1961 	rt->rt6i_nexthop = neigh;
1962 
1963 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1964 	rt->rt6i_dst.plen = 128;
1965 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1966 
1967 	atomic_set(&rt->u.dst.__refcnt, 1);
1968 
1969 	return rt;
1970 }
1971 
/*
 * Argument bundle handed by rt6_ifdown() to the fib6_ifdown() callback.
 */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches every device */
	struct net *net;	/* namespace whose ip6_null_entry must be kept */
};
1976 
1977 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1978 {
1979 	struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1980 	struct net *net = ((struct arg_dev_net *)arg)->net;
1981 
1982 	if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1983 	    rt != net->ipv6.ip6_null_entry) {
1984 		RT6_TRACE("deleted by ifdown %p\n", rt);
1985 		return -1;
1986 	}
1987 	return 0;
1988 }
1989 
1990 void rt6_ifdown(struct net *net, struct net_device *dev)
1991 {
1992 	struct arg_dev_net adn = {
1993 		.dev = dev,
1994 		.net = net,
1995 	};
1996 
1997 	fib6_clean_all(net, fib6_ifdown, 0, &adn);
1998 	icmp6_clean_all(fib6_ifdown, &adn);
1999 }
2000 
/*
 * Argument bundle handed by rt6_mtu_change() to rt6_mtu_change_route().
 */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU */
};
2006 
2007 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2008 {
2009 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2010 	struct inet6_dev *idev;
2011 	struct net *net = dev_net(arg->dev);
2012 
2013 	/* In IPv6 pmtu discovery is not optional,
2014 	   so that RTAX_MTU lock cannot disable it.
2015 	   We still use this lock to block changes
2016 	   caused by addrconf/ndisc.
2017 	*/
2018 
2019 	idev = __in6_dev_get(arg->dev);
2020 	if (idev == NULL)
2021 		return 0;
2022 
2023 	/* For administrative MTU increase, there is no way to discover
2024 	   IPv6 PMTU increase, so PMTU increase should be updated here.
2025 	   Since RFC 1981 doesn't include administrative MTU increase
2026 	   update PMTU increase is a MUST. (i.e. jumbo frame)
2027 	 */
2028 	/*
2029 	   If new MTU is less than route PMTU, this new MTU will be the
2030 	   lowest MTU in the path, update the route PMTU to reflect PMTU
2031 	   decreases; if new MTU is greater than route PMTU, and the
2032 	   old MTU is the lowest MTU in the path, update the route PMTU
2033 	   to reflect the increase. In this case if the other nodes' MTU
2034 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
2035 	   PMTU discouvery.
2036 	 */
2037 	if (rt->rt6i_dev == arg->dev &&
2038 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
2039 	    (dst_mtu(&rt->u.dst) >= arg->mtu ||
2040 	     (dst_mtu(&rt->u.dst) < arg->mtu &&
2041 	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
2042 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
2043 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
2044 	}
2045 	return 0;
2046 }
2047 
2048 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2049 {
2050 	struct rt6_mtu_change_arg arg = {
2051 		.dev = dev,
2052 		.mtu = mtu,
2053 	};
2054 
2055 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2056 }
2057 
2058 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2059 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2060 	[RTA_OIF]               = { .type = NLA_U32 },
2061 	[RTA_IIF]		= { .type = NLA_U32 },
2062 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2063 	[RTA_METRICS]           = { .type = NLA_NESTED },
2064 };
2065 
2066 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2067 			      struct fib6_config *cfg)
2068 {
2069 	struct rtmsg *rtm;
2070 	struct nlattr *tb[RTA_MAX+1];
2071 	int err;
2072 
2073 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2074 	if (err < 0)
2075 		goto errout;
2076 
2077 	err = -EINVAL;
2078 	rtm = nlmsg_data(nlh);
2079 	memset(cfg, 0, sizeof(*cfg));
2080 
2081 	cfg->fc_table = rtm->rtm_table;
2082 	cfg->fc_dst_len = rtm->rtm_dst_len;
2083 	cfg->fc_src_len = rtm->rtm_src_len;
2084 	cfg->fc_flags = RTF_UP;
2085 	cfg->fc_protocol = rtm->rtm_protocol;
2086 
2087 	if (rtm->rtm_type == RTN_UNREACHABLE)
2088 		cfg->fc_flags |= RTF_REJECT;
2089 
2090 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2091 	cfg->fc_nlinfo.nlh = nlh;
2092 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2093 
2094 	if (tb[RTA_GATEWAY]) {
2095 		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2096 		cfg->fc_flags |= RTF_GATEWAY;
2097 	}
2098 
2099 	if (tb[RTA_DST]) {
2100 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2101 
2102 		if (nla_len(tb[RTA_DST]) < plen)
2103 			goto errout;
2104 
2105 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2106 	}
2107 
2108 	if (tb[RTA_SRC]) {
2109 		int plen = (rtm->rtm_src_len + 7) >> 3;
2110 
2111 		if (nla_len(tb[RTA_SRC]) < plen)
2112 			goto errout;
2113 
2114 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2115 	}
2116 
2117 	if (tb[RTA_OIF])
2118 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2119 
2120 	if (tb[RTA_PRIORITY])
2121 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2122 
2123 	if (tb[RTA_METRICS]) {
2124 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2125 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2126 	}
2127 
2128 	if (tb[RTA_TABLE])
2129 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2130 
2131 	err = 0;
2132 errout:
2133 	return err;
2134 }
2135 
2136 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2137 {
2138 	struct fib6_config cfg;
2139 	int err;
2140 
2141 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2142 	if (err < 0)
2143 		return err;
2144 
2145 	return ip6_route_del(&cfg);
2146 }
2147 
2148 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2149 {
2150 	struct fib6_config cfg;
2151 	int err;
2152 
2153 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2154 	if (err < 0)
2155 		return err;
2156 
2157 	return ip6_route_add(&cfg);
2158 }
2159 
2160 static inline size_t rt6_nlmsg_size(void)
2161 {
2162 	return NLMSG_ALIGN(sizeof(struct rtmsg))
2163 	       + nla_total_size(16) /* RTA_SRC */
2164 	       + nla_total_size(16) /* RTA_DST */
2165 	       + nla_total_size(16) /* RTA_GATEWAY */
2166 	       + nla_total_size(16) /* RTA_PREFSRC */
2167 	       + nla_total_size(4) /* RTA_TABLE */
2168 	       + nla_total_size(4) /* RTA_IIF */
2169 	       + nla_total_size(4) /* RTA_OIF */
2170 	       + nla_total_size(4) /* RTA_PRIORITY */
2171 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2172 	       + nla_total_size(sizeof(struct rta_cacheinfo));
2173 }
2174 
/*
 * Append a route message of netlink type @type describing @rt to @skb.
 *
 * @dst/@src:	when non-NULL, reported as /128 destination/source instead
 *		of the route's own prefix (src only with
 *		CONFIG_IPV6_SUBTREES).
 * @iif:	input interface to report; when 0 and @dst is given, a
 *		preferred source address is looked up and reported instead.
 * @prefix:	when non-zero, only routes flagged RTF_PREFIX_RT are
 *		written.
 * @nowait:	passed to ip6mr_get_route() for multicast destinations.
 *
 * Returns 1 when the route was skipped because of @prefix, -EMSGSIZE when
 * the message does not fit (the partial message is cancelled), 0 when
 * ip6mr_get_route() returned 0 with @nowait unset, and otherwise the
 * result of nlmsg_end().
 *
 * Note: the NLA_PUT*() macros are expected to jump to nla_put_failure
 * when @skb has no room left.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 pid, u32 seq,
			 int prefix, int nowait, unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	if (prefix) {	/* user wants prefix routes only */
		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* The table id goes into both the header and an RTA_TABLE attribute. */
	rtm->rtm_table = table;
	NLA_PUT_U32(skb, RTA_TABLE, table);
	if (rt->rt6i_flags&RTF_REJECT)
		rtm->rtm_type = RTN_UNREACHABLE;
	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	/* The stored protocol is overridden for dynamic/addrconf/default routes. */
	rtm->rtm_protocol = rt->rt6i_protocol;
	if (rt->rt6i_flags&RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (rt->rt6i_flags & RTF_ADDRCONF)
		rtm->rtm_protocol = RTPROT_KERNEL;
	else if (rt->rt6i_flags&RTF_DEFAULT)
		rtm->rtm_protocol = RTPROT_RA;

	if (rt->rt6i_flags&RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* An explicit dst/src is reported as a full /128 address. */
	if (dst) {
		NLA_PUT(skb, RTA_DST, 16, dst);
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		NLA_PUT(skb, RTA_SRC, 16, src);
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len)
		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* Multicast destination: let the multicast router fill in its part. */
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, iif);
	} else if (dst) {
		/* No input interface: report the source address we would pick for dst. */
		struct inet6_dev *idev = ip6_dst_idev(&rt->u.dst);
		struct in6_addr saddr_buf;
		if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
				       dst, 0, &saddr_buf) == 0)
			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
	}

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	/* The gateway is taken from the neighbour entry bound to the route. */
	if (rt->u.dst.neighbour)
		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);

	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);

	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);

	/* Remaining lifetime in jiffies, 0 if not expiring, capped at INT_MAX. */
	if (!(rt->rt6i_flags & RTF_EXPIRES))
		expires = 0;
	else if (rt->rt6i_expires - jiffies < INT_MAX)
		expires = rt->rt6i_expires - jiffies;
	else
		expires = INT_MAX;

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
			       expires, rt->u.dst.error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	/* Undo the partially built message. */
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2292 
2293 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2294 {
2295 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2296 	int prefix;
2297 
2298 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2299 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2300 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2301 	} else
2302 		prefix = 0;
2303 
2304 	return rt6_fill_node(arg->net,
2305 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2306 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2307 		     prefix, 0, NLM_F_MULTI);
2308 }
2309 
2310 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2311 {
2312 	struct net *net = sock_net(in_skb->sk);
2313 	struct nlattr *tb[RTA_MAX+1];
2314 	struct rt6_info *rt;
2315 	struct sk_buff *skb;
2316 	struct rtmsg *rtm;
2317 	struct flowi fl;
2318 	int err, iif = 0;
2319 
2320 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2321 	if (err < 0)
2322 		goto errout;
2323 
2324 	err = -EINVAL;
2325 	memset(&fl, 0, sizeof(fl));
2326 
2327 	if (tb[RTA_SRC]) {
2328 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2329 			goto errout;
2330 
2331 		ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2332 	}
2333 
2334 	if (tb[RTA_DST]) {
2335 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2336 			goto errout;
2337 
2338 		ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2339 	}
2340 
2341 	if (tb[RTA_IIF])
2342 		iif = nla_get_u32(tb[RTA_IIF]);
2343 
2344 	if (tb[RTA_OIF])
2345 		fl.oif = nla_get_u32(tb[RTA_OIF]);
2346 
2347 	if (iif) {
2348 		struct net_device *dev;
2349 		dev = __dev_get_by_index(net, iif);
2350 		if (!dev) {
2351 			err = -ENODEV;
2352 			goto errout;
2353 		}
2354 	}
2355 
2356 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2357 	if (skb == NULL) {
2358 		err = -ENOBUFS;
2359 		goto errout;
2360 	}
2361 
2362 	/* Reserve room for dummy headers, this skb can pass
2363 	   through good chunk of routing engine.
2364 	 */
2365 	skb_reset_mac_header(skb);
2366 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2367 
2368 	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2369 	skb_dst_set(skb, &rt->u.dst);
2370 
2371 	err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2372 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2373 			    nlh->nlmsg_seq, 0, 0, 0);
2374 	if (err < 0) {
2375 		kfree_skb(skb);
2376 		goto errout;
2377 	}
2378 
2379 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2380 errout:
2381 	return err;
2382 }
2383 
2384 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2385 {
2386 	struct sk_buff *skb;
2387 	struct net *net = info->nl_net;
2388 	u32 seq;
2389 	int err;
2390 
2391 	err = -ENOBUFS;
2392 	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2393 
2394 	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2395 	if (skb == NULL)
2396 		goto errout;
2397 
2398 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2399 				event, info->pid, seq, 0, 0, 0);
2400 	if (err < 0) {
2401 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2402 		WARN_ON(err == -EMSGSIZE);
2403 		kfree_skb(skb);
2404 		goto errout;
2405 	}
2406 	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2407 		    info->nlh, gfp_any());
2408 	return;
2409 errout:
2410 	if (err < 0)
2411 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2412 }
2413 
2414 static int ip6_route_dev_notify(struct notifier_block *this,
2415 				unsigned long event, void *data)
2416 {
2417 	struct net_device *dev = (struct net_device *)data;
2418 	struct net *net = dev_net(dev);
2419 
2420 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2421 		net->ipv6.ip6_null_entry->u.dst.dev = dev;
2422 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2423 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2424 		net->ipv6.ip6_prohibit_entry->u.dst.dev = dev;
2425 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2426 		net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev;
2427 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2428 #endif
2429 	}
2430 
2431 	return NOTIFY_OK;
2432 }
2433 
2434 /*
2435  *	/proc
2436  */
2437 
2438 #ifdef CONFIG_PROC_FS
2439 
/*
 * NOTE(review): presumably the length of one /proc/net/ipv6_route line
 * from an older, buffer-based implementation; it is not referenced in the
 * part of the file visible here -- confirm before relying on or removing it.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/*
 * NOTE(review): looks like leftover state for a buffer-based /proc reader.
 * The seq_file based rt6_info_route() below does not use it -- confirm
 * whether anything else in the file still does.
 */
struct rt6_proc_arg
{
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2450 
2451 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2452 {
2453 	struct seq_file *m = p_arg;
2454 
2455 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2456 
2457 #ifdef CONFIG_IPV6_SUBTREES
2458 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2459 #else
2460 	seq_puts(m, "00000000000000000000000000000000 00 ");
2461 #endif
2462 
2463 	if (rt->rt6i_nexthop) {
2464 		seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2465 	} else {
2466 		seq_puts(m, "00000000000000000000000000000000");
2467 	}
2468 	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2469 		   rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2470 		   rt->u.dst.__use, rt->rt6i_flags,
2471 		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2472 	return 0;
2473 }
2474 
2475 static int ipv6_route_show(struct seq_file *m, void *v)
2476 {
2477 	struct net *net = (struct net *)m->private;
2478 	fib6_clean_all(net, rt6_info_route, 0, m);
2479 	return 0;
2480 }
2481 
/*
 * open() for the "ipv6_route" proc file: sets up a single-shot, per-net
 * seq_file whose content is produced by ipv6_route_show().
 */
static int ipv6_route_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, ipv6_route_show);
}
2486 
/*
 * File operations for the "ipv6_route" proc file (registered in
 * ip6_route_net_init()); everything except open is the generic seq_file
 * plumbing, paired with single_release_net() for the per-net open.
 */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release_net,
};
2494 
2495 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2496 {
2497 	struct net *net = (struct net *)seq->private;
2498 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2499 		   net->ipv6.rt6_stats->fib_nodes,
2500 		   net->ipv6.rt6_stats->fib_route_nodes,
2501 		   net->ipv6.rt6_stats->fib_rt_alloc,
2502 		   net->ipv6.rt6_stats->fib_rt_entries,
2503 		   net->ipv6.rt6_stats->fib_rt_cache,
2504 		   atomic_read(&net->ipv6.ip6_dst_ops.entries),
2505 		   net->ipv6.rt6_stats->fib_discarded_routes);
2506 
2507 	return 0;
2508 }
2509 
/*
 * open() for the "rt6_stats" proc file: sets up a single-shot, per-net
 * seq_file whose content is produced by rt6_stats_seq_show().
 */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
2514 
/*
 * File operations for the "rt6_stats" proc file (registered in
 * ip6_route_net_init()); generic seq_file plumbing around
 * rt6_stats_seq_open().
 */
static const struct file_operations rt6_stats_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
2522 #endif	/* CONFIG_PROC_FS */
2523 
2524 #ifdef CONFIG_SYSCTL
2525 
2526 static
2527 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2528 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2529 {
2530 	struct net *net = current->nsproxy->net_ns;
2531 	int delay = net->ipv6.sysctl.flush_delay;
2532 	if (write) {
2533 		proc_dointvec(ctl, write, buffer, lenp, ppos);
2534 		fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2535 		return 0;
2536 	} else
2537 		return -EINVAL;
2538 }
2539 
2540 ctl_table ipv6_route_table_template[] = {
2541 	{
2542 		.procname	=	"flush",
2543 		.data		=	&init_net.ipv6.sysctl.flush_delay,
2544 		.maxlen		=	sizeof(int),
2545 		.mode		=	0200,
2546 		.proc_handler	=	ipv6_sysctl_rtcache_flush
2547 	},
2548 	{
2549 		.procname	=	"gc_thresh",
2550 		.data		=	&ip6_dst_ops_template.gc_thresh,
2551 		.maxlen		=	sizeof(int),
2552 		.mode		=	0644,
2553 		.proc_handler	=	proc_dointvec,
2554 	},
2555 	{
2556 		.procname	=	"max_size",
2557 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
2558 		.maxlen		=	sizeof(int),
2559 		.mode		=	0644,
2560 		.proc_handler	=	proc_dointvec,
2561 	},
2562 	{
2563 		.procname	=	"gc_min_interval",
2564 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2565 		.maxlen		=	sizeof(int),
2566 		.mode		=	0644,
2567 		.proc_handler	=	proc_dointvec_jiffies,
2568 	},
2569 	{
2570 		.procname	=	"gc_timeout",
2571 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2572 		.maxlen		=	sizeof(int),
2573 		.mode		=	0644,
2574 		.proc_handler	=	proc_dointvec_jiffies,
2575 	},
2576 	{
2577 		.procname	=	"gc_interval",
2578 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
2579 		.maxlen		=	sizeof(int),
2580 		.mode		=	0644,
2581 		.proc_handler	=	proc_dointvec_jiffies,
2582 	},
2583 	{
2584 		.procname	=	"gc_elasticity",
2585 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2586 		.maxlen		=	sizeof(int),
2587 		.mode		=	0644,
2588 		.proc_handler	=	proc_dointvec_jiffies,
2589 	},
2590 	{
2591 		.procname	=	"mtu_expires",
2592 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2593 		.maxlen		=	sizeof(int),
2594 		.mode		=	0644,
2595 		.proc_handler	=	proc_dointvec_jiffies,
2596 	},
2597 	{
2598 		.procname	=	"min_adv_mss",
2599 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
2600 		.maxlen		=	sizeof(int),
2601 		.mode		=	0644,
2602 		.proc_handler	=	proc_dointvec_jiffies,
2603 	},
2604 	{
2605 		.procname	=	"gc_min_interval_ms",
2606 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2607 		.maxlen		=	sizeof(int),
2608 		.mode		=	0644,
2609 		.proc_handler	=	proc_dointvec_ms_jiffies,
2610 	},
2611 	{ }
2612 };
2613 
2614 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2615 {
2616 	struct ctl_table *table;
2617 
2618 	table = kmemdup(ipv6_route_table_template,
2619 			sizeof(ipv6_route_table_template),
2620 			GFP_KERNEL);
2621 
2622 	if (table) {
2623 		table[0].data = &net->ipv6.sysctl.flush_delay;
2624 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2625 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2626 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2627 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2628 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2629 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2630 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2631 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2632 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2633 	}
2634 
2635 	return table;
2636 }
2637 #endif
2638 
/*
 * Per-namespace initialisation of the IPv6 routing layer.
 *
 * Copies the dst_ops template into @net, allocates private copies of the
 * null route entry (and, with CONFIG_IPV6_MULTIPLE_TABLES, of the prohibit
 * and blackhole entries), sets the routing sysctl defaults and creates the
 * "ipv6_route" and "rt6_stats" proc files.
 *
 * Returns 0 on success or -ENOMEM if one of the route entries cannot be
 * allocated; entries allocated before the failure are freed again.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_ops;
	/* each special entry is its own dst path and uses this netns' dst_ops */
	net->ipv6.ip6_null_entry->u.dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->u.dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->u.dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;
#endif

	/* sysctl defaults; the time-valued ones are expressed in jiffies */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

#ifdef CONFIG_PROC_FS
	/* the return values are not checked: a missing proc file is not fatal here */
	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
#endif
	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* error unwinding: free in reverse order of allocation, ret is still -ENOMEM */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_ops:
	goto out;
}
2703 
/*
 * Per-namespace teardown, the counterpart of ip6_route_net_init(): removes
 * the "ipv6_route" and "rt6_stats" proc files and frees the namespace's
 * private copies of the special route entries.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_net_remove(net, "ipv6_route");
	proc_net_remove(net, "rt6_stats");
#endif
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
}
2716 
/*
 * Per-namespace init/exit hooks of the IPv6 routing layer; registered in
 * ip6_route_init() and unregistered in ip6_route_cleanup().
 */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
2721 
/*
 * Netdevice notifier that routes device events to ip6_route_dev_notify();
 * registered in ip6_route_init() and unregistered in ip6_route_cleanup().
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = 0,
};
2726 
/*
 * One-time initialisation of the IPv6 routing layer.
 *
 * In order: creates the "ip6_dst_cache" slab cache, registers the
 * per-namespace operations, attaches init_net's special route entries to
 * the loopback device, initialises fib6, xfrm6 and the fib6 rules, registers
 * the PF_INET6 rtnetlink route handlers and finally the netdevice notifier.
 *
 * Returns 0 on success or a negative errno; on failure the steps that
 * already succeeded are undone through the label chain at the bottom.
 */
int __init ip6_route_init(void)
{
	int ret;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_kmem_cache;

	/* the blackhole dst_ops share the same slab cache */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	/* any failing rtnetlink registration is reported as -ENOBUFS */
	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
		goto fib6_rules_init;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto fib6_rules_init;

out:
	return ret;

	/*
	 * Error unwinding.  Each label is named after the step that has to be
	 * undone first and falls through to undo the earlier ones.
	 */
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
2792 
/*
 * Tear down the IPv6 routing layer: undoes the registrations made by
 * ip6_route_init() in reverse order and destroys the dst slab cache.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ip6_route_net_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
2802