xref: /linux/net/ipv6/route.c (revision 092e0e7e520a1fca03e13c9f2d157432a8657ff2)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
66 
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74 
75 #define CLONE_OFFLINK_ROUTE 0
76 
77 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
78 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void		ip6_dst_destroy(struct dst_entry *);
81 static void		ip6_dst_ifdown(struct dst_entry *,
82 				       struct net_device *dev, int how);
83 static int		 ip6_dst_gc(struct dst_ops *ops);
84 
85 static int		ip6_pkt_discard(struct sk_buff *skb);
86 static int		ip6_pkt_discard_out(struct sk_buff *skb);
87 static void		ip6_link_failure(struct sk_buff *skb);
88 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89 
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92 					   struct in6_addr *prefix, int prefixlen,
93 					   struct in6_addr *gwaddr, int ifindex,
94 					   unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96 					   struct in6_addr *prefix, int prefixlen,
97 					   struct in6_addr *gwaddr, int ifindex);
98 #endif
99 
100 static struct dst_ops ip6_dst_ops_template = {
101 	.family			=	AF_INET6,
102 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
103 	.gc			=	ip6_dst_gc,
104 	.gc_thresh		=	1024,
105 	.check			=	ip6_dst_check,
106 	.destroy		=	ip6_dst_destroy,
107 	.ifdown			=	ip6_dst_ifdown,
108 	.negative_advice	=	ip6_negative_advice,
109 	.link_failure		=	ip6_link_failure,
110 	.update_pmtu		=	ip6_rt_update_pmtu,
111 	.local_out		=	__ip6_local_out,
112 	.entries		=	ATOMIC_INIT(0),
113 };
114 
115 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
116 {
117 }
118 
119 static struct dst_ops ip6_dst_blackhole_ops = {
120 	.family			=	AF_INET6,
121 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
122 	.destroy		=	ip6_dst_destroy,
123 	.check			=	ip6_dst_check,
124 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
125 	.entries		=	ATOMIC_INIT(0),
126 };
127 
128 static struct rt6_info ip6_null_entry_template = {
129 	.dst = {
130 		.__refcnt	= ATOMIC_INIT(1),
131 		.__use		= 1,
132 		.obsolete	= -1,
133 		.error		= -ENETUNREACH,
134 		.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
135 		.input		= ip6_pkt_discard,
136 		.output		= ip6_pkt_discard_out,
137 	},
138 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
139 	.rt6i_protocol  = RTPROT_KERNEL,
140 	.rt6i_metric	= ~(u32) 0,
141 	.rt6i_ref	= ATOMIC_INIT(1),
142 };
143 
144 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
145 
146 static int ip6_pkt_prohibit(struct sk_buff *skb);
147 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
148 
149 static struct rt6_info ip6_prohibit_entry_template = {
150 	.dst = {
151 		.__refcnt	= ATOMIC_INIT(1),
152 		.__use		= 1,
153 		.obsolete	= -1,
154 		.error		= -EACCES,
155 		.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
156 		.input		= ip6_pkt_prohibit,
157 		.output		= ip6_pkt_prohibit_out,
158 	},
159 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
160 	.rt6i_protocol  = RTPROT_KERNEL,
161 	.rt6i_metric	= ~(u32) 0,
162 	.rt6i_ref	= ATOMIC_INIT(1),
163 };
164 
165 static struct rt6_info ip6_blk_hole_entry_template = {
166 	.dst = {
167 		.__refcnt	= ATOMIC_INIT(1),
168 		.__use		= 1,
169 		.obsolete	= -1,
170 		.error		= -EINVAL,
171 		.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
172 		.input		= dst_discard,
173 		.output		= dst_discard,
174 	},
175 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
176 	.rt6i_protocol  = RTPROT_KERNEL,
177 	.rt6i_metric	= ~(u32) 0,
178 	.rt6i_ref	= ATOMIC_INIT(1),
179 };
180 
181 #endif
182 
183 /* allocate dst with ip6_dst_ops */
184 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
185 {
186 	return (struct rt6_info *)dst_alloc(ops);
187 }
188 
189 static void ip6_dst_destroy(struct dst_entry *dst)
190 {
191 	struct rt6_info *rt = (struct rt6_info *)dst;
192 	struct inet6_dev *idev = rt->rt6i_idev;
193 
194 	if (idev != NULL) {
195 		rt->rt6i_idev = NULL;
196 		in6_dev_put(idev);
197 	}
198 }
199 
200 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
201 			   int how)
202 {
203 	struct rt6_info *rt = (struct rt6_info *)dst;
204 	struct inet6_dev *idev = rt->rt6i_idev;
205 	struct net_device *loopback_dev =
206 		dev_net(dev)->loopback_dev;
207 
208 	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
209 		struct inet6_dev *loopback_idev =
210 			in6_dev_get(loopback_dev);
211 		if (loopback_idev != NULL) {
212 			rt->rt6i_idev = loopback_idev;
213 			in6_dev_put(idev);
214 		}
215 	}
216 }
217 
218 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
219 {
220 	return (rt->rt6i_flags & RTF_EXPIRES &&
221 		time_after(jiffies, rt->rt6i_expires));
222 }
223 
224 static inline int rt6_need_strict(struct in6_addr *daddr)
225 {
226 	return (ipv6_addr_type(daddr) &
227 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK));
228 }
229 
230 /*
231  *	Route lookup. Any table->tb6_lock is implied.
232  */
233 
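/*
 * rt6_device_match - walk the route list starting at @rt and return the
 * first entry whose device matches @oif, falling back to a matching
 * loopback route or, under RT6_LOOKUP_F_IFACE, to the null entry.  When
 * no @oif is given, prefer a route whose device has @saddr configured;
 * otherwise the head of the list is returned unchanged.
 */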
234 static inline struct rt6_info *rt6_device_match(struct net *net,
235 						    struct rt6_info *rt,
236 						    struct in6_addr *saddr,
237 						    int oif,
238 						    int flags)
239 {
240 	struct rt6_info *local = NULL;
241 	struct rt6_info *sprt;
242 
243 	if (!oif && ipv6_addr_any(saddr))
244 		goto out;
245 
246 	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
247 		struct net_device *dev = sprt->rt6i_dev;
248 
249 		if (oif) {
250 			if (dev->ifindex == oif)
251 				return sprt;
252 			if (dev->flags & IFF_LOOPBACK) {
253 				if (sprt->rt6i_idev == NULL ||
254 				    sprt->rt6i_idev->dev->ifindex != oif) {
255 					if (flags & RT6_LOOKUP_F_IFACE && oif)
256 						continue;
257 					if (local && (!oif ||
258 						      local->rt6i_idev->dev->ifindex == oif))
259 						continue;
260 				}
261 				local = sprt;
262 			}
263 		} else {
264 			if (ipv6_chk_addr(net, saddr, dev,
265 					  flags & RT6_LOOKUP_F_IFACE))
266 				return sprt;
267 		}
268 	}
269 
270 	if (oif) {
271 		if (local)
272 			return local;
273 
274 		if (flags & RT6_LOOKUP_F_IFACE)
275 			return net->ipv6.ip6_null_entry;
276 	}
277 out:
278 	return rt;
279 }
280 
281 #ifdef CONFIG_IPV6_ROUTER_PREF
282 static void rt6_probe(struct rt6_info *rt)
283 {
284 	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
285 	/*
286 	 * Okay, this does not seem to be appropriate for now;
287 	 * however, we need to check whether it really is.
288 	 * This is Router Reachability Probing.
289 	 *
290 	 * Router Reachability Probe MUST be rate-limited
291 	 * to no more than one per minute.
292 	 */
293 	if (!neigh || (neigh->nud_state & NUD_VALID))
294 		return;
295 	read_lock_bh(&neigh->lock);
296 	if (!(neigh->nud_state & NUD_VALID) &&
297 	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
298 		struct in6_addr mcaddr;
299 		struct in6_addr *target;
300 
301 		neigh->updated = jiffies;
302 		read_unlock_bh(&neigh->lock);
303 
304 		target = (struct in6_addr *)&neigh->primary_key;
305 		addrconf_addr_solict_mult(target, &mcaddr);
306 		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
307 	} else
308 		read_unlock_bh(&neigh->lock);
309 }
310 #else
311 static inline void rt6_probe(struct rt6_info *rt)
312 {
313 }
314 #endif
315 
316 /*
317  * Default Router Selection (RFC 2461 6.3.6)
318  */
319 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
320 {
321 	struct net_device *dev = rt->rt6i_dev;
322 	if (!oif || dev->ifindex == oif)
323 		return 2;
324 	if ((dev->flags & IFF_LOOPBACK) &&
325 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
326 		return 1;
327 	return 0;
328 }
329 
330 static inline int rt6_check_neigh(struct rt6_info *rt)
331 {
332 	struct neighbour *neigh = rt->rt6i_nexthop;
333 	int m;
334 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
335 	    !(rt->rt6i_flags & RTF_GATEWAY))
336 		m = 1;
337 	else if (neigh) {
338 		read_lock_bh(&neigh->lock);
339 		if (neigh->nud_state & NUD_VALID)
340 			m = 2;
341 #ifdef CONFIG_IPV6_ROUTER_PREF
342 		else if (neigh->nud_state & NUD_FAILED)
343 			m = 0;
344 #endif
345 		else
346 			m = 1;
347 		read_unlock_bh(&neigh->lock);
348 	} else
349 		m = 0;
350 	return m;
351 }
352 
353 static int rt6_score_route(struct rt6_info *rt, int oif,
354 			   int strict)
355 {
356 	int m, n;
357 
358 	m = rt6_check_dev(rt, oif);
359 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
360 		return -1;
361 #ifdef CONFIG_IPV6_ROUTER_PREF
362 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
363 #endif
364 	n = rt6_check_neigh(rt);
365 	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
366 		return -1;
367 	return m;
368 }
369 
370 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
371 				   int *mpri, struct rt6_info *match)
372 {
373 	int m;
374 
375 	if (rt6_check_expired(rt))
376 		goto out;
377 
378 	m = rt6_score_route(rt, oif, strict);
379 	if (m < 0)
380 		goto out;
381 
382 	if (m > *mpri) {
383 		if (strict & RT6_LOOKUP_F_REACHABLE)
384 			rt6_probe(match);
385 		*mpri = m;
386 		match = rt;
387 	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
388 		rt6_probe(rt);
389 	}
390 
391 out:
392 	return match;
393 }
394 
395 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
396 				     struct rt6_info *rr_head,
397 				     u32 metric, int oif, int strict)
398 {
399 	struct rt6_info *rt, *match;
400 	int mpri = -1;
401 
402 	match = NULL;
403 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
404 	     rt = rt->dst.rt6_next)
405 		match = find_match(rt, oif, strict, &mpri, match);
406 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
407 	     rt = rt->dst.rt6_next)
408 		match = find_match(rt, oif, strict, &mpri, match);
409 
410 	return match;
411 }
412 
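/*
 * rt6_select - choose the best route among the entries sharing the
 * current metric at this fib6 node.  Entries are scored by interface
 * match, router preference and neighbour reachability; when nothing
 * reachable is found and RT6_LOOKUP_F_REACHABLE is set, the node's
 * round-robin pointer is advanced so a different router is tried next
 * time.  Returns the per-namespace null entry if no route matches.
 */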
413 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
414 {
415 	struct rt6_info *match, *rt0;
416 	struct net *net;
417 
418 	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
419 		  __func__, fn->leaf, oif);
420 
421 	rt0 = fn->rr_ptr;
422 	if (!rt0)
423 		fn->rr_ptr = rt0 = fn->leaf;
424 
425 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
426 
427 	if (!match &&
428 	    (strict & RT6_LOOKUP_F_REACHABLE)) {
429 		struct rt6_info *next = rt0->dst.rt6_next;
430 
431 		/* no entries matched; do round-robin */
432 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
433 			next = fn->leaf;
434 
435 		if (next != rt0)
436 			fn->rr_ptr = next;
437 	}
438 
439 	RT6_TRACE("%s() => %p\n",
440 		  __func__, match);
441 
442 	net = dev_net(rt0->rt6i_dev);
443 	return (match ? match : net->ipv6.ip6_null_entry);
444 }
445 
446 #ifdef CONFIG_IPV6_ROUTE_INFO
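/*
 * rt6_route_rcv - handle a Route Information option received in a
 * Router Advertisement: validate the option and prefix lengths, then
 * add, refresh or (on zero lifetime) delete the matching RTF_ROUTEINFO
 * route with the advertised preference and lifetime.
 */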
447 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
448 		  struct in6_addr *gwaddr)
449 {
450 	struct net *net = dev_net(dev);
451 	struct route_info *rinfo = (struct route_info *) opt;
452 	struct in6_addr prefix_buf, *prefix;
453 	unsigned int pref;
454 	unsigned long lifetime;
455 	struct rt6_info *rt;
456 
457 	if (len < sizeof(struct route_info)) {
458 		return -EINVAL;
459 	}
460 
461 	/* Sanity check for prefix_len and length */
462 	if (rinfo->length > 3) {
463 		return -EINVAL;
464 	} else if (rinfo->prefix_len > 128) {
465 		return -EINVAL;
466 	} else if (rinfo->prefix_len > 64) {
467 		if (rinfo->length < 2) {
468 			return -EINVAL;
469 		}
470 	} else if (rinfo->prefix_len > 0) {
471 		if (rinfo->length < 1) {
472 			return -EINVAL;
473 		}
474 	}
475 
476 	pref = rinfo->route_pref;
477 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
478 		return -EINVAL;
479 
480 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
481 
482 	if (rinfo->length == 3)
483 		prefix = (struct in6_addr *)rinfo->prefix;
484 	else {
485 		/* this function is safe */
486 		ipv6_addr_prefix(&prefix_buf,
487 				 (struct in6_addr *)rinfo->prefix,
488 				 rinfo->prefix_len);
489 		prefix = &prefix_buf;
490 	}
491 
492 	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
493 				dev->ifindex);
494 
495 	if (rt && !lifetime) {
496 		ip6_del_rt(rt);
497 		rt = NULL;
498 	}
499 
500 	if (!rt && lifetime)
501 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
502 					pref);
503 	else if (rt)
504 		rt->rt6i_flags = RTF_ROUTEINFO |
505 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
506 
507 	if (rt) {
508 		if (!addrconf_finite_timeout(lifetime)) {
509 			rt->rt6i_flags &= ~RTF_EXPIRES;
510 		} else {
511 			rt->rt6i_expires = jiffies + HZ * lifetime;
512 			rt->rt6i_flags |= RTF_EXPIRES;
513 		}
514 		dst_release(&rt->dst);
515 	}
516 	return 0;
517 }
518 #endif
519 
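/*
 * BACKTRACK() is shared by the lookup functions below.  If the lookup
 * produced only the per-namespace null entry, it walks back up the
 * fib6 tree (descending into source-routed subtrees where configured)
 * and jumps to the caller's "restart" label at the first ancestor that
 * carries route info, or to "out" once the tree root is reached.  The
 * expected calling pattern (see ip6_pol_route_lookup() below) is
 * roughly:
 *
 *	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
 * restart:
 *	rt = <select a route at fn>;
 *	BACKTRACK(net, &fl->fl6_src);
 * out:
 *	<use rt>;
 */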
520 #define BACKTRACK(__net, saddr)			\
521 do { \
522 	if (rt == __net->ipv6.ip6_null_entry) {	\
523 		struct fib6_node *pn; \
524 		while (1) { \
525 			if (fn->fn_flags & RTN_TL_ROOT) \
526 				goto out; \
527 			pn = fn->parent; \
528 			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
529 				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
530 			else \
531 				fn = pn; \
532 			if (fn->fn_flags & RTN_RTINFO) \
533 				goto restart; \
534 		} \
535 	} \
536 } while(0)
537 
538 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
539 					     struct fib6_table *table,
540 					     struct flowi *fl, int flags)
541 {
542 	struct fib6_node *fn;
543 	struct rt6_info *rt;
544 
545 	read_lock_bh(&table->tb6_lock);
546 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
547 restart:
548 	rt = fn->leaf;
549 	rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
550 	BACKTRACK(net, &fl->fl6_src);
551 out:
552 	dst_use(&rt->dst, jiffies);
553 	read_unlock_bh(&table->tb6_lock);
554 	return rt;
555 
556 }
557 
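/*
 * rt6_lookup - exported convenience wrapper around the policy lookup.
 * Returns a referenced rt6_info, or NULL when the resulting route
 * carries an error (e.g. the null entry).  ip6_route_add() below uses
 * it, for instance, to check that a configured gateway is reachable.
 */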
558 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
559 			    const struct in6_addr *saddr, int oif, int strict)
560 {
561 	struct flowi fl = {
562 		.oif = oif,
563 		.nl_u = {
564 			.ip6_u = {
565 				.daddr = *daddr,
566 			},
567 		},
568 	};
569 	struct dst_entry *dst;
570 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
571 
572 	if (saddr) {
573 		memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
574 		flags |= RT6_LOOKUP_F_HAS_SADDR;
575 	}
576 
577 	dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
578 	if (dst->error == 0)
579 		return (struct rt6_info *) dst;
580 
581 	dst_release(dst);
582 
583 	return NULL;
584 }
585 
586 EXPORT_SYMBOL(rt6_lookup);
587 
588 /* ip6_ins_rt is called with table->tb6_lock NOT held (free).
589    It takes a new route entry; if the addition fails for any reason,
590    the route is freed.  In any case, if the caller does not hold a
591    reference, the route may be destroyed.
592  */
593 
594 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
595 {
596 	int err;
597 	struct fib6_table *table;
598 
599 	table = rt->rt6i_table;
600 	write_lock_bh(&table->tb6_lock);
601 	err = fib6_add(&table->tb6_root, rt, info);
602 	write_unlock_bh(&table->tb6_lock);
603 
604 	return err;
605 }
606 
607 int ip6_ins_rt(struct rt6_info *rt)
608 {
609 	struct nl_info info = {
610 		.nl_net = dev_net(rt->rt6i_dev),
611 	};
612 	return __ip6_ins_rt(rt, &info);
613 }
614 
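/*
 * rt6_alloc_cow - copy @ort into a host (/128) RTF_CACHE route bound to
 * @daddr and resolve its nexthop neighbour.  If the neighbour table is
 * full, route GC is forced (at most once, and only when not running in
 * softirq context) before the clone is dropped and NULL is returned.
 */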
615 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
616 				      struct in6_addr *saddr)
617 {
618 	struct rt6_info *rt;
619 
620 	/*
621 	 *	Clone the route.
622 	 */
623 
624 	rt = ip6_rt_copy(ort);
625 
626 	if (rt) {
627 		struct neighbour *neigh;
628 		int attempts = !in_softirq();
629 
630 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
631 			if (rt->rt6i_dst.plen != 128 &&
632 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
633 				rt->rt6i_flags |= RTF_ANYCAST;
634 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
635 		}
636 
637 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
638 		rt->rt6i_dst.plen = 128;
639 		rt->rt6i_flags |= RTF_CACHE;
640 		rt->dst.flags |= DST_HOST;
641 
642 #ifdef CONFIG_IPV6_SUBTREES
643 		if (rt->rt6i_src.plen && saddr) {
644 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
645 			rt->rt6i_src.plen = 128;
646 		}
647 #endif
648 
649 	retry:
650 		neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
651 		if (IS_ERR(neigh)) {
652 			struct net *net = dev_net(rt->rt6i_dev);
653 			int saved_rt_min_interval =
654 				net->ipv6.sysctl.ip6_rt_gc_min_interval;
655 			int saved_rt_elasticity =
656 				net->ipv6.sysctl.ip6_rt_gc_elasticity;
657 
658 			if (attempts-- > 0) {
659 				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
660 				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
661 
662 				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
663 
664 				net->ipv6.sysctl.ip6_rt_gc_elasticity =
665 					saved_rt_elasticity;
666 				net->ipv6.sysctl.ip6_rt_gc_min_interval =
667 					saved_rt_min_interval;
668 				goto retry;
669 			}
670 
671 			if (net_ratelimit())
672 				printk(KERN_WARNING
673 				       "ipv6: Neighbour table overflow.\n");
674 			dst_free(&rt->dst);
675 			return NULL;
676 		}
677 		rt->rt6i_nexthop = neigh;
678 
679 	}
680 
681 	return rt;
682 }
683 
684 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
685 {
686 	struct rt6_info *rt = ip6_rt_copy(ort);
687 	if (rt) {
688 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
689 		rt->rt6i_dst.plen = 128;
690 		rt->rt6i_flags |= RTF_CACHE;
691 		rt->dst.flags |= DST_HOST;
692 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
693 	}
694 	return rt;
695 }
696 
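/*
 * ip6_pol_route - core lookup shared by the input and output paths.
 * It selects the best route with rt6_select() and, when that route has
 * no nexthop neighbour yet (and is not RTF_NONEXTHOP), cows it into an
 * RTF_CACHE host entry and inserts that into the table, retrying a few
 * times if a parallel insertion wins the race.  Routes that already
 * have a nexthop are used as-is unless CLONE_OFFLINK_ROUTE is enabled.
 * If nothing reachable is found, the lookup is repeated without
 * RT6_LOOKUP_F_REACHABLE.
 */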
697 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
698 				      struct flowi *fl, int flags)
699 {
700 	struct fib6_node *fn;
701 	struct rt6_info *rt, *nrt;
702 	int strict = 0;
703 	int attempts = 3;
704 	int err;
705 	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
706 
707 	strict |= flags & RT6_LOOKUP_F_IFACE;
708 
709 relookup:
710 	read_lock_bh(&table->tb6_lock);
711 
712 restart_2:
713 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
714 
715 restart:
716 	rt = rt6_select(fn, oif, strict | reachable);
717 
718 	BACKTRACK(net, &fl->fl6_src);
719 	if (rt == net->ipv6.ip6_null_entry ||
720 	    rt->rt6i_flags & RTF_CACHE)
721 		goto out;
722 
723 	dst_hold(&rt->dst);
724 	read_unlock_bh(&table->tb6_lock);
725 
726 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
727 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
728 	else {
729 #if CLONE_OFFLINK_ROUTE
730 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
731 #else
732 		goto out2;
733 #endif
734 	}
735 
736 	dst_release(&rt->dst);
737 	rt = nrt ? : net->ipv6.ip6_null_entry;
738 
739 	dst_hold(&rt->dst);
740 	if (nrt) {
741 		err = ip6_ins_rt(nrt);
742 		if (!err)
743 			goto out2;
744 	}
745 
746 	if (--attempts <= 0)
747 		goto out2;
748 
749 	/*
750 	 * Race condition! In the gap while table->tb6_lock was
751 	 * released, someone could have inserted this route.  Relookup.
752 	 */
753 	dst_release(&rt->dst);
754 	goto relookup;
755 
756 out:
757 	if (reachable) {
758 		reachable = 0;
759 		goto restart_2;
760 	}
761 	dst_hold(&rt->dst);
762 	read_unlock_bh(&table->tb6_lock);
763 out2:
764 	rt->dst.lastuse = jiffies;
765 	rt->dst.__use++;
766 
767 	return rt;
768 }
769 
770 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
771 					    struct flowi *fl, int flags)
772 {
773 	return ip6_pol_route(net, table, fl->iif, fl, flags);
774 }
775 
776 void ip6_route_input(struct sk_buff *skb)
777 {
778 	struct ipv6hdr *iph = ipv6_hdr(skb);
779 	struct net *net = dev_net(skb->dev);
780 	int flags = RT6_LOOKUP_F_HAS_SADDR;
781 	struct flowi fl = {
782 		.iif = skb->dev->ifindex,
783 		.nl_u = {
784 			.ip6_u = {
785 				.daddr = iph->daddr,
786 				.saddr = iph->saddr,
787 				.flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
788 			},
789 		},
790 		.mark = skb->mark,
791 		.proto = iph->nexthdr,
792 	};
793 
794 	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
795 		flags |= RT6_LOOKUP_F_IFACE;
796 
797 	skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
798 }
799 
800 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
801 					     struct flowi *fl, int flags)
802 {
803 	return ip6_pol_route(net, table, fl->oif, fl, flags);
804 }
805 
806 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
807 				    struct flowi *fl)
808 {
809 	int flags = 0;
810 
811 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
812 		flags |= RT6_LOOKUP_F_IFACE;
813 
814 	if (!ipv6_addr_any(&fl->fl6_src))
815 		flags |= RT6_LOOKUP_F_HAS_SADDR;
816 	else if (sk)
817 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
818 
819 	return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
820 }
821 
822 EXPORT_SYMBOL(ip6_route_output);
823 
824 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
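/*
 * ip6_dst_blackhole - replace *dstp with a standalone dst built on
 * ip6_dst_blackhole_ops.  The copy keeps the original route's device,
 * metrics and addresses but silently discards anything sent through
 * it, and it is never linked into the FIB.
 */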
825 {
826 	struct rt6_info *ort = (struct rt6_info *) *dstp;
827 	struct rt6_info *rt = (struct rt6_info *)
828 		dst_alloc(&ip6_dst_blackhole_ops);
829 	struct dst_entry *new = NULL;
830 
831 	if (rt) {
832 		new = &rt->dst;
833 
834 		atomic_set(&new->__refcnt, 1);
835 		new->__use = 1;
836 		new->input = dst_discard;
837 		new->output = dst_discard;
838 
839 		memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
840 		new->dev = ort->dst.dev;
841 		if (new->dev)
842 			dev_hold(new->dev);
843 		rt->rt6i_idev = ort->rt6i_idev;
844 		if (rt->rt6i_idev)
845 			in6_dev_hold(rt->rt6i_idev);
846 		rt->rt6i_expires = 0;
847 
848 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
849 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
850 		rt->rt6i_metric = 0;
851 
852 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
853 #ifdef CONFIG_IPV6_SUBTREES
854 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
855 #endif
856 
857 		dst_free(new);
858 	}
859 
860 	dst_release(*dstp);
861 	*dstp = new;
862 	return (new ? 0 : -ENOMEM);
863 }
864 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
865 
866 /*
867  *	Destination cache support functions
868  */
869 
870 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
871 {
872 	struct rt6_info *rt;
873 
874 	rt = (struct rt6_info *) dst;
875 
876 	if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
877 		return dst;
878 
879 	return NULL;
880 }
881 
882 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
883 {
884 	struct rt6_info *rt = (struct rt6_info *) dst;
885 
886 	if (rt) {
887 		if (rt->rt6i_flags & RTF_CACHE) {
888 			if (rt6_check_expired(rt)) {
889 				ip6_del_rt(rt);
890 				dst = NULL;
891 			}
892 		} else {
893 			dst_release(dst);
894 			dst = NULL;
895 		}
896 	}
897 	return dst;
898 }
899 
900 static void ip6_link_failure(struct sk_buff *skb)
901 {
902 	struct rt6_info *rt;
903 
904 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
905 
906 	rt = (struct rt6_info *) skb_dst(skb);
907 	if (rt) {
908 		if (rt->rt6i_flags&RTF_CACHE) {
909 			dst_set_expires(&rt->dst, 0);
910 			rt->rt6i_flags |= RTF_EXPIRES;
911 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
912 			rt->rt6i_node->fn_sernum = -1;
913 	}
914 }
915 
916 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
917 {
918 	struct rt6_info *rt6 = (struct rt6_info*)dst;
919 
920 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
921 		rt6->rt6i_flags |= RTF_MODIFIED;
922 		if (mtu < IPV6_MIN_MTU) {
923 			mtu = IPV6_MIN_MTU;
924 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
925 		}
926 		dst->metrics[RTAX_MTU-1] = mtu;
927 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
928 	}
929 }
930 
931 static int ipv6_get_mtu(struct net_device *dev);
932 
933 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
934 {
935 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
936 
937 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
938 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
939 
940 	/*
941 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
942 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
943 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
944 	 * rely only on pmtu discovery"
945 	 */
946 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
947 		mtu = IPV6_MAXPLEN;
948 	return mtu;
949 }
950 
951 static struct dst_entry *icmp6_dst_gc_list;
952 static DEFINE_SPINLOCK(icmp6_dst_lock);
953 
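/*
 * icmp6_dst_alloc - allocate a dst for ndisc packets.  These entries
 * are not inserted into the FIB; they are chained on icmp6_dst_gc_list
 * and reclaimed by icmp6_dst_gc() once their refcount drops to zero.
 */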
954 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
955 				  struct neighbour *neigh,
956 				  const struct in6_addr *addr)
957 {
958 	struct rt6_info *rt;
959 	struct inet6_dev *idev = in6_dev_get(dev);
960 	struct net *net = dev_net(dev);
961 
962 	if (unlikely(idev == NULL))
963 		return NULL;
964 
965 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
966 	if (unlikely(rt == NULL)) {
967 		in6_dev_put(idev);
968 		goto out;
969 	}
970 
971 	dev_hold(dev);
972 	if (neigh)
973 		neigh_hold(neigh);
974 	else {
975 		neigh = ndisc_get_neigh(dev, addr);
976 		if (IS_ERR(neigh))
977 			neigh = NULL;
978 	}
979 
980 	rt->rt6i_dev	  = dev;
981 	rt->rt6i_idev     = idev;
982 	rt->rt6i_nexthop  = neigh;
983 	atomic_set(&rt->dst.__refcnt, 1);
984 	rt->dst.metrics[RTAX_HOPLIMIT-1] = 255;
985 	rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
986 	rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
987 	rt->dst.output  = ip6_output;
988 
989 #if 0	/* there's no chance to use these for ndisc */
990 	rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
991 				? DST_HOST
992 				: 0;
993 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
994 	rt->rt6i_dst.plen = 128;
995 #endif
996 
997 	spin_lock_bh(&icmp6_dst_lock);
998 	rt->dst.next = icmp6_dst_gc_list;
999 	icmp6_dst_gc_list = &rt->dst;
1000 	spin_unlock_bh(&icmp6_dst_lock);
1001 
1002 	fib6_force_start_gc(net);
1003 
1004 out:
1005 	return &rt->dst;
1006 }
1007 
1008 int icmp6_dst_gc(void)
1009 {
1010 	struct dst_entry *dst, *next, **pprev;
1011 	int more = 0;
1012 
1013 	next = NULL;
1014 
1015 	spin_lock_bh(&icmp6_dst_lock);
1016 	pprev = &icmp6_dst_gc_list;
1017 
1018 	while ((dst = *pprev) != NULL) {
1019 		if (!atomic_read(&dst->__refcnt)) {
1020 			*pprev = dst->next;
1021 			dst_free(dst);
1022 		} else {
1023 			pprev = &dst->next;
1024 			++more;
1025 		}
1026 	}
1027 
1028 	spin_unlock_bh(&icmp6_dst_lock);
1029 
1030 	return more;
1031 }
1032 
1033 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1034 			    void *arg)
1035 {
1036 	struct dst_entry *dst, **pprev;
1037 
1038 	spin_lock_bh(&icmp6_dst_lock);
1039 	pprev = &icmp6_dst_gc_list;
1040 	while ((dst = *pprev) != NULL) {
1041 		struct rt6_info *rt = (struct rt6_info *) dst;
1042 		if (func(rt, arg)) {
1043 			*pprev = dst->next;
1044 			dst_free(dst);
1045 		} else {
1046 			pprev = &dst->next;
1047 		}
1048 	}
1049 	spin_unlock_bh(&icmp6_dst_lock);
1050 }
1051 
1052 static int ip6_dst_gc(struct dst_ops *ops)
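/*
 * ip6_dst_gc - dst_ops garbage-collection hook.  A full run is skipped
 * if the previous one was less than ip6_rt_gc_min_interval ago and the
 * entry count is not above ip6_rt_max_size; otherwise fib6_run_gc() is
 * invoked with an expiry value that grows under sustained pressure.
 * The expiry is decayed by ip6_rt_gc_elasticity on every call.
 */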
1053 {
1054 	unsigned long now = jiffies;
1055 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1056 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1057 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1058 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1059 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1060 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1061 
1062 	if (time_after(rt_last_gc + rt_min_interval, now) &&
1063 	    atomic_read(&ops->entries) <= rt_max_size)
1064 		goto out;
1065 
1066 	net->ipv6.ip6_rt_gc_expire++;
1067 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1068 	net->ipv6.ip6_rt_last_gc = now;
1069 	if (atomic_read(&ops->entries) < ops->gc_thresh)
1070 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1071 out:
1072 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1073 	return (atomic_read(&ops->entries) > rt_max_size);
1074 }
1075 
1076 /* Clean host part of a prefix. Not necessary in radix tree,
1077    but results in cleaner routing tables.
1078 
1079    Remove it only once everything is known to work!
1080  */
1081 
1082 static int ipv6_get_mtu(struct net_device *dev)
1083 {
1084 	int mtu = IPV6_MIN_MTU;
1085 	struct inet6_dev *idev;
1086 
1087 	rcu_read_lock();
1088 	idev = __in6_dev_get(dev);
1089 	if (idev)
1090 		mtu = idev->cnf.mtu6;
1091 	rcu_read_unlock();
1092 	return mtu;
1093 }
1094 
1095 int ip6_dst_hoplimit(struct dst_entry *dst)
1096 {
1097 	int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1098 	if (hoplimit < 0) {
1099 		struct net_device *dev = dst->dev;
1100 		struct inet6_dev *idev;
1101 
1102 		rcu_read_lock();
1103 		idev = __in6_dev_get(dev);
1104 		if (idev)
1105 			hoplimit = idev->cnf.hop_limit;
1106 		else
1107 			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1108 		rcu_read_unlock();
1109 	}
1110 	return hoplimit;
1111 }
1112 
1113 /*
1114  *	Route addition and deletion.
1115  */
1116 
1117 int ip6_route_add(struct fib6_config *cfg)
1118 {
1119 	int err;
1120 	struct net *net = cfg->fc_nlinfo.nl_net;
1121 	struct rt6_info *rt = NULL;
1122 	struct net_device *dev = NULL;
1123 	struct inet6_dev *idev = NULL;
1124 	struct fib6_table *table;
1125 	int addr_type;
1126 
1127 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1128 		return -EINVAL;
1129 #ifndef CONFIG_IPV6_SUBTREES
1130 	if (cfg->fc_src_len)
1131 		return -EINVAL;
1132 #endif
1133 	if (cfg->fc_ifindex) {
1134 		err = -ENODEV;
1135 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1136 		if (!dev)
1137 			goto out;
1138 		idev = in6_dev_get(dev);
1139 		if (!idev)
1140 			goto out;
1141 	}
1142 
1143 	if (cfg->fc_metric == 0)
1144 		cfg->fc_metric = IP6_RT_PRIO_USER;
1145 
1146 	table = fib6_new_table(net, cfg->fc_table);
1147 	if (table == NULL) {
1148 		err = -ENOBUFS;
1149 		goto out;
1150 	}
1151 
1152 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1153 
1154 	if (rt == NULL) {
1155 		err = -ENOMEM;
1156 		goto out;
1157 	}
1158 
1159 	rt->dst.obsolete = -1;
1160 	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1161 				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1162 				0;
1163 
1164 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1165 		cfg->fc_protocol = RTPROT_BOOT;
1166 	rt->rt6i_protocol = cfg->fc_protocol;
1167 
1168 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1169 
1170 	if (addr_type & IPV6_ADDR_MULTICAST)
1171 		rt->dst.input = ip6_mc_input;
1172 	else
1173 		rt->dst.input = ip6_forward;
1174 
1175 	rt->dst.output = ip6_output;
1176 
1177 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1178 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1179 	if (rt->rt6i_dst.plen == 128)
1180 	       rt->dst.flags = DST_HOST;
1181 
1182 #ifdef CONFIG_IPV6_SUBTREES
1183 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1184 	rt->rt6i_src.plen = cfg->fc_src_len;
1185 #endif
1186 
1187 	rt->rt6i_metric = cfg->fc_metric;
1188 
1189 	/* We cannot add true routes via loopback here;
1190 	   they would result in kernel looping, so promote them to reject routes
1191 	 */
1192 	if ((cfg->fc_flags & RTF_REJECT) ||
1193 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1194 		/* hold loopback dev/idev if we haven't done so. */
1195 		if (dev != net->loopback_dev) {
1196 			if (dev) {
1197 				dev_put(dev);
1198 				in6_dev_put(idev);
1199 			}
1200 			dev = net->loopback_dev;
1201 			dev_hold(dev);
1202 			idev = in6_dev_get(dev);
1203 			if (!idev) {
1204 				err = -ENODEV;
1205 				goto out;
1206 			}
1207 		}
1208 		rt->dst.output = ip6_pkt_discard_out;
1209 		rt->dst.input = ip6_pkt_discard;
1210 		rt->dst.error = -ENETUNREACH;
1211 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1212 		goto install_route;
1213 	}
1214 
1215 	if (cfg->fc_flags & RTF_GATEWAY) {
1216 		struct in6_addr *gw_addr;
1217 		int gwa_type;
1218 
1219 		gw_addr = &cfg->fc_gateway;
1220 		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1221 		gwa_type = ipv6_addr_type(gw_addr);
1222 
1223 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1224 			struct rt6_info *grt;
1225 
1226 			/* IPv6 strictly inhibits using non-link-local
1227 			   addresses as nexthop addresses.
1228 			   Otherwise, a router will not be able to send redirects.
1229 			   That is very good, but in some (rare!) circumstances
1230 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1231 			   some exceptions. --ANK
1232 			 */
1233 			err = -EINVAL;
1234 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1235 				goto out;
1236 
1237 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1238 
1239 			err = -EHOSTUNREACH;
1240 			if (grt == NULL)
1241 				goto out;
1242 			if (dev) {
1243 				if (dev != grt->rt6i_dev) {
1244 					dst_release(&grt->dst);
1245 					goto out;
1246 				}
1247 			} else {
1248 				dev = grt->rt6i_dev;
1249 				idev = grt->rt6i_idev;
1250 				dev_hold(dev);
1251 				in6_dev_hold(grt->rt6i_idev);
1252 			}
1253 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1254 				err = 0;
1255 			dst_release(&grt->dst);
1256 
1257 			if (err)
1258 				goto out;
1259 		}
1260 		err = -EINVAL;
1261 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1262 			goto out;
1263 	}
1264 
1265 	err = -ENODEV;
1266 	if (dev == NULL)
1267 		goto out;
1268 
1269 	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1270 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1271 		if (IS_ERR(rt->rt6i_nexthop)) {
1272 			err = PTR_ERR(rt->rt6i_nexthop);
1273 			rt->rt6i_nexthop = NULL;
1274 			goto out;
1275 		}
1276 	}
1277 
1278 	rt->rt6i_flags = cfg->fc_flags;
1279 
1280 install_route:
1281 	if (cfg->fc_mx) {
1282 		struct nlattr *nla;
1283 		int remaining;
1284 
1285 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1286 			int type = nla_type(nla);
1287 
1288 			if (type) {
1289 				if (type > RTAX_MAX) {
1290 					err = -EINVAL;
1291 					goto out;
1292 				}
1293 
1294 				rt->dst.metrics[type - 1] = nla_get_u32(nla);
1295 			}
1296 		}
1297 	}
1298 
1299 	if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
1300 		rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
1301 	if (!dst_mtu(&rt->dst))
1302 		rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1303 	if (!dst_metric(&rt->dst, RTAX_ADVMSS))
1304 		rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
1305 	rt->dst.dev = dev;
1306 	rt->rt6i_idev = idev;
1307 	rt->rt6i_table = table;
1308 
1309 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1310 
1311 	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1312 
1313 out:
1314 	if (dev)
1315 		dev_put(dev);
1316 	if (idev)
1317 		in6_dev_put(idev);
1318 	if (rt)
1319 		dst_free(&rt->dst);
1320 	return err;
1321 }
1322 
1323 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1324 {
1325 	int err;
1326 	struct fib6_table *table;
1327 	struct net *net = dev_net(rt->rt6i_dev);
1328 
1329 	if (rt == net->ipv6.ip6_null_entry)
1330 		return -ENOENT;
1331 
1332 	table = rt->rt6i_table;
1333 	write_lock_bh(&table->tb6_lock);
1334 
1335 	err = fib6_del(rt, info);
1336 	dst_release(&rt->dst);
1337 
1338 	write_unlock_bh(&table->tb6_lock);
1339 
1340 	return err;
1341 }
1342 
1343 int ip6_del_rt(struct rt6_info *rt)
1344 {
1345 	struct nl_info info = {
1346 		.nl_net = dev_net(rt->rt6i_dev),
1347 	};
1348 	return __ip6_del_rt(rt, &info);
1349 }
1350 
1351 static int ip6_route_del(struct fib6_config *cfg)
1352 {
1353 	struct fib6_table *table;
1354 	struct fib6_node *fn;
1355 	struct rt6_info *rt;
1356 	int err = -ESRCH;
1357 
1358 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1359 	if (table == NULL)
1360 		return err;
1361 
1362 	read_lock_bh(&table->tb6_lock);
1363 
1364 	fn = fib6_locate(&table->tb6_root,
1365 			 &cfg->fc_dst, cfg->fc_dst_len,
1366 			 &cfg->fc_src, cfg->fc_src_len);
1367 
1368 	if (fn) {
1369 		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1370 			if (cfg->fc_ifindex &&
1371 			    (rt->rt6i_dev == NULL ||
1372 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1373 				continue;
1374 			if (cfg->fc_flags & RTF_GATEWAY &&
1375 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1376 				continue;
1377 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1378 				continue;
1379 			dst_hold(&rt->dst);
1380 			read_unlock_bh(&table->tb6_lock);
1381 
1382 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1383 		}
1384 	}
1385 	read_unlock_bh(&table->tb6_lock);
1386 
1387 	return err;
1388 }
1389 
1390 /*
1391  *	Handle redirects
1392  */
1393 struct ip6rd_flowi {
1394 	struct flowi fl;
1395 	struct in6_addr gateway;
1396 };
1397 
1398 static struct rt6_info *__ip6_route_redirect(struct net *net,
1399 					     struct fib6_table *table,
1400 					     struct flowi *fl,
1401 					     int flags)
1402 {
1403 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1404 	struct rt6_info *rt;
1405 	struct fib6_node *fn;
1406 
1407 	/*
1408 	 * Get the "current" route for this destination and
1409 	 * check if the redirect has come from the appropriate router.
1410 	 *
1411 	 * RFC 2461 specifies that redirects should only be
1412 	 * accepted if they come from the nexthop to the target.
1413 	 * Due to the way the routes are chosen, this notion
1414 	 * is a bit fuzzy and one might need to check all possible
1415 	 * routes.
1416 	 */
1417 
1418 	read_lock_bh(&table->tb6_lock);
1419 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1420 restart:
1421 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1422 		/*
1423 		 * Current route is on-link; redirect is always invalid.
1424 		 *
1425 		 * Seems the previous statement is not true.  It could
1426 		 * be a node which regards us as on-link (e.g. proxy ndisc).
1427 		 * But then the router serving it might decide that we should
1428 		 * know the truth 8)8) --ANK (980726).
1429 		 */
1430 		if (rt6_check_expired(rt))
1431 			continue;
1432 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1433 			continue;
1434 		if (fl->oif != rt->rt6i_dev->ifindex)
1435 			continue;
1436 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1437 			continue;
1438 		break;
1439 	}
1440 
1441 	if (!rt)
1442 		rt = net->ipv6.ip6_null_entry;
1443 	BACKTRACK(net, &fl->fl6_src);
1444 out:
1445 	dst_hold(&rt->dst);
1446 
1447 	read_unlock_bh(&table->tb6_lock);
1448 
1449 	return rt;
1450 };
1451 
1452 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1453 					   struct in6_addr *src,
1454 					   struct in6_addr *gateway,
1455 					   struct net_device *dev)
1456 {
1457 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1458 	struct net *net = dev_net(dev);
1459 	struct ip6rd_flowi rdfl = {
1460 		.fl = {
1461 			.oif = dev->ifindex,
1462 			.nl_u = {
1463 				.ip6_u = {
1464 					.daddr = *dest,
1465 					.saddr = *src,
1466 				},
1467 			},
1468 		},
1469 	};
1470 
1471 	ipv6_addr_copy(&rdfl.gateway, gateway);
1472 
1473 	if (rt6_need_strict(dest))
1474 		flags |= RT6_LOOKUP_F_IFACE;
1475 
1476 	return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1477 						   flags, __ip6_route_redirect);
1478 }
1479 
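/*
 * rt6_redirect - handle a validated ndisc Redirect: confirm the
 * existing dst, update the neighbour cache entry for the new first
 * hop, then install a host RTF_CACHE|RTF_DYNAMIC route through it and
 * delete the previously cached route, if any.
 */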
1480 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1481 		  struct in6_addr *saddr,
1482 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1483 {
1484 	struct rt6_info *rt, *nrt = NULL;
1485 	struct netevent_redirect netevent;
1486 	struct net *net = dev_net(neigh->dev);
1487 
1488 	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1489 
1490 	if (rt == net->ipv6.ip6_null_entry) {
1491 		if (net_ratelimit())
1492 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1493 			       "for redirect target\n");
1494 		goto out;
1495 	}
1496 
1497 	/*
1498 	 *	We have finally decided to accept it.
1499 	 */
1500 
1501 	neigh_update(neigh, lladdr, NUD_STALE,
1502 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1503 		     NEIGH_UPDATE_F_OVERRIDE|
1504 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1505 				     NEIGH_UPDATE_F_ISROUTER))
1506 		     );
1507 
1508 	/*
1509 	 * Redirect received -> path was valid.
1510 	 * Look, redirects are sent only in response to data packets,
1511 	 * so this nexthop apparently is reachable. --ANK
1512 	 */
1513 	dst_confirm(&rt->dst);
1514 
1515 	/* Duplicate redirect: silently ignore. */
1516 	if (neigh == rt->dst.neighbour)
1517 		goto out;
1518 
1519 	nrt = ip6_rt_copy(rt);
1520 	if (nrt == NULL)
1521 		goto out;
1522 
1523 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1524 	if (on_link)
1525 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1526 
1527 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1528 	nrt->rt6i_dst.plen = 128;
1529 	nrt->dst.flags |= DST_HOST;
1530 
1531 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1532 	nrt->rt6i_nexthop = neigh_clone(neigh);
1533 	/* Reset pmtu, it may be better */
1534 	nrt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1535 	nrt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1536 							dst_mtu(&nrt->dst));
1537 
1538 	if (ip6_ins_rt(nrt))
1539 		goto out;
1540 
1541 	netevent.old = &rt->dst;
1542 	netevent.new = &nrt->dst;
1543 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1544 
1545 	if (rt->rt6i_flags&RTF_CACHE) {
1546 		ip6_del_rt(rt);
1547 		return;
1548 	}
1549 
1550 out:
1551 	dst_release(&rt->dst);
1552 }
1553 
1554 /*
1555  *	Handle ICMP "packet too big" messages
1556  *	i.e. Path MTU discovery
1557  */
1558 
1559 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1560 			     struct net *net, u32 pmtu, int ifindex)
1561 {
1562 	struct rt6_info *rt, *nrt;
1563 	int allfrag = 0;
1564 
1565 	rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1566 	if (rt == NULL)
1567 		return;
1568 
1569 	if (pmtu >= dst_mtu(&rt->dst))
1570 		goto out;
1571 
1572 	if (pmtu < IPV6_MIN_MTU) {
1573 		/*
1574 		 * According to RFC 2460, when a node receives a Packet Too Big
1575 		 * message reporting a PMTU below the IPv6 Minimum Link MTU
1576 		 * (1280), it sets the PMTU to that minimum and must include a
1577 		 * Fragment header in every subsequent packet on that path.
1578 		 */
1579 		pmtu = IPV6_MIN_MTU;
1580 		allfrag = 1;
1581 	}
1582 
1583 	/* New MTU received -> path was valid.
1584 	   Too Big messages are sent only in response to data packets,
1585 	   so this nexthop apparently is reachable. --ANK
1586 	 */
1587 	dst_confirm(&rt->dst);
1588 
1589 	/* Host route. If it is static, it would be better
1590 	   not to override it but to add a new one, so that
1591 	   when the cache entry expires the old PMTU
1592 	   is restored automatically.
1593 	 */
1594 	if (rt->rt6i_flags & RTF_CACHE) {
1595 		rt->dst.metrics[RTAX_MTU-1] = pmtu;
1596 		if (allfrag)
1597 			rt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1598 		dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1599 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1600 		goto out;
1601 	}
1602 
1603 	/* Network route.
1604 	   Two cases are possible:
1605 	   1. It is a connected route. Action: COW.
1606 	   2. It is a gatewayed or NONEXTHOP route. Action: clone it.
1607 	 */
1608 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1609 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1610 	else
1611 		nrt = rt6_alloc_clone(rt, daddr);
1612 
1613 	if (nrt) {
1614 		nrt->dst.metrics[RTAX_MTU-1] = pmtu;
1615 		if (allfrag)
1616 			nrt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1617 
1618 		/* According to RFC 1981, a PMTU increase shouldn't be probed for
1619 		 * within 5 minutes; the recommended timer is 10 minutes.  Here
1620 		 * the route expiration time is set to ip6_rt_mtu_expires, which
1621 		 * defaults to 10 minutes.  After that the decreased PMTU expires
1622 		 * and PMTU increase detection happens automatically.
1623 		 */
1624 		dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1625 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1626 
1627 		ip6_ins_rt(nrt);
1628 	}
1629 out:
1630 	dst_release(&rt->dst);
1631 }
1632 
1633 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1634 			struct net_device *dev, u32 pmtu)
1635 {
1636 	struct net *net = dev_net(dev);
1637 
1638 	/*
1639 	 * RFC 1981 states that a node "MUST reduce the size of the packets it
1640 	 * is sending along the path" that caused the Packet Too Big message.
1641 	 * Since it's not possible in the general case to determine which
1642 	 * interface was used to send the original packet, we update the MTU
1643 	 * on the interface that will be used to send future packets. We also
1644 	 * update the MTU on the interface that received the Packet Too Big in
1645 	 * case the original packet was forced out that interface with
1646 	 * SO_BINDTODEVICE or similar. This is the next best thing to the
1647 	 * correct behaviour, which would be to update the MTU on all
1648 	 * interfaces.
1649 	 */
1650 	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1651 	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1652 }
1653 
1654 /*
1655  *	Misc support functions
1656  */
1657 
1658 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1659 {
1660 	struct net *net = dev_net(ort->rt6i_dev);
1661 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1662 
1663 	if (rt) {
1664 		rt->dst.input = ort->dst.input;
1665 		rt->dst.output = ort->dst.output;
1666 
1667 		memcpy(rt->dst.metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
1668 		rt->dst.error = ort->dst.error;
1669 		rt->dst.dev = ort->dst.dev;
1670 		if (rt->dst.dev)
1671 			dev_hold(rt->dst.dev);
1672 		rt->rt6i_idev = ort->rt6i_idev;
1673 		if (rt->rt6i_idev)
1674 			in6_dev_hold(rt->rt6i_idev);
1675 		rt->dst.lastuse = jiffies;
1676 		rt->rt6i_expires = 0;
1677 
1678 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1679 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1680 		rt->rt6i_metric = 0;
1681 
1682 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1683 #ifdef CONFIG_IPV6_SUBTREES
1684 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1685 #endif
1686 		rt->rt6i_table = ort->rt6i_table;
1687 	}
1688 	return rt;
1689 }
1690 
1691 #ifdef CONFIG_IPV6_ROUTE_INFO
1692 static struct rt6_info *rt6_get_route_info(struct net *net,
1693 					   struct in6_addr *prefix, int prefixlen,
1694 					   struct in6_addr *gwaddr, int ifindex)
1695 {
1696 	struct fib6_node *fn;
1697 	struct rt6_info *rt = NULL;
1698 	struct fib6_table *table;
1699 
1700 	table = fib6_get_table(net, RT6_TABLE_INFO);
1701 	if (table == NULL)
1702 		return NULL;
1703 
1704 	write_lock_bh(&table->tb6_lock);
1705 	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1706 	if (!fn)
1707 		goto out;
1708 
1709 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1710 		if (rt->rt6i_dev->ifindex != ifindex)
1711 			continue;
1712 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1713 			continue;
1714 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1715 			continue;
1716 		dst_hold(&rt->dst);
1717 		break;
1718 	}
1719 out:
1720 	write_unlock_bh(&table->tb6_lock);
1721 	return rt;
1722 }
1723 
1724 static struct rt6_info *rt6_add_route_info(struct net *net,
1725 					   struct in6_addr *prefix, int prefixlen,
1726 					   struct in6_addr *gwaddr, int ifindex,
1727 					   unsigned pref)
1728 {
1729 	struct fib6_config cfg = {
1730 		.fc_table	= RT6_TABLE_INFO,
1731 		.fc_metric	= IP6_RT_PRIO_USER,
1732 		.fc_ifindex	= ifindex,
1733 		.fc_dst_len	= prefixlen,
1734 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1735 				  RTF_UP | RTF_PREF(pref),
1736 		.fc_nlinfo.pid = 0,
1737 		.fc_nlinfo.nlh = NULL,
1738 		.fc_nlinfo.nl_net = net,
1739 	};
1740 
1741 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1742 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1743 
1744 	/* We should treat it as a default route if prefix length is 0. */
1745 	if (!prefixlen)
1746 		cfg.fc_flags |= RTF_DEFAULT;
1747 
1748 	ip6_route_add(&cfg);
1749 
1750 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1751 }
1752 #endif
1753 
1754 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1755 {
1756 	struct rt6_info *rt;
1757 	struct fib6_table *table;
1758 
1759 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1760 	if (table == NULL)
1761 		return NULL;
1762 
1763 	write_lock_bh(&table->tb6_lock);
1764 	for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1765 		if (dev == rt->rt6i_dev &&
1766 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1767 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1768 			break;
1769 	}
1770 	if (rt)
1771 		dst_hold(&rt->dst);
1772 	write_unlock_bh(&table->tb6_lock);
1773 	return rt;
1774 }
1775 
1776 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1777 				     struct net_device *dev,
1778 				     unsigned int pref)
1779 {
1780 	struct fib6_config cfg = {
1781 		.fc_table	= RT6_TABLE_DFLT,
1782 		.fc_metric	= IP6_RT_PRIO_USER,
1783 		.fc_ifindex	= dev->ifindex,
1784 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1785 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1786 		.fc_nlinfo.pid = 0,
1787 		.fc_nlinfo.nlh = NULL,
1788 		.fc_nlinfo.nl_net = dev_net(dev),
1789 	};
1790 
1791 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1792 
1793 	ip6_route_add(&cfg);
1794 
1795 	return rt6_get_dflt_router(gwaddr, dev);
1796 }
1797 
1798 void rt6_purge_dflt_routers(struct net *net)
1799 {
1800 	struct rt6_info *rt;
1801 	struct fib6_table *table;
1802 
1803 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1804 	table = fib6_get_table(net, RT6_TABLE_DFLT);
1805 	if (table == NULL)
1806 		return;
1807 
1808 restart:
1809 	read_lock_bh(&table->tb6_lock);
1810 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1811 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1812 			dst_hold(&rt->dst);
1813 			read_unlock_bh(&table->tb6_lock);
1814 			ip6_del_rt(rt);
1815 			goto restart;
1816 		}
1817 	}
1818 	read_unlock_bh(&table->tb6_lock);
1819 }
1820 
1821 static void rtmsg_to_fib6_config(struct net *net,
1822 				 struct in6_rtmsg *rtmsg,
1823 				 struct fib6_config *cfg)
1824 {
1825 	memset(cfg, 0, sizeof(*cfg));
1826 
1827 	cfg->fc_table = RT6_TABLE_MAIN;
1828 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1829 	cfg->fc_metric = rtmsg->rtmsg_metric;
1830 	cfg->fc_expires = rtmsg->rtmsg_info;
1831 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1832 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1833 	cfg->fc_flags = rtmsg->rtmsg_flags;
1834 
1835 	cfg->fc_nlinfo.nl_net = net;
1836 
1837 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1838 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1839 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1840 }
1841 
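/*
 * ipv6_route_ioctl - legacy SIOCADDRT/SIOCDELRT entry point: copy the
 * user's in6_rtmsg, convert it to a fib6_config and add or delete the
 * route under the RTNL.  Requires CAP_NET_ADMIN.
 */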
1842 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1843 {
1844 	struct fib6_config cfg;
1845 	struct in6_rtmsg rtmsg;
1846 	int err;
1847 
1848 	switch(cmd) {
1849 	case SIOCADDRT:		/* Add a route */
1850 	case SIOCDELRT:		/* Delete a route */
1851 		if (!capable(CAP_NET_ADMIN))
1852 			return -EPERM;
1853 		err = copy_from_user(&rtmsg, arg,
1854 				     sizeof(struct in6_rtmsg));
1855 		if (err)
1856 			return -EFAULT;
1857 
1858 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1859 
1860 		rtnl_lock();
1861 		switch (cmd) {
1862 		case SIOCADDRT:
1863 			err = ip6_route_add(&cfg);
1864 			break;
1865 		case SIOCDELRT:
1866 			err = ip6_route_del(&cfg);
1867 			break;
1868 		default:
1869 			err = -EINVAL;
1870 		}
1871 		rtnl_unlock();
1872 
1873 		return err;
1874 	}
1875 
1876 	return -EINVAL;
1877 }
1878 
1879 /*
1880  *	Drop the packet on the floor
1881  */
1882 
1883 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1884 {
1885 	int type;
1886 	struct dst_entry *dst = skb_dst(skb);
1887 	switch (ipstats_mib_noroutes) {
1888 	case IPSTATS_MIB_INNOROUTES:
1889 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1890 		if (type == IPV6_ADDR_ANY) {
1891 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1892 				      IPSTATS_MIB_INADDRERRORS);
1893 			break;
1894 		}
1895 		/* FALLTHROUGH */
1896 	case IPSTATS_MIB_OUTNOROUTES:
1897 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1898 			      ipstats_mib_noroutes);
1899 		break;
1900 	}
1901 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1902 	kfree_skb(skb);
1903 	return 0;
1904 }
1905 
1906 static int ip6_pkt_discard(struct sk_buff *skb)
1907 {
1908 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1909 }
1910 
1911 static int ip6_pkt_discard_out(struct sk_buff *skb)
1912 {
1913 	skb->dev = skb_dst(skb)->dev;
1914 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1915 }
1916 
1917 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1918 
1919 static int ip6_pkt_prohibit(struct sk_buff *skb)
1920 {
1921 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1922 }
1923 
1924 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1925 {
1926 	skb->dev = skb_dst(skb)->dev;
1927 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1928 }
1929 
1930 #endif
1931 
1932 /*
1933  *	Allocate a dst for local (unicast / anycast) address.
1934  */
1935 
1936 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1937 				    const struct in6_addr *addr,
1938 				    int anycast)
1939 {
1940 	struct net *net = dev_net(idev->dev);
1941 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1942 	struct neighbour *neigh;
1943 
1944 	if (rt == NULL)
1945 		return ERR_PTR(-ENOMEM);
1946 
1947 	dev_hold(net->loopback_dev);
1948 	in6_dev_hold(idev);
1949 
1950 	rt->dst.flags = DST_HOST;
1951 	rt->dst.input = ip6_input;
1952 	rt->dst.output = ip6_output;
1953 	rt->rt6i_dev = net->loopback_dev;
1954 	rt->rt6i_idev = idev;
1955 	rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1956 	rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
1957 	rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
1958 	rt->dst.obsolete = -1;
1959 
1960 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1961 	if (anycast)
1962 		rt->rt6i_flags |= RTF_ANYCAST;
1963 	else
1964 		rt->rt6i_flags |= RTF_LOCAL;
1965 	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1966 	if (IS_ERR(neigh)) {
1967 		dst_free(&rt->dst);
1968 
1969 		/* We are casting this because that is the return
1970 		 * value type.  But an errno encoded pointer is the
1971 		 * same regardless of the underlying pointer type,
1972 		 * and that's what we are returning.  So this is OK.
1973 		 */
1974 		return (struct rt6_info *) neigh;
1975 	}
1976 	rt->rt6i_nexthop = neigh;
1977 
1978 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1979 	rt->rt6i_dst.plen = 128;
1980 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1981 
1982 	atomic_set(&rt->dst.__refcnt, 1);
1983 
1984 	return rt;
1985 }
1986 
1987 struct arg_dev_net {
1988 	struct net_device *dev;
1989 	struct net *net;
1990 };
1991 
1992 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1993 {
1994 	struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1995 	struct net *net = ((struct arg_dev_net *)arg)->net;
1996 
1997 	if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1998 	    rt != net->ipv6.ip6_null_entry) {
1999 		RT6_TRACE("deleted by ifdown %p\n", rt);
2000 		return -1;
2001 	}
2002 	return 0;
2003 }
2004 
2005 void rt6_ifdown(struct net *net, struct net_device *dev)
2006 {
2007 	struct arg_dev_net adn = {
2008 		.dev = dev,
2009 		.net = net,
2010 	};
2011 
2012 	fib6_clean_all(net, fib6_ifdown, 0, &adn);
2013 	icmp6_clean_all(fib6_ifdown, &adn);
2014 }
2015 
2016 struct rt6_mtu_change_arg
2017 {
2018 	struct net_device *dev;
2019 	unsigned mtu;
2020 };
2021 
2022 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2023 {
2024 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2025 	struct inet6_dev *idev;
2026 	struct net *net = dev_net(arg->dev);
2027 
2028 	/* In IPv6, PMTU discovery is not optional, so the
2029 	   RTAX_MTU lock cannot disable it.
2030 	   We still use this lock to block changes
2031 	   caused by addrconf/ndisc.
2032 	*/
2033 
2034 	idev = __in6_dev_get(arg->dev);
2035 	if (idev == NULL)
2036 		return 0;
2037 
2038 	/* For an administrative MTU increase there is no way to discover
2039 	   an IPv6 PMTU increase, so the PMTU must be updated here.
2040 	   Since RFC 1981 doesn't cover administrative MTU increases
2041 	   (e.g. jumbo frames), updating the PMTU on increase is a MUST.
2042 	 */
2043 	/*
2044 	   If the new MTU is less than the route PMTU, the new MTU will be
2045 	   the lowest MTU in the path; update the route PMTU to reflect the
2046 	   decrease.  If the new MTU is greater than the route PMTU and the
2047 	   old MTU was the lowest MTU in the path, update the route PMTU to
2048 	   reflect the increase.  In that case, if another node's MTU is
2049 	   still the lowest in the path, its Packet Too Big message will
2050 	   trigger PMTU discovery again.
2051 	 */
2052 	if (rt->rt6i_dev == arg->dev &&
2053 	    !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2054 	    (dst_mtu(&rt->dst) >= arg->mtu ||
2055 	     (dst_mtu(&rt->dst) < arg->mtu &&
2056 	      dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2057 		rt->dst.metrics[RTAX_MTU-1] = arg->mtu;
2058 		rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
2059 	}
2060 	return 0;
2061 }
2062 
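/*
 *	Called when a device MTU changes (e.g. from addrconf); walks every
 *	FIB entry and lets rt6_mtu_change_route() adjust the cached metrics.
 */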
2063 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2064 {
2065 	struct rt6_mtu_change_arg arg = {
2066 		.dev = dev,
2067 		.mtu = mtu,
2068 	};
2069 
2070 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2071 }
2072 
2073 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2074 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2075 	[RTA_OIF]               = { .type = NLA_U32 },
2076 	[RTA_IIF]		= { .type = NLA_U32 },
2077 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2078 	[RTA_METRICS]           = { .type = NLA_NESTED },
2079 };
2080 
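/*
 *	Translate an RTM_NEWROUTE/RTM_DELROUTE request into a fib6_config.
 *	Attributes that are absent leave the corresponding fields zeroed by
 *	the memset() below.
 */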
2081 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2082 			      struct fib6_config *cfg)
2083 {
2084 	struct rtmsg *rtm;
2085 	struct nlattr *tb[RTA_MAX+1];
2086 	int err;
2087 
2088 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2089 	if (err < 0)
2090 		goto errout;
2091 
2092 	err = -EINVAL;
2093 	rtm = nlmsg_data(nlh);
2094 	memset(cfg, 0, sizeof(*cfg));
2095 
2096 	cfg->fc_table = rtm->rtm_table;
2097 	cfg->fc_dst_len = rtm->rtm_dst_len;
2098 	cfg->fc_src_len = rtm->rtm_src_len;
2099 	cfg->fc_flags = RTF_UP;
2100 	cfg->fc_protocol = rtm->rtm_protocol;
2101 
2102 	if (rtm->rtm_type == RTN_UNREACHABLE)
2103 		cfg->fc_flags |= RTF_REJECT;
2104 
2105 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2106 	cfg->fc_nlinfo.nlh = nlh;
2107 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2108 
2109 	if (tb[RTA_GATEWAY]) {
2110 		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2111 		cfg->fc_flags |= RTF_GATEWAY;
2112 	}
2113 
2114 	if (tb[RTA_DST]) {
2115 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2116 
2117 		if (nla_len(tb[RTA_DST]) < plen)
2118 			goto errout;
2119 
2120 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2121 	}
2122 
2123 	if (tb[RTA_SRC]) {
2124 		int plen = (rtm->rtm_src_len + 7) >> 3;
2125 
2126 		if (nla_len(tb[RTA_SRC]) < plen)
2127 			goto errout;
2128 
2129 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2130 	}
2131 
2132 	if (tb[RTA_OIF])
2133 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2134 
2135 	if (tb[RTA_PRIORITY])
2136 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2137 
2138 	if (tb[RTA_METRICS]) {
2139 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2140 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2141 	}
2142 
2143 	if (tb[RTA_TABLE])
2144 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2145 
2146 	err = 0;
2147 errout:
2148 	return err;
2149 }
2150 
2151 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
2152 {
2153 	struct fib6_config cfg;
2154 	int err;
2155 
2156 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2157 	if (err < 0)
2158 		return err;
2159 
2160 	return ip6_route_del(&cfg);
2161 }
2162 
2163 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
2164 {
2165 	struct fib6_config cfg;
2166 	int err;
2167 
2168 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2169 	if (err < 0)
2170 		return err;
2171 
2172 	return ip6_route_add(&cfg);
2173 }
2174 
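/*
 *	Worst-case netlink message size for a single route; used to size the
 *	skb allocated in inet6_rt_notify().
 */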
2175 static inline size_t rt6_nlmsg_size(void)
2176 {
2177 	return NLMSG_ALIGN(sizeof(struct rtmsg))
2178 	       + nla_total_size(16) /* RTA_SRC */
2179 	       + nla_total_size(16) /* RTA_DST */
2180 	       + nla_total_size(16) /* RTA_GATEWAY */
2181 	       + nla_total_size(16) /* RTA_PREFSRC */
2182 	       + nla_total_size(4) /* RTA_TABLE */
2183 	       + nla_total_size(4) /* RTA_IIF */
2184 	       + nla_total_size(4) /* RTA_OIF */
2185 	       + nla_total_size(4) /* RTA_PRIORITY */
2186 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2187 	       + nla_total_size(sizeof(struct rta_cacheinfo));
2188 }
2189 
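/*
 *	Fill one route netlink message of the given type for @rt.  Returns
 *	the nlmsg_end() length on success, 1 if the route was skipped because
 *	the caller asked for prefix routes only, or -EMSGSIZE if the skb is
 *	out of room.
 */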
2190 static int rt6_fill_node(struct net *net,
2191 			 struct sk_buff *skb, struct rt6_info *rt,
2192 			 struct in6_addr *dst, struct in6_addr *src,
2193 			 int iif, int type, u32 pid, u32 seq,
2194 			 int prefix, int nowait, unsigned int flags)
2195 {
2196 	struct rtmsg *rtm;
2197 	struct nlmsghdr *nlh;
2198 	long expires;
2199 	u32 table;
2200 
2201 	if (prefix) {	/* user wants prefix routes only */
2202 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2203 			/* success since this is not a prefix route */
2204 			return 1;
2205 		}
2206 	}
2207 
2208 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2209 	if (nlh == NULL)
2210 		return -EMSGSIZE;
2211 
2212 	rtm = nlmsg_data(nlh);
2213 	rtm->rtm_family = AF_INET6;
2214 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2215 	rtm->rtm_src_len = rt->rt6i_src.plen;
2216 	rtm->rtm_tos = 0;
2217 	if (rt->rt6i_table)
2218 		table = rt->rt6i_table->tb6_id;
2219 	else
2220 		table = RT6_TABLE_UNSPEC;
2221 	rtm->rtm_table = table;
2222 	NLA_PUT_U32(skb, RTA_TABLE, table);
2223 	if (rt->rt6i_flags&RTF_REJECT)
2224 		rtm->rtm_type = RTN_UNREACHABLE;
2225 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2226 		rtm->rtm_type = RTN_LOCAL;
2227 	else
2228 		rtm->rtm_type = RTN_UNICAST;
2229 	rtm->rtm_flags = 0;
2230 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2231 	rtm->rtm_protocol = rt->rt6i_protocol;
2232 	if (rt->rt6i_flags&RTF_DYNAMIC)
2233 		rtm->rtm_protocol = RTPROT_REDIRECT;
2234 	else if (rt->rt6i_flags & RTF_ADDRCONF)
2235 		rtm->rtm_protocol = RTPROT_KERNEL;
2236 	else if (rt->rt6i_flags&RTF_DEFAULT)
2237 		rtm->rtm_protocol = RTPROT_RA;
2238 
2239 	if (rt->rt6i_flags&RTF_CACHE)
2240 		rtm->rtm_flags |= RTM_F_CLONED;
2241 
2242 	if (dst) {
2243 		NLA_PUT(skb, RTA_DST, 16, dst);
2244 		rtm->rtm_dst_len = 128;
2245 	} else if (rtm->rtm_dst_len)
2246 		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2247 #ifdef CONFIG_IPV6_SUBTREES
2248 	if (src) {
2249 		NLA_PUT(skb, RTA_SRC, 16, src);
2250 		rtm->rtm_src_len = 128;
2251 	} else if (rtm->rtm_src_len)
2252 		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2253 #endif
2254 	if (iif) {
2255 #ifdef CONFIG_IPV6_MROUTE
2256 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2257 			int err = ip6mr_get_route(net, skb, rtm, nowait);
2258 			if (err <= 0) {
2259 				if (!nowait) {
2260 					if (err == 0)
2261 						return 0;
2262 					goto nla_put_failure;
2263 				} else {
2264 					if (err == -EMSGSIZE)
2265 						goto nla_put_failure;
2266 				}
2267 			}
2268 		} else
2269 #endif
2270 			NLA_PUT_U32(skb, RTA_IIF, iif);
2271 	} else if (dst) {
2272 		struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2273 		struct in6_addr saddr_buf;
2274 		if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2275 				       dst, 0, &saddr_buf) == 0)
2276 			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2277 	}
2278 
2279 	if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
2280 		goto nla_put_failure;
2281 
2282 	if (rt->dst.neighbour)
2283 		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2284 
2285 	if (rt->dst.dev)
2286 		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2287 
2288 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2289 
2290 	if (!(rt->rt6i_flags & RTF_EXPIRES))
2291 		expires = 0;
2292 	else if (rt->rt6i_expires - jiffies < INT_MAX)
2293 		expires = rt->rt6i_expires - jiffies;
2294 	else
2295 		expires = INT_MAX;
2296 
2297 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2298 			       expires, rt->dst.error) < 0)
2299 		goto nla_put_failure;
2300 
2301 	return nlmsg_end(skb, nlh);
2302 
2303 nla_put_failure:
2304 	nlmsg_cancel(skb, nlh);
2305 	return -EMSGSIZE;
2306 }
2307 
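/*
 *	Per-route callback for netlink route dumps.  RTM_F_PREFIX in the
 *	request restricts the dump to prefix routes (RTF_PREFIX_RT).
 */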
2308 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2309 {
2310 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2311 	int prefix;
2312 
2313 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2314 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2315 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2316 	} else
2317 		prefix = 0;
2318 
2319 	return rt6_fill_node(arg->net,
2320 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2321 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2322 		     prefix, 0, NLM_F_MULTI);
2323 }
2324 
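/*
 *	RTM_GETROUTE handler: build a flow from the request attributes,
 *	resolve it with ip6_route_output() and unicast the result back to
 *	the requester.
 */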
2325 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2326 {
2327 	struct net *net = sock_net(in_skb->sk);
2328 	struct nlattr *tb[RTA_MAX+1];
2329 	struct rt6_info *rt;
2330 	struct sk_buff *skb;
2331 	struct rtmsg *rtm;
2332 	struct flowi fl;
2333 	int err, iif = 0;
2334 
2335 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2336 	if (err < 0)
2337 		goto errout;
2338 
2339 	err = -EINVAL;
2340 	memset(&fl, 0, sizeof(fl));
2341 
2342 	if (tb[RTA_SRC]) {
2343 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2344 			goto errout;
2345 
2346 		ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2347 	}
2348 
2349 	if (tb[RTA_DST]) {
2350 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2351 			goto errout;
2352 
2353 		ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2354 	}
2355 
2356 	if (tb[RTA_IIF])
2357 		iif = nla_get_u32(tb[RTA_IIF]);
2358 
2359 	if (tb[RTA_OIF])
2360 		fl.oif = nla_get_u32(tb[RTA_OIF]);
2361 
2362 	if (iif) {
2363 		struct net_device *dev;
2364 		dev = __dev_get_by_index(net, iif);
2365 		if (!dev) {
2366 			err = -ENODEV;
2367 			goto errout;
2368 		}
2369 	}
2370 
2371 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2372 	if (skb == NULL) {
2373 		err = -ENOBUFS;
2374 		goto errout;
2375 	}
2376 
2377 	/* Reserve room for dummy headers; this skb can pass
2378 	   through a good chunk of the routing engine.
2379 	 */
2380 	skb_reset_mac_header(skb);
2381 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2382 
2383 	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2384 	skb_dst_set(skb, &rt->dst);
2385 
2386 	err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2387 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2388 			    nlh->nlmsg_seq, 0, 0, 0);
2389 	if (err < 0) {
2390 		kfree_skb(skb);
2391 		goto errout;
2392 	}
2393 
2394 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2395 errout:
2396 	return err;
2397 }
2398 
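/*
 *	Notify RTNLGRP_IPV6_ROUTE listeners about a route change.
 */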
2399 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2400 {
2401 	struct sk_buff *skb;
2402 	struct net *net = info->nl_net;
2403 	u32 seq;
2404 	int err;
2405 
2406 	err = -ENOBUFS;
2407 	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2408 
2409 	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2410 	if (skb == NULL)
2411 		goto errout;
2412 
2413 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2414 				event, info->pid, seq, 0, 0, 0);
2415 	if (err < 0) {
2416 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2417 		WARN_ON(err == -EMSGSIZE);
2418 		kfree_skb(skb);
2419 		goto errout;
2420 	}
2421 	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2422 		    info->nlh, gfp_any());
2423 	return;
2424 errout:
2425 	if (err < 0)
2426 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2427 }
2428 
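/*
 *	When the loopback device registers, attach the special null (and,
 *	with multiple tables, prohibit/blackhole) routes to it so they always
 *	reference a valid device.
 */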
2429 static int ip6_route_dev_notify(struct notifier_block *this,
2430 				unsigned long event, void *data)
2431 {
2432 	struct net_device *dev = (struct net_device *)data;
2433 	struct net *net = dev_net(dev);
2434 
2435 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2436 		net->ipv6.ip6_null_entry->dst.dev = dev;
2437 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2438 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2439 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2440 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2441 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2442 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2443 #endif
2444 	}
2445 
2446 	return NOTIFY_OK;
2447 }
2448 
2449 /*
2450  *	/proc
2451  */
2452 
2453 #ifdef CONFIG_PROC_FS
2454 
2455 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2456 
2457 struct rt6_proc_arg
2458 {
2459 	char *buffer;
2460 	int offset;
2461 	int length;
2462 	int skip;
2463 	int len;
2464 };
2465 
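/*
 *	Emit one line of /proc/net/ipv6_route: destination, source (subtrees
 *	only), next hop, metric, refcount, use count, flags and device name.
 */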
2466 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2467 {
2468 	struct seq_file *m = p_arg;
2469 
2470 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2471 
2472 #ifdef CONFIG_IPV6_SUBTREES
2473 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2474 #else
2475 	seq_puts(m, "00000000000000000000000000000000 00 ");
2476 #endif
2477 
2478 	if (rt->rt6i_nexthop) {
2479 		seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2480 	} else {
2481 		seq_puts(m, "00000000000000000000000000000000");
2482 	}
2483 	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2484 		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2485 		   rt->dst.__use, rt->rt6i_flags,
2486 		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2487 	return 0;
2488 }
2489 
2490 static int ipv6_route_show(struct seq_file *m, void *v)
2491 {
2492 	struct net *net = (struct net *)m->private;
2493 	fib6_clean_all(net, rt6_info_route, 0, m);
2494 	return 0;
2495 }
2496 
2497 static int ipv6_route_open(struct inode *inode, struct file *file)
2498 {
2499 	return single_open_net(inode, file, ipv6_route_show);
2500 }
2501 
2502 static const struct file_operations ipv6_route_proc_fops = {
2503 	.owner		= THIS_MODULE,
2504 	.open		= ipv6_route_open,
2505 	.read		= seq_read,
2506 	.llseek		= seq_lseek,
2507 	.release	= single_release_net,
2508 };
2509 
2510 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2511 {
2512 	struct net *net = (struct net *)seq->private;
2513 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2514 		   net->ipv6.rt6_stats->fib_nodes,
2515 		   net->ipv6.rt6_stats->fib_route_nodes,
2516 		   net->ipv6.rt6_stats->fib_rt_alloc,
2517 		   net->ipv6.rt6_stats->fib_rt_entries,
2518 		   net->ipv6.rt6_stats->fib_rt_cache,
2519 		   atomic_read(&net->ipv6.ip6_dst_ops.entries),
2520 		   net->ipv6.rt6_stats->fib_discarded_routes);
2521 
2522 	return 0;
2523 }
2524 
2525 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2526 {
2527 	return single_open_net(inode, file, rt6_stats_seq_show);
2528 }
2529 
2530 static const struct file_operations rt6_stats_seq_fops = {
2531 	.owner	 = THIS_MODULE,
2532 	.open	 = rt6_stats_seq_open,
2533 	.read	 = seq_read,
2534 	.llseek	 = seq_lseek,
2535 	.release = single_release_net,
2536 };
2537 #endif	/* CONFIG_PROC_FS */
2538 
2539 #ifdef CONFIG_SYSCTL
2540 
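/*
 *	Writing to the "flush" sysctl triggers an immediate garbage
 *	collection of the routing cache; the entry is write-only (mode 0200)
 *	and reads return -EINVAL.
 */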
2541 static
2542 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2543 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2544 {
2545 	struct net *net = current->nsproxy->net_ns;
2546 	int delay = net->ipv6.sysctl.flush_delay;
2547 	if (write) {
2548 		proc_dointvec(ctl, write, buffer, lenp, ppos);
2549 		fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2550 		return 0;
2551 	} else
2552 		return -EINVAL;
2553 }
2554 
2555 ctl_table ipv6_route_table_template[] = {
2556 	{
2557 		.procname	=	"flush",
2558 		.data		=	&init_net.ipv6.sysctl.flush_delay,
2559 		.maxlen		=	sizeof(int),
2560 		.mode		=	0200,
2561 		.proc_handler	=	ipv6_sysctl_rtcache_flush
2562 	},
2563 	{
2564 		.procname	=	"gc_thresh",
2565 		.data		=	&ip6_dst_ops_template.gc_thresh,
2566 		.maxlen		=	sizeof(int),
2567 		.mode		=	0644,
2568 		.proc_handler	=	proc_dointvec,
2569 	},
2570 	{
2571 		.procname	=	"max_size",
2572 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
2573 		.maxlen		=	sizeof(int),
2574 		.mode		=	0644,
2575 		.proc_handler	=	proc_dointvec,
2576 	},
2577 	{
2578 		.procname	=	"gc_min_interval",
2579 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2580 		.maxlen		=	sizeof(int),
2581 		.mode		=	0644,
2582 		.proc_handler	=	proc_dointvec_jiffies,
2583 	},
2584 	{
2585 		.procname	=	"gc_timeout",
2586 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2587 		.maxlen		=	sizeof(int),
2588 		.mode		=	0644,
2589 		.proc_handler	=	proc_dointvec_jiffies,
2590 	},
2591 	{
2592 		.procname	=	"gc_interval",
2593 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
2594 		.maxlen		=	sizeof(int),
2595 		.mode		=	0644,
2596 		.proc_handler	=	proc_dointvec_jiffies,
2597 	},
2598 	{
2599 		.procname	=	"gc_elasticity",
2600 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2601 		.maxlen		=	sizeof(int),
2602 		.mode		=	0644,
2603 		.proc_handler	=	proc_dointvec,
2604 	},
2605 	{
2606 		.procname	=	"mtu_expires",
2607 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2608 		.maxlen		=	sizeof(int),
2609 		.mode		=	0644,
2610 		.proc_handler	=	proc_dointvec_jiffies,
2611 	},
2612 	{
2613 		.procname	=	"min_adv_mss",
2614 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
2615 		.maxlen		=	sizeof(int),
2616 		.mode		=	0644,
2617 		.proc_handler	=	proc_dointvec,
2618 	},
2619 	{
2620 		.procname	=	"gc_min_interval_ms",
2621 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2622 		.maxlen		=	sizeof(int),
2623 		.mode		=	0644,
2624 		.proc_handler	=	proc_dointvec_ms_jiffies,
2625 	},
2626 	{ }
2627 };
2628 
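/*
 *	Per-namespace sysctl setup: duplicate the template and repoint each
 *	.data at this namespace's values.  The index-based fixups below must
 *	stay in sync with the template order above.
 */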
2629 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2630 {
2631 	struct ctl_table *table;
2632 
2633 	table = kmemdup(ipv6_route_table_template,
2634 			sizeof(ipv6_route_table_template),
2635 			GFP_KERNEL);
2636 
2637 	if (table) {
2638 		table[0].data = &net->ipv6.sysctl.flush_delay;
2639 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2640 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2641 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2642 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2643 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2644 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2645 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2646 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2647 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2648 	}
2649 
2650 	return table;
2651 }
2652 #endif
2653 
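/*
 *	Per-namespace initialization: clone the dst_ops and the special route
 *	templates, set default GC/MTU tunables and create the /proc entries.
 */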
2654 static int __net_init ip6_route_net_init(struct net *net)
2655 {
2656 	int ret = -ENOMEM;
2657 
2658 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2659 	       sizeof(net->ipv6.ip6_dst_ops));
2660 
2661 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2662 					   sizeof(*net->ipv6.ip6_null_entry),
2663 					   GFP_KERNEL);
2664 	if (!net->ipv6.ip6_null_entry)
2665 		goto out_ip6_dst_ops;
2666 	net->ipv6.ip6_null_entry->dst.path =
2667 		(struct dst_entry *)net->ipv6.ip6_null_entry;
2668 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2669 
2670 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2671 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2672 					       sizeof(*net->ipv6.ip6_prohibit_entry),
2673 					       GFP_KERNEL);
2674 	if (!net->ipv6.ip6_prohibit_entry)
2675 		goto out_ip6_null_entry;
2676 	net->ipv6.ip6_prohibit_entry->dst.path =
2677 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2678 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2679 
2680 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2681 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
2682 					       GFP_KERNEL);
2683 	if (!net->ipv6.ip6_blk_hole_entry)
2684 		goto out_ip6_prohibit_entry;
2685 	net->ipv6.ip6_blk_hole_entry->dst.path =
2686 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2687 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2688 #endif
2689 
2690 	net->ipv6.sysctl.flush_delay = 0;
2691 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
2692 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2693 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2694 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2695 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2696 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2697 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2698 
2699 #ifdef CONFIG_PROC_FS
2700 	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2701 	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2702 #endif
2703 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
2704 
2705 	ret = 0;
2706 out:
2707 	return ret;
2708 
2709 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2710 out_ip6_prohibit_entry:
2711 	kfree(net->ipv6.ip6_prohibit_entry);
2712 out_ip6_null_entry:
2713 	kfree(net->ipv6.ip6_null_entry);
2714 #endif
2715 out_ip6_dst_ops:
2716 	goto out;
2717 }
2718 
2719 static void __net_exit ip6_route_net_exit(struct net *net)
2720 {
2721 #ifdef CONFIG_PROC_FS
2722 	proc_net_remove(net, "ipv6_route");
2723 	proc_net_remove(net, "rt6_stats");
2724 #endif
2725 	kfree(net->ipv6.ip6_null_entry);
2726 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2727 	kfree(net->ipv6.ip6_prohibit_entry);
2728 	kfree(net->ipv6.ip6_blk_hole_entry);
2729 #endif
2730 }
2731 
2732 static struct pernet_operations ip6_route_net_ops = {
2733 	.init = ip6_route_net_init,
2734 	.exit = ip6_route_net_exit,
2735 };
2736 
2737 static struct notifier_block ip6_route_dev_notifier = {
2738 	.notifier_call = ip6_route_dev_notify,
2739 	.priority = 0,
2740 };
2741 
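/*
 *	Boot-time initialization.  On failure every step is unwound in
 *	reverse order via the labels at the end of the function.
 */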
2742 int __init ip6_route_init(void)
2743 {
2744 	int ret;
2745 
2746 	ret = -ENOMEM;
2747 	ip6_dst_ops_template.kmem_cachep =
2748 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2749 				  SLAB_HWCACHE_ALIGN, NULL);
2750 	if (!ip6_dst_ops_template.kmem_cachep)
2751 		goto out;
2752 
2753 	ret = register_pernet_subsys(&ip6_route_net_ops);
2754 	if (ret)
2755 		goto out_kmem_cache;
2756 
2757 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2758 
2759 	/* The loopback device is registered before this code runs, so the
2760 	 * rt6_info loopback references are not taken there; take them
2761 	 * manually for init_net. */
2762 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2763 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2764 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2765 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2766 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2767 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2768 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2769 #endif
2770 	ret = fib6_init();
2771 	if (ret)
2772 		goto out_register_subsys;
2773 
2774 	ret = xfrm6_init();
2775 	if (ret)
2776 		goto out_fib6_init;
2777 
2778 	ret = fib6_rules_init();
2779 	if (ret)
2780 		goto xfrm6_init;
2781 
2782 	ret = -ENOBUFS;
2783 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2784 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2785 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2786 		goto fib6_rules_init;
2787 
2788 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2789 	if (ret)
2790 		goto fib6_rules_init;
2791 
2792 out:
2793 	return ret;
2794 
2795 fib6_rules_init:
2796 	fib6_rules_cleanup();
2797 xfrm6_init:
2798 	xfrm6_fini();
2799 out_fib6_init:
2800 	fib6_gc_cleanup();
2801 out_register_subsys:
2802 	unregister_pernet_subsys(&ip6_route_net_ops);
2803 out_kmem_cache:
2804 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2805 	goto out;
2806 }
2807 
2808 void ip6_route_cleanup(void)
2809 {
2810 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
2811 	fib6_rules_cleanup();
2812 	xfrm6_fini();
2813 	fib6_gc_cleanup();
2814 	unregister_pernet_subsys(&ip6_route_net_ops);
2815 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2816 }
2817