xref: /linux/net/ipv6/route.c (revision cb299ba8b5ef2239429484072fea394cd7581bd7)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
66 
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74 
75 #define CLONE_OFFLINK_ROUTE 0
76 
77 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
78 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void		ip6_dst_destroy(struct dst_entry *);
81 static void		ip6_dst_ifdown(struct dst_entry *,
82 				       struct net_device *dev, int how);
83 static int		 ip6_dst_gc(struct dst_ops *ops);
84 
85 static int		ip6_pkt_discard(struct sk_buff *skb);
86 static int		ip6_pkt_discard_out(struct sk_buff *skb);
87 static void		ip6_link_failure(struct sk_buff *skb);
88 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89 
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92 					   struct in6_addr *prefix, int prefixlen,
93 					   struct in6_addr *gwaddr, int ifindex,
94 					   unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96 					   struct in6_addr *prefix, int prefixlen,
97 					   struct in6_addr *gwaddr, int ifindex);
98 #endif
99 
100 static struct dst_ops ip6_dst_ops_template = {
101 	.family			=	AF_INET6,
102 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
103 	.gc			=	ip6_dst_gc,
104 	.gc_thresh		=	1024,
105 	.check			=	ip6_dst_check,
106 	.destroy		=	ip6_dst_destroy,
107 	.ifdown			=	ip6_dst_ifdown,
108 	.negative_advice	=	ip6_negative_advice,
109 	.link_failure		=	ip6_link_failure,
110 	.update_pmtu		=	ip6_rt_update_pmtu,
111 	.local_out		=	__ip6_local_out,
112 };
113 
114 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
115 {
116 }
117 
118 static struct dst_ops ip6_dst_blackhole_ops = {
119 	.family			=	AF_INET6,
120 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
121 	.destroy		=	ip6_dst_destroy,
122 	.check			=	ip6_dst_check,
123 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
124 };
125 
126 static struct rt6_info ip6_null_entry_template = {
127 	.dst = {
128 		.__refcnt	= ATOMIC_INIT(1),
129 		.__use		= 1,
130 		.obsolete	= -1,
131 		.error		= -ENETUNREACH,
132 		.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
133 		.input		= ip6_pkt_discard,
134 		.output		= ip6_pkt_discard_out,
135 	},
136 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
137 	.rt6i_protocol  = RTPROT_KERNEL,
138 	.rt6i_metric	= ~(u32) 0,
139 	.rt6i_ref	= ATOMIC_INIT(1),
140 };
141 
142 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
143 
144 static int ip6_pkt_prohibit(struct sk_buff *skb);
145 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
146 
147 static struct rt6_info ip6_prohibit_entry_template = {
148 	.dst = {
149 		.__refcnt	= ATOMIC_INIT(1),
150 		.__use		= 1,
151 		.obsolete	= -1,
152 		.error		= -EACCES,
153 		.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
154 		.input		= ip6_pkt_prohibit,
155 		.output		= ip6_pkt_prohibit_out,
156 	},
157 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
158 	.rt6i_protocol  = RTPROT_KERNEL,
159 	.rt6i_metric	= ~(u32) 0,
160 	.rt6i_ref	= ATOMIC_INIT(1),
161 };
162 
163 static struct rt6_info ip6_blk_hole_entry_template = {
164 	.dst = {
165 		.__refcnt	= ATOMIC_INIT(1),
166 		.__use		= 1,
167 		.obsolete	= -1,
168 		.error		= -EINVAL,
169 		.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
170 		.input		= dst_discard,
171 		.output		= dst_discard,
172 	},
173 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
174 	.rt6i_protocol  = RTPROT_KERNEL,
175 	.rt6i_metric	= ~(u32) 0,
176 	.rt6i_ref	= ATOMIC_INIT(1),
177 };
178 
179 #endif
180 
181 /* allocate dst with ip6_dst_ops */
182 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
183 {
184 	return (struct rt6_info *)dst_alloc(ops);
185 }
186 
187 static void ip6_dst_destroy(struct dst_entry *dst)
188 {
189 	struct rt6_info *rt = (struct rt6_info *)dst;
190 	struct inet6_dev *idev = rt->rt6i_idev;
191 
192 	if (idev != NULL) {
193 		rt->rt6i_idev = NULL;
194 		in6_dev_put(idev);
195 	}
196 }
197 
198 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
199 			   int how)
200 {
201 	struct rt6_info *rt = (struct rt6_info *)dst;
202 	struct inet6_dev *idev = rt->rt6i_idev;
203 	struct net_device *loopback_dev =
204 		dev_net(dev)->loopback_dev;
205 
206 	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
207 		struct inet6_dev *loopback_idev =
208 			in6_dev_get(loopback_dev);
209 		if (loopback_idev != NULL) {
210 			rt->rt6i_idev = loopback_idev;
211 			in6_dev_put(idev);
212 		}
213 	}
214 }
215 
216 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
217 {
218 	return (rt->rt6i_flags & RTF_EXPIRES) &&
219 		time_after(jiffies, rt->rt6i_expires);
220 }
221 
222 static inline int rt6_need_strict(struct in6_addr *daddr)
223 {
224 	return ipv6_addr_type(daddr) &
225 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
226 }
227 
228 /*
229  *	Route lookup. The relevant table->tb6_lock is assumed to be held.
230  */
231 
232 static inline struct rt6_info *rt6_device_match(struct net *net,
233 						    struct rt6_info *rt,
234 						    struct in6_addr *saddr,
235 						    int oif,
236 						    int flags)
237 {
238 	struct rt6_info *local = NULL;
239 	struct rt6_info *sprt;
240 
241 	if (!oif && ipv6_addr_any(saddr))
242 		goto out;
243 
244 	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
245 		struct net_device *dev = sprt->rt6i_dev;
246 
247 		if (oif) {
248 			if (dev->ifindex == oif)
249 				return sprt;
250 			if (dev->flags & IFF_LOOPBACK) {
251 				if (sprt->rt6i_idev == NULL ||
252 				    sprt->rt6i_idev->dev->ifindex != oif) {
253 					if (flags & RT6_LOOKUP_F_IFACE && oif)
254 						continue;
255 					if (local && (!oif ||
256 						      local->rt6i_idev->dev->ifindex == oif))
257 						continue;
258 				}
259 				local = sprt;
260 			}
261 		} else {
262 			if (ipv6_chk_addr(net, saddr, dev,
263 					  flags & RT6_LOOKUP_F_IFACE))
264 				return sprt;
265 		}
266 	}
267 
268 	if (oif) {
269 		if (local)
270 			return local;
271 
272 		if (flags & RT6_LOOKUP_F_IFACE)
273 			return net->ipv6.ip6_null_entry;
274 	}
275 out:
276 	return rt;
277 }
278 
279 #ifdef CONFIG_IPV6_ROUTER_PREF
280 static void rt6_probe(struct rt6_info *rt)
281 {
282 	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
283 	/*
284 	 * Okay, this does not seem to be appropriate
285 	 * for now; however, we need to check whether it
286 	 * really is, aka Router Reachability Probing.
287 	 *
288 	 * Router Reachability Probe MUST be rate-limited
289 	 * to no more than one per minute.
290 	 */
291 	if (!neigh || (neigh->nud_state & NUD_VALID))
292 		return;
293 	read_lock_bh(&neigh->lock);
294 	if (!(neigh->nud_state & NUD_VALID) &&
295 	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
296 		struct in6_addr mcaddr;
297 		struct in6_addr *target;
298 
299 		neigh->updated = jiffies;
300 		read_unlock_bh(&neigh->lock);
301 
302 		target = (struct in6_addr *)&neigh->primary_key;
303 		addrconf_addr_solict_mult(target, &mcaddr);
304 		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
305 	} else
306 		read_unlock_bh(&neigh->lock);
307 }
308 #else
309 static inline void rt6_probe(struct rt6_info *rt)
310 {
311 }
312 #endif
313 
314 /*
315  * Default Router Selection (RFC 2461 6.3.6)
316  */
317 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
318 {
319 	struct net_device *dev = rt->rt6i_dev;
320 	if (!oif || dev->ifindex == oif)
321 		return 2;
322 	if ((dev->flags & IFF_LOOPBACK) &&
323 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
324 		return 1;
325 	return 0;
326 }
327 
328 static inline int rt6_check_neigh(struct rt6_info *rt)
329 {
330 	struct neighbour *neigh = rt->rt6i_nexthop;
331 	int m;
332 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
333 	    !(rt->rt6i_flags & RTF_GATEWAY))
334 		m = 1;
335 	else if (neigh) {
336 		read_lock_bh(&neigh->lock);
337 		if (neigh->nud_state & NUD_VALID)
338 			m = 2;
339 #ifdef CONFIG_IPV6_ROUTER_PREF
340 		else if (neigh->nud_state & NUD_FAILED)
341 			m = 0;
342 #endif
343 		else
344 			m = 1;
345 		read_unlock_bh(&neigh->lock);
346 	} else
347 		m = 0;
348 	return m;
349 }
350 
351 static int rt6_score_route(struct rt6_info *rt, int oif,
352 			   int strict)
353 {
354 	int m, n;
355 
356 	m = rt6_check_dev(rt, oif);
357 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
358 		return -1;
359 #ifdef CONFIG_IPV6_ROUTER_PREF
360 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
361 #endif
362 	n = rt6_check_neigh(rt);
363 	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
364 		return -1;
365 	return m;
366 }
367 
368 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
369 				   int *mpri, struct rt6_info *match)
370 {
371 	int m;
372 
373 	if (rt6_check_expired(rt))
374 		goto out;
375 
376 	m = rt6_score_route(rt, oif, strict);
377 	if (m < 0)
378 		goto out;
379 
380 	if (m > *mpri) {
381 		if (strict & RT6_LOOKUP_F_REACHABLE)
382 			rt6_probe(match);
383 		*mpri = m;
384 		match = rt;
385 	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
386 		rt6_probe(rt);
387 	}
388 
389 out:
390 	return match;
391 }
392 
393 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
394 				     struct rt6_info *rr_head,
395 				     u32 metric, int oif, int strict)
396 {
397 	struct rt6_info *rt, *match;
398 	int mpri = -1;
399 
400 	match = NULL;
401 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
402 	     rt = rt->dst.rt6_next)
403 		match = find_match(rt, oif, strict, &mpri, match);
404 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
405 	     rt = rt->dst.rt6_next)
406 		match = find_match(rt, oif, strict, &mpri, match);
407 
408 	return match;
409 }
410 
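/*
 * Round-robin route selection: score the routes at this node starting
 * from fn->rr_ptr.  If nothing matched under the reachability
 * requirement, advance rr_ptr to the next route of the same metric so
 * another router is tried next time.  Returns the best match or the
 * null entry.
 */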
411 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
412 {
413 	struct rt6_info *match, *rt0;
414 	struct net *net;
415 
416 	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
417 		  __func__, fn->leaf, oif);
418 
419 	rt0 = fn->rr_ptr;
420 	if (!rt0)
421 		fn->rr_ptr = rt0 = fn->leaf;
422 
423 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
424 
425 	if (!match &&
426 	    (strict & RT6_LOOKUP_F_REACHABLE)) {
427 		struct rt6_info *next = rt0->dst.rt6_next;
428 
429 		/* no entries matched; do round-robin */
430 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
431 			next = fn->leaf;
432 
433 		if (next != rt0)
434 			fn->rr_ptr = next;
435 	}
436 
437 	RT6_TRACE("%s() => %p\n",
438 		  __func__, match);
439 
440 	net = dev_net(rt0->rt6i_dev);
441 	return match ? match : net->ipv6.ip6_null_entry;
442 }
443 
444 #ifdef CONFIG_IPV6_ROUTE_INFO
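/*
 * Process a Route Information Option (RFC 4191) received in a Router
 * Advertisement: sanity-check its length and prefix, then add, refresh
 * or delete the matching RTF_ROUTEINFO route according to the
 * advertised lifetime and preference.
 */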
445 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
446 		  struct in6_addr *gwaddr)
447 {
448 	struct net *net = dev_net(dev);
449 	struct route_info *rinfo = (struct route_info *) opt;
450 	struct in6_addr prefix_buf, *prefix;
451 	unsigned int pref;
452 	unsigned long lifetime;
453 	struct rt6_info *rt;
454 
455 	if (len < sizeof(struct route_info)) {
456 		return -EINVAL;
457 	}
458 
459 	/* Sanity check for prefix_len and length */
460 	if (rinfo->length > 3) {
461 		return -EINVAL;
462 	} else if (rinfo->prefix_len > 128) {
463 		return -EINVAL;
464 	} else if (rinfo->prefix_len > 64) {
465 		if (rinfo->length < 2) {
466 			return -EINVAL;
467 		}
468 	} else if (rinfo->prefix_len > 0) {
469 		if (rinfo->length < 1) {
470 			return -EINVAL;
471 		}
472 	}
473 
474 	pref = rinfo->route_pref;
475 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
476 		return -EINVAL;
477 
478 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
479 
480 	if (rinfo->length == 3)
481 		prefix = (struct in6_addr *)rinfo->prefix;
482 	else {
483 		/* this function is safe */
484 		ipv6_addr_prefix(&prefix_buf,
485 				 (struct in6_addr *)rinfo->prefix,
486 				 rinfo->prefix_len);
487 		prefix = &prefix_buf;
488 	}
489 
490 	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
491 				dev->ifindex);
492 
493 	if (rt && !lifetime) {
494 		ip6_del_rt(rt);
495 		rt = NULL;
496 	}
497 
498 	if (!rt && lifetime)
499 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
500 					pref);
501 	else if (rt)
502 		rt->rt6i_flags = RTF_ROUTEINFO |
503 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
504 
505 	if (rt) {
506 		if (!addrconf_finite_timeout(lifetime)) {
507 			rt->rt6i_flags &= ~RTF_EXPIRES;
508 		} else {
509 			rt->rt6i_expires = jiffies + HZ * lifetime;
510 			rt->rt6i_flags |= RTF_EXPIRES;
511 		}
512 		dst_release(&rt->dst);
513 	}
514 	return 0;
515 }
516 #endif
517 
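/*
 * If the lookup only found the null entry, walk back up the fib tree
 * (dropping into a parent's source-address subtree where one exists)
 * until a node carrying route info (RTN_RTINFO) is reached, then jump
 * to the caller's "restart" label; bail out through "out" once the tree
 * root is hit.  Relies on rt, fn, restart and out in the calling scope.
 */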
518 #define BACKTRACK(__net, saddr)			\
519 do { \
520 	if (rt == __net->ipv6.ip6_null_entry) {	\
521 		struct fib6_node *pn; \
522 		while (1) { \
523 			if (fn->fn_flags & RTN_TL_ROOT) \
524 				goto out; \
525 			pn = fn->parent; \
526 			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
527 				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
528 			else \
529 				fn = pn; \
530 			if (fn->fn_flags & RTN_RTINFO) \
531 				goto restart; \
532 		} \
533 	} \
534 } while(0)
535 
536 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
537 					     struct fib6_table *table,
538 					     struct flowi *fl, int flags)
539 {
540 	struct fib6_node *fn;
541 	struct rt6_info *rt;
542 
543 	read_lock_bh(&table->tb6_lock);
544 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
545 restart:
546 	rt = fn->leaf;
547 	rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
548 	BACKTRACK(net, &fl->fl6_src);
549 out:
550 	dst_use(&rt->dst, jiffies);
551 	read_unlock_bh(&table->tb6_lock);
552 	return rt;
553 
554 }
555 
556 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
557 			    const struct in6_addr *saddr, int oif, int strict)
558 {
559 	struct flowi fl = {
560 		.oif = oif,
561 		.nl_u = {
562 			.ip6_u = {
563 				.daddr = *daddr,
564 			},
565 		},
566 	};
567 	struct dst_entry *dst;
568 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
569 
570 	if (saddr) {
571 		memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
572 		flags |= RT6_LOOKUP_F_HAS_SADDR;
573 	}
574 
575 	dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
576 	if (dst->error == 0)
577 		return (struct rt6_info *) dst;
578 
579 	dst_release(dst);
580 
581 	return NULL;
582 }
583 
584 EXPORT_SYMBOL(rt6_lookup);
585 
586 /* ip6_ins_rt is called with table->tb6_lock free (not held).
587    It takes a new route entry; if the addition fails for any reason,
588    the route is freed.  In any case, if the caller does not hold a
589    reference, the route may be destroyed.
590  */
591 
592 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
593 {
594 	int err;
595 	struct fib6_table *table;
596 
597 	table = rt->rt6i_table;
598 	write_lock_bh(&table->tb6_lock);
599 	err = fib6_add(&table->tb6_root, rt, info);
600 	write_unlock_bh(&table->tb6_lock);
601 
602 	return err;
603 }
604 
605 int ip6_ins_rt(struct rt6_info *rt)
606 {
607 	struct nl_info info = {
608 		.nl_net = dev_net(rt->rt6i_dev),
609 	};
610 	return __ip6_ins_rt(rt, &info);
611 }
612 
613 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
614 				      struct in6_addr *saddr)
615 {
616 	struct rt6_info *rt;
617 
618 	/*
619 	 *	Clone the route.
620 	 */
621 
622 	rt = ip6_rt_copy(ort);
623 
624 	if (rt) {
625 		struct neighbour *neigh;
626 		int attempts = !in_softirq();
627 
628 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
629 			if (rt->rt6i_dst.plen != 128 &&
630 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
631 				rt->rt6i_flags |= RTF_ANYCAST;
632 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
633 		}
634 
635 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
636 		rt->rt6i_dst.plen = 128;
637 		rt->rt6i_flags |= RTF_CACHE;
638 		rt->dst.flags |= DST_HOST;
639 
640 #ifdef CONFIG_IPV6_SUBTREES
641 		if (rt->rt6i_src.plen && saddr) {
642 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
643 			rt->rt6i_src.plen = 128;
644 		}
645 #endif
646 
647 	retry:
648 		neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
649 		if (IS_ERR(neigh)) {
650 			struct net *net = dev_net(rt->rt6i_dev);
651 			int saved_rt_min_interval =
652 				net->ipv6.sysctl.ip6_rt_gc_min_interval;
653 			int saved_rt_elasticity =
654 				net->ipv6.sysctl.ip6_rt_gc_elasticity;
655 
656 			if (attempts-- > 0) {
657 				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
658 				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
659 
660 				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
661 
662 				net->ipv6.sysctl.ip6_rt_gc_elasticity =
663 					saved_rt_elasticity;
664 				net->ipv6.sysctl.ip6_rt_gc_min_interval =
665 					saved_rt_min_interval;
666 				goto retry;
667 			}
668 
669 			if (net_ratelimit())
670 				printk(KERN_WARNING
671 				       "ipv6: Neighbour table overflow.\n");
672 			dst_free(&rt->dst);
673 			return NULL;
674 		}
675 		rt->rt6i_nexthop = neigh;
676 
677 	}
678 
679 	return rt;
680 }
681 
682 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
683 {
684 	struct rt6_info *rt = ip6_rt_copy(ort);
685 	if (rt) {
686 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
687 		rt->rt6i_dst.plen = 128;
688 		rt->rt6i_flags |= RTF_CACHE;
689 		rt->dst.flags |= DST_HOST;
690 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
691 	}
692 	return rt;
693 }
694 
695 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
696 				      struct flowi *fl, int flags)
697 {
698 	struct fib6_node *fn;
699 	struct rt6_info *rt, *nrt;
700 	int strict = 0;
701 	int attempts = 3;
702 	int err;
703 	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
704 
705 	strict |= flags & RT6_LOOKUP_F_IFACE;
706 
707 relookup:
708 	read_lock_bh(&table->tb6_lock);
709 
710 restart_2:
711 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
712 
713 restart:
714 	rt = rt6_select(fn, oif, strict | reachable);
715 
716 	BACKTRACK(net, &fl->fl6_src);
717 	if (rt == net->ipv6.ip6_null_entry ||
718 	    rt->rt6i_flags & RTF_CACHE)
719 		goto out;
720 
721 	dst_hold(&rt->dst);
722 	read_unlock_bh(&table->tb6_lock);
723 
724 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
725 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
726 	else {
727 #if CLONE_OFFLINK_ROUTE
728 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
729 #else
730 		goto out2;
731 #endif
732 	}
733 
734 	dst_release(&rt->dst);
735 	rt = nrt ? : net->ipv6.ip6_null_entry;
736 
737 	dst_hold(&rt->dst);
738 	if (nrt) {
739 		err = ip6_ins_rt(nrt);
740 		if (!err)
741 			goto out2;
742 	}
743 
744 	if (--attempts <= 0)
745 		goto out2;
746 
747 	/*
748 	 * Race condition! In the gap, when table->tb6_lock was
749 	 * released someone could insert this route.  Relookup.
750 	 */
751 	dst_release(&rt->dst);
752 	goto relookup;
753 
754 out:
755 	if (reachable) {
756 		reachable = 0;
757 		goto restart_2;
758 	}
759 	dst_hold(&rt->dst);
760 	read_unlock_bh(&table->tb6_lock);
761 out2:
762 	rt->dst.lastuse = jiffies;
763 	rt->dst.__use++;
764 
765 	return rt;
766 }
767 
768 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
769 					    struct flowi *fl, int flags)
770 {
771 	return ip6_pol_route(net, table, fl->iif, fl, flags);
772 }
773 
774 void ip6_route_input(struct sk_buff *skb)
775 {
776 	struct ipv6hdr *iph = ipv6_hdr(skb);
777 	struct net *net = dev_net(skb->dev);
778 	int flags = RT6_LOOKUP_F_HAS_SADDR;
779 	struct flowi fl = {
780 		.iif = skb->dev->ifindex,
781 		.nl_u = {
782 			.ip6_u = {
783 				.daddr = iph->daddr,
784 				.saddr = iph->saddr,
785 				.flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
786 			},
787 		},
788 		.mark = skb->mark,
789 		.proto = iph->nexthdr,
790 	};
791 
792 	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
793 		flags |= RT6_LOOKUP_F_IFACE;
794 
795 	skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
796 }
797 
798 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
799 					     struct flowi *fl, int flags)
800 {
801 	return ip6_pol_route(net, table, fl->oif, fl, flags);
802 }
803 
804 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
805 				    struct flowi *fl)
806 {
807 	int flags = 0;
808 
809 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
810 		flags |= RT6_LOOKUP_F_IFACE;
811 
812 	if (!ipv6_addr_any(&fl->fl6_src))
813 		flags |= RT6_LOOKUP_F_HAS_SADDR;
814 	else if (sk)
815 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
816 
817 	return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
818 }
819 
820 EXPORT_SYMBOL(ip6_route_output);
821 
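/*
 * Replace *dstp with a standalone "blackhole" copy of it: a dst that
 * silently discards anything sent through it while keeping the original
 * metrics, device and addresses.  The copy is never linked into a fib
 * table; dst_free() is called up front so it is reclaimed once the last
 * reference is dropped.
 */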
822 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
823 {
824 	struct rt6_info *ort = (struct rt6_info *) *dstp;
825 	struct rt6_info *rt = (struct rt6_info *)
826 		dst_alloc(&ip6_dst_blackhole_ops);
827 	struct dst_entry *new = NULL;
828 
829 	if (rt) {
830 		new = &rt->dst;
831 
832 		atomic_set(&new->__refcnt, 1);
833 		new->__use = 1;
834 		new->input = dst_discard;
835 		new->output = dst_discard;
836 
837 		memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
838 		new->dev = ort->dst.dev;
839 		if (new->dev)
840 			dev_hold(new->dev);
841 		rt->rt6i_idev = ort->rt6i_idev;
842 		if (rt->rt6i_idev)
843 			in6_dev_hold(rt->rt6i_idev);
844 		rt->rt6i_expires = 0;
845 
846 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
847 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
848 		rt->rt6i_metric = 0;
849 
850 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
851 #ifdef CONFIG_IPV6_SUBTREES
852 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
853 #endif
854 
855 		dst_free(new);
856 	}
857 
858 	dst_release(*dstp);
859 	*dstp = new;
860 	return new ? 0 : -ENOMEM;
861 }
862 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
863 
864 /*
865  *	Destination cache support functions
866  */
867 
868 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
869 {
870 	struct rt6_info *rt;
871 
872 	rt = (struct rt6_info *) dst;
873 
874 	if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
875 		return dst;
876 
877 	return NULL;
878 }
879 
880 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
881 {
882 	struct rt6_info *rt = (struct rt6_info *) dst;
883 
884 	if (rt) {
885 		if (rt->rt6i_flags & RTF_CACHE) {
886 			if (rt6_check_expired(rt)) {
887 				ip6_del_rt(rt);
888 				dst = NULL;
889 			}
890 		} else {
891 			dst_release(dst);
892 			dst = NULL;
893 		}
894 	}
895 	return dst;
896 }
897 
898 static void ip6_link_failure(struct sk_buff *skb)
899 {
900 	struct rt6_info *rt;
901 
902 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
903 
904 	rt = (struct rt6_info *) skb_dst(skb);
905 	if (rt) {
906 		if (rt->rt6i_flags&RTF_CACHE) {
907 			dst_set_expires(&rt->dst, 0);
908 			rt->rt6i_flags |= RTF_EXPIRES;
909 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
910 			rt->rt6i_node->fn_sernum = -1;
911 	}
912 }
913 
914 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
915 {
916 	struct rt6_info *rt6 = (struct rt6_info*)dst;
917 
918 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
919 		rt6->rt6i_flags |= RTF_MODIFIED;
920 		if (mtu < IPV6_MIN_MTU) {
921 			mtu = IPV6_MIN_MTU;
922 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
923 		}
924 		dst->metrics[RTAX_MTU-1] = mtu;
925 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
926 	}
927 }
928 
929 static int ipv6_get_mtu(struct net_device *dev);
930 
931 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
932 {
933 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
934 
935 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
936 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
937 
938 	/*
939 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
940 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
941 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
942 	 * rely only on pmtu discovery"
943 	 */
944 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
945 		mtu = IPV6_MAXPLEN;
946 	return mtu;
947 }
948 
949 static struct dst_entry *icmp6_dst_gc_list;
950 static DEFINE_SPINLOCK(icmp6_dst_lock);
951 
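/*
 * Allocate a dst for ndisc/ICMPv6 traffic.  These entries are not
 * inserted into the fib; they are chained on icmp6_dst_gc_list and
 * reaped by icmp6_dst_gc() once their refcount drops to zero.
 */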
952 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
953 				  struct neighbour *neigh,
954 				  const struct in6_addr *addr)
955 {
956 	struct rt6_info *rt;
957 	struct inet6_dev *idev = in6_dev_get(dev);
958 	struct net *net = dev_net(dev);
959 
960 	if (unlikely(idev == NULL))
961 		return NULL;
962 
963 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
964 	if (unlikely(rt == NULL)) {
965 		in6_dev_put(idev);
966 		goto out;
967 	}
968 
969 	dev_hold(dev);
970 	if (neigh)
971 		neigh_hold(neigh);
972 	else {
973 		neigh = ndisc_get_neigh(dev, addr);
974 		if (IS_ERR(neigh))
975 			neigh = NULL;
976 	}
977 
978 	rt->rt6i_dev	  = dev;
979 	rt->rt6i_idev     = idev;
980 	rt->rt6i_nexthop  = neigh;
981 	atomic_set(&rt->dst.__refcnt, 1);
982 	rt->dst.metrics[RTAX_HOPLIMIT-1] = 255;
983 	rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
984 	rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
985 	rt->dst.output  = ip6_output;
986 
987 #if 0	/* there's no chance to use these for ndisc */
988 	rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
989 				? DST_HOST
990 				: 0;
991 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
992 	rt->rt6i_dst.plen = 128;
993 #endif
994 
995 	spin_lock_bh(&icmp6_dst_lock);
996 	rt->dst.next = icmp6_dst_gc_list;
997 	icmp6_dst_gc_list = &rt->dst;
998 	spin_unlock_bh(&icmp6_dst_lock);
999 
1000 	fib6_force_start_gc(net);
1001 
1002 out:
1003 	return &rt->dst;
1004 }
1005 
1006 int icmp6_dst_gc(void)
1007 {
1008 	struct dst_entry *dst, *next, **pprev;
1009 	int more = 0;
1010 
1011 	next = NULL;
1012 
1013 	spin_lock_bh(&icmp6_dst_lock);
1014 	pprev = &icmp6_dst_gc_list;
1015 
1016 	while ((dst = *pprev) != NULL) {
1017 		if (!atomic_read(&dst->__refcnt)) {
1018 			*pprev = dst->next;
1019 			dst_free(dst);
1020 		} else {
1021 			pprev = &dst->next;
1022 			++more;
1023 		}
1024 	}
1025 
1026 	spin_unlock_bh(&icmp6_dst_lock);
1027 
1028 	return more;
1029 }
1030 
1031 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1032 			    void *arg)
1033 {
1034 	struct dst_entry *dst, **pprev;
1035 
1036 	spin_lock_bh(&icmp6_dst_lock);
1037 	pprev = &icmp6_dst_gc_list;
1038 	while ((dst = *pprev) != NULL) {
1039 		struct rt6_info *rt = (struct rt6_info *) dst;
1040 		if (func(rt, arg)) {
1041 			*pprev = dst->next;
1042 			dst_free(dst);
1043 		} else {
1044 			pprev = &dst->next;
1045 		}
1046 	}
1047 	spin_unlock_bh(&icmp6_dst_lock);
1048 }
1049 
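/*
 * dst_ops garbage-collection hook: skip the run when we collected
 * recently and the table is not above ip6_rt_max_size; otherwise run
 * fib6_run_gc() with an expire interval that is bumped on every
 * pressured run, reset to half ip6_rt_gc_timeout once the entry count
 * falls below gc_thresh, and decayed by ip6_rt_gc_elasticity each call.
 */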
1050 static int ip6_dst_gc(struct dst_ops *ops)
1051 {
1052 	unsigned long now = jiffies;
1053 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1054 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1055 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1056 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1057 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1058 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1059 	int entries;
1060 
1061 	entries = dst_entries_get_fast(ops);
1062 	if (time_after(rt_last_gc + rt_min_interval, now) &&
1063 	    entries <= rt_max_size)
1064 		goto out;
1065 
1066 	net->ipv6.ip6_rt_gc_expire++;
1067 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1068 	net->ipv6.ip6_rt_last_gc = now;
1069 	entries = dst_entries_get_slow(ops);
1070 	if (entries < ops->gc_thresh)
1071 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1072 out:
1073 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1074 	return entries > rt_max_size;
1075 }
1076 
1077 /* Clean the host part of a prefix. Not necessary in a radix tree,
1078    but it results in cleaner routing tables.
1079 
1080    Remove this only once everything is known to work!
1081  */
1082 
1083 static int ipv6_get_mtu(struct net_device *dev)
1084 {
1085 	int mtu = IPV6_MIN_MTU;
1086 	struct inet6_dev *idev;
1087 
1088 	rcu_read_lock();
1089 	idev = __in6_dev_get(dev);
1090 	if (idev)
1091 		mtu = idev->cnf.mtu6;
1092 	rcu_read_unlock();
1093 	return mtu;
1094 }
1095 
1096 int ip6_dst_hoplimit(struct dst_entry *dst)
1097 {
1098 	int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1099 	if (hoplimit < 0) {
1100 		struct net_device *dev = dst->dev;
1101 		struct inet6_dev *idev;
1102 
1103 		rcu_read_lock();
1104 		idev = __in6_dev_get(dev);
1105 		if (idev)
1106 			hoplimit = idev->cnf.hop_limit;
1107 		else
1108 			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1109 		rcu_read_unlock();
1110 	}
1111 	return hoplimit;
1112 }
1113 
1114 /*
1115  *	Add a route described by a fib6_config.
1116  */
1117 
1118 int ip6_route_add(struct fib6_config *cfg)
1119 {
1120 	int err;
1121 	struct net *net = cfg->fc_nlinfo.nl_net;
1122 	struct rt6_info *rt = NULL;
1123 	struct net_device *dev = NULL;
1124 	struct inet6_dev *idev = NULL;
1125 	struct fib6_table *table;
1126 	int addr_type;
1127 
1128 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1129 		return -EINVAL;
1130 #ifndef CONFIG_IPV6_SUBTREES
1131 	if (cfg->fc_src_len)
1132 		return -EINVAL;
1133 #endif
1134 	if (cfg->fc_ifindex) {
1135 		err = -ENODEV;
1136 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1137 		if (!dev)
1138 			goto out;
1139 		idev = in6_dev_get(dev);
1140 		if (!idev)
1141 			goto out;
1142 	}
1143 
1144 	if (cfg->fc_metric == 0)
1145 		cfg->fc_metric = IP6_RT_PRIO_USER;
1146 
1147 	table = fib6_new_table(net, cfg->fc_table);
1148 	if (table == NULL) {
1149 		err = -ENOBUFS;
1150 		goto out;
1151 	}
1152 
1153 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1154 
1155 	if (rt == NULL) {
1156 		err = -ENOMEM;
1157 		goto out;
1158 	}
1159 
1160 	rt->dst.obsolete = -1;
1161 	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1162 				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1163 				0;
1164 
1165 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1166 		cfg->fc_protocol = RTPROT_BOOT;
1167 	rt->rt6i_protocol = cfg->fc_protocol;
1168 
1169 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1170 
1171 	if (addr_type & IPV6_ADDR_MULTICAST)
1172 		rt->dst.input = ip6_mc_input;
1173 	else if (cfg->fc_flags & RTF_LOCAL)
1174 		rt->dst.input = ip6_input;
1175 	else
1176 		rt->dst.input = ip6_forward;
1177 
1178 	rt->dst.output = ip6_output;
1179 
1180 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1181 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1182 	if (rt->rt6i_dst.plen == 128)
1183 	       rt->dst.flags = DST_HOST;
1184 
1185 #ifdef CONFIG_IPV6_SUBTREES
1186 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1187 	rt->rt6i_src.plen = cfg->fc_src_len;
1188 #endif
1189 
1190 	rt->rt6i_metric = cfg->fc_metric;
1191 
1192 	/* We cannot add true routes via loopback here;
1193 	   they would result in kernel looping.  Promote them to reject routes.
1194 	 */
1195 	if ((cfg->fc_flags & RTF_REJECT) ||
1196 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1197 					      && !(cfg->fc_flags&RTF_LOCAL))) {
1198 		/* hold loopback dev/idev if we haven't done so. */
1199 		if (dev != net->loopback_dev) {
1200 			if (dev) {
1201 				dev_put(dev);
1202 				in6_dev_put(idev);
1203 			}
1204 			dev = net->loopback_dev;
1205 			dev_hold(dev);
1206 			idev = in6_dev_get(dev);
1207 			if (!idev) {
1208 				err = -ENODEV;
1209 				goto out;
1210 			}
1211 		}
1212 		rt->dst.output = ip6_pkt_discard_out;
1213 		rt->dst.input = ip6_pkt_discard;
1214 		rt->dst.error = -ENETUNREACH;
1215 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1216 		goto install_route;
1217 	}
1218 
1219 	if (cfg->fc_flags & RTF_GATEWAY) {
1220 		struct in6_addr *gw_addr;
1221 		int gwa_type;
1222 
1223 		gw_addr = &cfg->fc_gateway;
1224 		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1225 		gwa_type = ipv6_addr_type(gw_addr);
1226 
1227 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1228 			struct rt6_info *grt;
1229 
1230 			/* IPv6 strictly forbids using non-link-local
1231 			   addresses as the nexthop address.
1232 			   Otherwise, the router will not be able to send
1233 			   redirects.  That is very good, but in some (rare!)
1234 			   circumstances (SIT, PtP, NBMA NOARP links) it is
1235 			   handy to allow some exceptions. --ANK
1236 			 */
1237 			err = -EINVAL;
1238 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1239 				goto out;
1240 
1241 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1242 
1243 			err = -EHOSTUNREACH;
1244 			if (grt == NULL)
1245 				goto out;
1246 			if (dev) {
1247 				if (dev != grt->rt6i_dev) {
1248 					dst_release(&grt->dst);
1249 					goto out;
1250 				}
1251 			} else {
1252 				dev = grt->rt6i_dev;
1253 				idev = grt->rt6i_idev;
1254 				dev_hold(dev);
1255 				in6_dev_hold(grt->rt6i_idev);
1256 			}
1257 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1258 				err = 0;
1259 			dst_release(&grt->dst);
1260 
1261 			if (err)
1262 				goto out;
1263 		}
1264 		err = -EINVAL;
1265 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1266 			goto out;
1267 	}
1268 
1269 	err = -ENODEV;
1270 	if (dev == NULL)
1271 		goto out;
1272 
1273 	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1274 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1275 		if (IS_ERR(rt->rt6i_nexthop)) {
1276 			err = PTR_ERR(rt->rt6i_nexthop);
1277 			rt->rt6i_nexthop = NULL;
1278 			goto out;
1279 		}
1280 	}
1281 
1282 	rt->rt6i_flags = cfg->fc_flags;
1283 
1284 install_route:
1285 	if (cfg->fc_mx) {
1286 		struct nlattr *nla;
1287 		int remaining;
1288 
1289 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1290 			int type = nla_type(nla);
1291 
1292 			if (type) {
1293 				if (type > RTAX_MAX) {
1294 					err = -EINVAL;
1295 					goto out;
1296 				}
1297 
1298 				rt->dst.metrics[type - 1] = nla_get_u32(nla);
1299 			}
1300 		}
1301 	}
1302 
1303 	if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
1304 		rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
1305 	if (!dst_mtu(&rt->dst))
1306 		rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1307 	if (!dst_metric(&rt->dst, RTAX_ADVMSS))
1308 		rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
1309 	rt->dst.dev = dev;
1310 	rt->rt6i_idev = idev;
1311 	rt->rt6i_table = table;
1312 
1313 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1314 
1315 	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1316 
1317 out:
1318 	if (dev)
1319 		dev_put(dev);
1320 	if (idev)
1321 		in6_dev_put(idev);
1322 	if (rt)
1323 		dst_free(&rt->dst);
1324 	return err;
1325 }
1326 
1327 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1328 {
1329 	int err;
1330 	struct fib6_table *table;
1331 	struct net *net = dev_net(rt->rt6i_dev);
1332 
1333 	if (rt == net->ipv6.ip6_null_entry)
1334 		return -ENOENT;
1335 
1336 	table = rt->rt6i_table;
1337 	write_lock_bh(&table->tb6_lock);
1338 
1339 	err = fib6_del(rt, info);
1340 	dst_release(&rt->dst);
1341 
1342 	write_unlock_bh(&table->tb6_lock);
1343 
1344 	return err;
1345 }
1346 
1347 int ip6_del_rt(struct rt6_info *rt)
1348 {
1349 	struct nl_info info = {
1350 		.nl_net = dev_net(rt->rt6i_dev),
1351 	};
1352 	return __ip6_del_rt(rt, &info);
1353 }
1354 
1355 static int ip6_route_del(struct fib6_config *cfg)
1356 {
1357 	struct fib6_table *table;
1358 	struct fib6_node *fn;
1359 	struct rt6_info *rt;
1360 	int err = -ESRCH;
1361 
1362 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1363 	if (table == NULL)
1364 		return err;
1365 
1366 	read_lock_bh(&table->tb6_lock);
1367 
1368 	fn = fib6_locate(&table->tb6_root,
1369 			 &cfg->fc_dst, cfg->fc_dst_len,
1370 			 &cfg->fc_src, cfg->fc_src_len);
1371 
1372 	if (fn) {
1373 		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1374 			if (cfg->fc_ifindex &&
1375 			    (rt->rt6i_dev == NULL ||
1376 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1377 				continue;
1378 			if (cfg->fc_flags & RTF_GATEWAY &&
1379 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1380 				continue;
1381 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1382 				continue;
1383 			dst_hold(&rt->dst);
1384 			read_unlock_bh(&table->tb6_lock);
1385 
1386 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1387 		}
1388 	}
1389 	read_unlock_bh(&table->tb6_lock);
1390 
1391 	return err;
1392 }
1393 
1394 /*
1395  *	Handle redirects
1396  */
1397 struct ip6rd_flowi {
1398 	struct flowi fl;
1399 	struct in6_addr gateway;
1400 };
1401 
1402 static struct rt6_info *__ip6_route_redirect(struct net *net,
1403 					     struct fib6_table *table,
1404 					     struct flowi *fl,
1405 					     int flags)
1406 {
1407 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1408 	struct rt6_info *rt;
1409 	struct fib6_node *fn;
1410 
1411 	/*
1412 	 * Get the "current" route for this destination and
1413 	 * check if the redirect has come from an appropriate router.
1414 	 *
1415 	 * RFC 2461 specifies that redirects should only be
1416 	 * accepted if they come from the nexthop to the target.
1417 	 * Due to the way the routes are chosen, this notion
1418 	 * is a bit fuzzy and one might need to check all possible
1419 	 * routes.
1420 	 */
1421 
1422 	read_lock_bh(&table->tb6_lock);
1423 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1424 restart:
1425 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1426 		/*
1427 		 * Current route is on-link; redirect is always invalid.
1428 		 *
1429 		 * Seems the previous statement is not true.  It could
1430 		 * be a node which sees us as on-link (e.g. proxy ndisc).
1431 		 * But then the router serving it might decide that we
1432 		 * should know the truth 8)8) --ANK (980726).
1433 		 */
1434 		if (rt6_check_expired(rt))
1435 			continue;
1436 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1437 			continue;
1438 		if (fl->oif != rt->rt6i_dev->ifindex)
1439 			continue;
1440 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1441 			continue;
1442 		break;
1443 	}
1444 
1445 	if (!rt)
1446 		rt = net->ipv6.ip6_null_entry;
1447 	BACKTRACK(net, &fl->fl6_src);
1448 out:
1449 	dst_hold(&rt->dst);
1450 
1451 	read_unlock_bh(&table->tb6_lock);
1452 
1453 	return rt;
1454 };
1455 
1456 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1457 					   struct in6_addr *src,
1458 					   struct in6_addr *gateway,
1459 					   struct net_device *dev)
1460 {
1461 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1462 	struct net *net = dev_net(dev);
1463 	struct ip6rd_flowi rdfl = {
1464 		.fl = {
1465 			.oif = dev->ifindex,
1466 			.nl_u = {
1467 				.ip6_u = {
1468 					.daddr = *dest,
1469 					.saddr = *src,
1470 				},
1471 			},
1472 		},
1473 	};
1474 
1475 	ipv6_addr_copy(&rdfl.gateway, gateway);
1476 
1477 	if (rt6_need_strict(dest))
1478 		flags |= RT6_LOOKUP_F_IFACE;
1479 
1480 	return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1481 						   flags, __ip6_route_redirect);
1482 }
1483 
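/*
 * Accept a validated ICMPv6 redirect: update the neighbour cache entry
 * for the new nexthop and install a /128 RTF_DYNAMIC|RTF_CACHE route
 * through it, removing the old cached route to the destination if there
 * was one.
 */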
1484 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1485 		  struct in6_addr *saddr,
1486 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1487 {
1488 	struct rt6_info *rt, *nrt = NULL;
1489 	struct netevent_redirect netevent;
1490 	struct net *net = dev_net(neigh->dev);
1491 
1492 	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1493 
1494 	if (rt == net->ipv6.ip6_null_entry) {
1495 		if (net_ratelimit())
1496 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1497 			       "for redirect target\n");
1498 		goto out;
1499 	}
1500 
1501 	/*
1502 	 *	We have finally decided to accept it.
1503 	 */
1504 
1505 	neigh_update(neigh, lladdr, NUD_STALE,
1506 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1507 		     NEIGH_UPDATE_F_OVERRIDE|
1508 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1509 				     NEIGH_UPDATE_F_ISROUTER))
1510 		     );
1511 
1512 	/*
1513 	 * Redirect received -> path was valid.
1514 	 * Look, redirects are sent only in response to data packets,
1515 	 * so that this nexthop apparently is reachable. --ANK
1516 	 */
1517 	dst_confirm(&rt->dst);
1518 
1519 	/* Duplicate redirect: silently ignore. */
1520 	if (neigh == rt->dst.neighbour)
1521 		goto out;
1522 
1523 	nrt = ip6_rt_copy(rt);
1524 	if (nrt == NULL)
1525 		goto out;
1526 
1527 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1528 	if (on_link)
1529 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1530 
1531 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1532 	nrt->rt6i_dst.plen = 128;
1533 	nrt->dst.flags |= DST_HOST;
1534 
1535 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1536 	nrt->rt6i_nexthop = neigh_clone(neigh);
1537 	/* Reset pmtu, it may be better */
1538 	nrt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1539 	nrt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1540 							dst_mtu(&nrt->dst));
1541 
1542 	if (ip6_ins_rt(nrt))
1543 		goto out;
1544 
1545 	netevent.old = &rt->dst;
1546 	netevent.new = &nrt->dst;
1547 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1548 
1549 	if (rt->rt6i_flags&RTF_CACHE) {
1550 		ip6_del_rt(rt);
1551 		return;
1552 	}
1553 
1554 out:
1555 	dst_release(&rt->dst);
1556 }
1557 
1558 /*
1559  *	Handle ICMP "packet too big" messages
1560  *	i.e. Path MTU discovery
1561  */
1562 
1563 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1564 			     struct net *net, u32 pmtu, int ifindex)
1565 {
1566 	struct rt6_info *rt, *nrt;
1567 	int allfrag = 0;
1568 
1569 	rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1570 	if (rt == NULL)
1571 		return;
1572 
1573 	if (pmtu >= dst_mtu(&rt->dst))
1574 		goto out;
1575 
1576 	if (pmtu < IPV6_MIN_MTU) {
1577 		/*
1578 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1579 		 * MTU (1280) and a fragment header should always be included
1580 		 * after a node receiving Too Big message reporting PMTU is
1581 		 * less than the IPv6 Minimum Link MTU.
1582 		 */
1583 		pmtu = IPV6_MIN_MTU;
1584 		allfrag = 1;
1585 	}
1586 
1587 	/* New mtu received -> path was valid.
1588 	   Too Big messages are sent only in response to data packets,
1589 	   so this nexthop apparently is reachable. --ANK
1590 	 */
1591 	dst_confirm(&rt->dst);
1592 
1593 	/* Host route. If it is static, it would be better
1594 	   not to override it but to add a new one, so that
1595 	   when the cache entry expires the old pmtu
1596 	   is restored automatically.
1597 	 */
1598 	if (rt->rt6i_flags & RTF_CACHE) {
1599 		rt->dst.metrics[RTAX_MTU-1] = pmtu;
1600 		if (allfrag)
1601 			rt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1602 		dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1603 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1604 		goto out;
1605 	}
1606 
1607 	/* Network route.
1608 	   Two cases are possible:
1609 	   1. It is connected route. Action: COW
1610 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1611 	 */
1612 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1613 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1614 	else
1615 		nrt = rt6_alloc_clone(rt, daddr);
1616 
1617 	if (nrt) {
1618 		nrt->dst.metrics[RTAX_MTU-1] = pmtu;
1619 		if (allfrag)
1620 			nrt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1621 
1622 		/* According to RFC 1981, probing for a PMTU increase should not
1623 		 * happen within 5 minutes; the recommended timer is 10 minutes.
1624 		 * Here the route expiration time is set to ip6_rt_mtu_expires,
1625 		 * which is 10 minutes.  After 10 minutes the decreased pmtu
1626 		 * expires and detection of a PMTU increase happens automatically.
1627 		 */
1628 		dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1629 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1630 
1631 		ip6_ins_rt(nrt);
1632 	}
1633 out:
1634 	dst_release(&rt->dst);
1635 }
1636 
1637 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1638 			struct net_device *dev, u32 pmtu)
1639 {
1640 	struct net *net = dev_net(dev);
1641 
1642 	/*
1643 	 * RFC 1981 states that a node "MUST reduce the size of the packets it
1644 	 * is sending along the path" that caused the Packet Too Big message.
1645 	 * Since it's not possible in the general case to determine which
1646 	 * interface was used to send the original packet, we update the MTU
1647 	 * on the interface that will be used to send future packets. We also
1648 	 * update the MTU on the interface that received the Packet Too Big in
1649 	 * case the original packet was forced out that interface with
1650 	 * SO_BINDTODEVICE or similar. This is the next best thing to the
1651 	 * correct behaviour, which would be to update the MTU on all
1652 	 * interfaces.
1653 	 */
1654 	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1655 	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1656 }
1657 
1658 /*
1659  *	Misc support functions
1660  */
1661 
1662 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1663 {
1664 	struct net *net = dev_net(ort->rt6i_dev);
1665 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1666 
1667 	if (rt) {
1668 		rt->dst.input = ort->dst.input;
1669 		rt->dst.output = ort->dst.output;
1670 
1671 		memcpy(rt->dst.metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
1672 		rt->dst.error = ort->dst.error;
1673 		rt->dst.dev = ort->dst.dev;
1674 		if (rt->dst.dev)
1675 			dev_hold(rt->dst.dev);
1676 		rt->rt6i_idev = ort->rt6i_idev;
1677 		if (rt->rt6i_idev)
1678 			in6_dev_hold(rt->rt6i_idev);
1679 		rt->dst.lastuse = jiffies;
1680 		rt->rt6i_expires = 0;
1681 
1682 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1683 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1684 		rt->rt6i_metric = 0;
1685 
1686 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1687 #ifdef CONFIG_IPV6_SUBTREES
1688 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1689 #endif
1690 		rt->rt6i_table = ort->rt6i_table;
1691 	}
1692 	return rt;
1693 }
1694 
1695 #ifdef CONFIG_IPV6_ROUTE_INFO
1696 static struct rt6_info *rt6_get_route_info(struct net *net,
1697 					   struct in6_addr *prefix, int prefixlen,
1698 					   struct in6_addr *gwaddr, int ifindex)
1699 {
1700 	struct fib6_node *fn;
1701 	struct rt6_info *rt = NULL;
1702 	struct fib6_table *table;
1703 
1704 	table = fib6_get_table(net, RT6_TABLE_INFO);
1705 	if (table == NULL)
1706 		return NULL;
1707 
1708 	write_lock_bh(&table->tb6_lock);
1709 	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1710 	if (!fn)
1711 		goto out;
1712 
1713 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1714 		if (rt->rt6i_dev->ifindex != ifindex)
1715 			continue;
1716 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1717 			continue;
1718 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1719 			continue;
1720 		dst_hold(&rt->dst);
1721 		break;
1722 	}
1723 out:
1724 	write_unlock_bh(&table->tb6_lock);
1725 	return rt;
1726 }
1727 
1728 static struct rt6_info *rt6_add_route_info(struct net *net,
1729 					   struct in6_addr *prefix, int prefixlen,
1730 					   struct in6_addr *gwaddr, int ifindex,
1731 					   unsigned pref)
1732 {
1733 	struct fib6_config cfg = {
1734 		.fc_table	= RT6_TABLE_INFO,
1735 		.fc_metric	= IP6_RT_PRIO_USER,
1736 		.fc_ifindex	= ifindex,
1737 		.fc_dst_len	= prefixlen,
1738 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1739 				  RTF_UP | RTF_PREF(pref),
1740 		.fc_nlinfo.pid = 0,
1741 		.fc_nlinfo.nlh = NULL,
1742 		.fc_nlinfo.nl_net = net,
1743 	};
1744 
1745 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1746 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1747 
1748 	/* We should treat it as a default route if prefix length is 0. */
1749 	if (!prefixlen)
1750 		cfg.fc_flags |= RTF_DEFAULT;
1751 
1752 	ip6_route_add(&cfg);
1753 
1754 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1755 }
1756 #endif
1757 
1758 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1759 {
1760 	struct rt6_info *rt;
1761 	struct fib6_table *table;
1762 
1763 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1764 	if (table == NULL)
1765 		return NULL;
1766 
1767 	write_lock_bh(&table->tb6_lock);
1768 	for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1769 		if (dev == rt->rt6i_dev &&
1770 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1771 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1772 			break;
1773 	}
1774 	if (rt)
1775 		dst_hold(&rt->dst);
1776 	write_unlock_bh(&table->tb6_lock);
1777 	return rt;
1778 }
1779 
1780 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1781 				     struct net_device *dev,
1782 				     unsigned int pref)
1783 {
1784 	struct fib6_config cfg = {
1785 		.fc_table	= RT6_TABLE_DFLT,
1786 		.fc_metric	= IP6_RT_PRIO_USER,
1787 		.fc_ifindex	= dev->ifindex,
1788 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1789 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1790 		.fc_nlinfo.pid = 0,
1791 		.fc_nlinfo.nlh = NULL,
1792 		.fc_nlinfo.nl_net = dev_net(dev),
1793 	};
1794 
1795 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1796 
1797 	ip6_route_add(&cfg);
1798 
1799 	return rt6_get_dflt_router(gwaddr, dev);
1800 }
1801 
1802 void rt6_purge_dflt_routers(struct net *net)
1803 {
1804 	struct rt6_info *rt;
1805 	struct fib6_table *table;
1806 
1807 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1808 	table = fib6_get_table(net, RT6_TABLE_DFLT);
1809 	if (table == NULL)
1810 		return;
1811 
1812 restart:
1813 	read_lock_bh(&table->tb6_lock);
1814 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1815 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1816 			dst_hold(&rt->dst);
1817 			read_unlock_bh(&table->tb6_lock);
1818 			ip6_del_rt(rt);
1819 			goto restart;
1820 		}
1821 	}
1822 	read_unlock_bh(&table->tb6_lock);
1823 }
1824 
1825 static void rtmsg_to_fib6_config(struct net *net,
1826 				 struct in6_rtmsg *rtmsg,
1827 				 struct fib6_config *cfg)
1828 {
1829 	memset(cfg, 0, sizeof(*cfg));
1830 
1831 	cfg->fc_table = RT6_TABLE_MAIN;
1832 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1833 	cfg->fc_metric = rtmsg->rtmsg_metric;
1834 	cfg->fc_expires = rtmsg->rtmsg_info;
1835 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1836 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1837 	cfg->fc_flags = rtmsg->rtmsg_flags;
1838 
1839 	cfg->fc_nlinfo.nl_net = net;
1840 
1841 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1842 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1843 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1844 }
1845 
1846 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1847 {
1848 	struct fib6_config cfg;
1849 	struct in6_rtmsg rtmsg;
1850 	int err;
1851 
1852 	switch(cmd) {
1853 	case SIOCADDRT:		/* Add a route */
1854 	case SIOCDELRT:		/* Delete a route */
1855 		if (!capable(CAP_NET_ADMIN))
1856 			return -EPERM;
1857 		err = copy_from_user(&rtmsg, arg,
1858 				     sizeof(struct in6_rtmsg));
1859 		if (err)
1860 			return -EFAULT;
1861 
1862 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1863 
1864 		rtnl_lock();
1865 		switch (cmd) {
1866 		case SIOCADDRT:
1867 			err = ip6_route_add(&cfg);
1868 			break;
1869 		case SIOCDELRT:
1870 			err = ip6_route_del(&cfg);
1871 			break;
1872 		default:
1873 			err = -EINVAL;
1874 		}
1875 		rtnl_unlock();
1876 
1877 		return err;
1878 	}
1879 
1880 	return -EINVAL;
1881 }
1882 
1883 /*
1884  *	Drop the packet on the floor
1885  */
1886 
1887 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1888 {
1889 	int type;
1890 	struct dst_entry *dst = skb_dst(skb);
1891 	switch (ipstats_mib_noroutes) {
1892 	case IPSTATS_MIB_INNOROUTES:
1893 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1894 		if (type == IPV6_ADDR_ANY) {
1895 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1896 				      IPSTATS_MIB_INADDRERRORS);
1897 			break;
1898 		}
1899 		/* FALLTHROUGH */
1900 	case IPSTATS_MIB_OUTNOROUTES:
1901 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1902 			      ipstats_mib_noroutes);
1903 		break;
1904 	}
1905 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1906 	kfree_skb(skb);
1907 	return 0;
1908 }
1909 
1910 static int ip6_pkt_discard(struct sk_buff *skb)
1911 {
1912 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1913 }
1914 
1915 static int ip6_pkt_discard_out(struct sk_buff *skb)
1916 {
1917 	skb->dev = skb_dst(skb)->dev;
1918 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1919 }
1920 
1921 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1922 
1923 static int ip6_pkt_prohibit(struct sk_buff *skb)
1924 {
1925 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1926 }
1927 
1928 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1929 {
1930 	skb->dev = skb_dst(skb)->dev;
1931 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1932 }
1933 
1934 #endif
1935 
1936 /*
1937  *	Allocate a dst for local (unicast / anycast) address.
1938  */
1939 
1940 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1941 				    const struct in6_addr *addr,
1942 				    int anycast)
1943 {
1944 	struct net *net = dev_net(idev->dev);
1945 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1946 	struct neighbour *neigh;
1947 
1948 	if (rt == NULL)
1949 		return ERR_PTR(-ENOMEM);
1950 
1951 	dev_hold(net->loopback_dev);
1952 	in6_dev_hold(idev);
1953 
1954 	rt->dst.flags = DST_HOST;
1955 	rt->dst.input = ip6_input;
1956 	rt->dst.output = ip6_output;
1957 	rt->rt6i_dev = net->loopback_dev;
1958 	rt->rt6i_idev = idev;
1959 	rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1960 	rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
1961 	rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
1962 	rt->dst.obsolete = -1;
1963 
1964 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1965 	if (anycast)
1966 		rt->rt6i_flags |= RTF_ANYCAST;
1967 	else
1968 		rt->rt6i_flags |= RTF_LOCAL;
1969 	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1970 	if (IS_ERR(neigh)) {
1971 		dst_free(&rt->dst);
1972 
1973 		/* We are casting this because that is the return
1974 		 * value type.  But an errno encoded pointer is the
1975 		 * same regardless of the underlying pointer type,
1976 		 * and that's what we are returning.  So this is OK.
1977 		 */
1978 		return (struct rt6_info *) neigh;
1979 	}
1980 	rt->rt6i_nexthop = neigh;
1981 
1982 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1983 	rt->rt6i_dst.plen = 128;
1984 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1985 
1986 	atomic_set(&rt->dst.__refcnt, 1);
1987 
1988 	return rt;
1989 }
1990 
1991 struct arg_dev_net {
1992 	struct net_device *dev;
1993 	struct net *net;
1994 };
1995 
1996 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1997 {
1998 	struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1999 	struct net *net = ((struct arg_dev_net *)arg)->net;
2000 
2001 	if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2002 	    rt != net->ipv6.ip6_null_entry) {
2003 		RT6_TRACE("deleted by ifdown %p\n", rt);
2004 		return -1;
2005 	}
2006 	return 0;
2007 }
2008 
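/* Remove every route bound to @dev (all device routes when dev == NULL). */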
2009 void rt6_ifdown(struct net *net, struct net_device *dev)
2010 {
2011 	struct arg_dev_net adn = {
2012 		.dev = dev,
2013 		.net = net,
2014 	};
2015 
2016 	fib6_clean_all(net, fib6_ifdown, 0, &adn);
2017 	icmp6_clean_all(fib6_ifdown, &adn);
2018 }
2019 
2020 struct rt6_mtu_change_arg
2021 {
2022 	struct net_device *dev;
2023 	unsigned mtu;
2024 };
2025 
2026 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2027 {
2028 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2029 	struct inet6_dev *idev;
2030 	struct net *net = dev_net(arg->dev);
2031 
2032 	/* In IPv6, PMTU discovery is not optional, so the
2033 	   RTAX_MTU metric lock cannot be used to disable it.
2034 	   We still use the lock to block MTU changes
2035 	   caused by addrconf/ndisc.
2036 	*/
2037 
2038 	idev = __in6_dev_get(arg->dev);
2039 	if (idev == NULL)
2040 		return 0;
2041 
2042 	/* An administrative MTU increase cannot be detected by IPv6
2043 	   PMTU discovery, so the increase has to be applied here.
2044 	   Since RFC 1981 does not cover administrative MTU increases,
2045 	   updating the PMTU on increase is a MUST (e.g. jumbo frames).
2046 	 */
2047 	/*
2048 	   If the new MTU is less than the route PMTU, the new MTU will be
2049 	   the lowest MTU in the path; update the route PMTU to reflect
2050 	   the decrease.  If the new MTU is greater than the route PMTU,
2051 	   and the old MTU was the lowest MTU in the path, update the
2052 	   route PMTU to reflect the increase.  If another node on the path
2053 	   still has the lowest MTU, its Packet Too Big messages will
2054 	   trigger PMTU discovery again.
2055 	 */
2056 	if (rt->rt6i_dev == arg->dev &&
2057 	    !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2058 	    (dst_mtu(&rt->dst) >= arg->mtu ||
2059 	     (dst_mtu(&rt->dst) < arg->mtu &&
2060 	      dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2061 		rt->dst.metrics[RTAX_MTU-1] = arg->mtu;
2062 		rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
2063 	}
2064 	return 0;
2065 }
2066 
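/*
 * Walk all FIB tables and adjust cached route MTUs when a device's MTU
 * changes; the update rules live in rt6_mtu_change_route() above.
 */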
2067 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2068 {
2069 	struct rt6_mtu_change_arg arg = {
2070 		.dev = dev,
2071 		.mtu = mtu,
2072 	};
2073 
2074 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2075 }
2076 
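/*
 * Netlink attribute policy for IPv6 route requests.  RTA_DST and RTA_SRC
 * are not listed here; their lengths are checked by hand where they are
 * used (rtm_to_fib6_config() and inet6_rtm_getroute()).
 */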
2077 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2078 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2079 	[RTA_OIF]               = { .type = NLA_U32 },
2080 	[RTA_IIF]		= { .type = NLA_U32 },
2081 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2082 	[RTA_METRICS]           = { .type = NLA_NESTED },
2083 };
2084 
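/*
 * Translate an RTM_NEWROUTE/RTM_DELROUTE request into a struct
 * fib6_config.  Returns 0 on success or a negative errno (-EINVAL for
 * malformed requests).
 */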
2085 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2086 			      struct fib6_config *cfg)
2087 {
2088 	struct rtmsg *rtm;
2089 	struct nlattr *tb[RTA_MAX+1];
2090 	int err;
2091 
2092 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2093 	if (err < 0)
2094 		goto errout;
2095 
2096 	err = -EINVAL;
2097 	rtm = nlmsg_data(nlh);
2098 	memset(cfg, 0, sizeof(*cfg));
2099 
2100 	cfg->fc_table = rtm->rtm_table;
2101 	cfg->fc_dst_len = rtm->rtm_dst_len;
2102 	cfg->fc_src_len = rtm->rtm_src_len;
2103 	cfg->fc_flags = RTF_UP;
2104 	cfg->fc_protocol = rtm->rtm_protocol;
2105 
2106 	if (rtm->rtm_type == RTN_UNREACHABLE)
2107 		cfg->fc_flags |= RTF_REJECT;
2108 
2109 	if (rtm->rtm_type == RTN_LOCAL)
2110 		cfg->fc_flags |= RTF_LOCAL;
2111 
2112 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2113 	cfg->fc_nlinfo.nlh = nlh;
2114 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2115 
2116 	if (tb[RTA_GATEWAY]) {
2117 		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2118 		cfg->fc_flags |= RTF_GATEWAY;
2119 	}
2120 
2121 	if (tb[RTA_DST]) {
2122 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2123 
2124 		if (nla_len(tb[RTA_DST]) < plen)
2125 			goto errout;
2126 
2127 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2128 	}
2129 
2130 	if (tb[RTA_SRC]) {
2131 		int plen = (rtm->rtm_src_len + 7) >> 3;
2132 
2133 		if (nla_len(tb[RTA_SRC]) < plen)
2134 			goto errout;
2135 
2136 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2137 	}
2138 
2139 	if (tb[RTA_OIF])
2140 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2141 
2142 	if (tb[RTA_PRIORITY])
2143 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2144 
2145 	if (tb[RTA_METRICS]) {
2146 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2147 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2148 	}
2149 
2150 	if (tb[RTA_TABLE])
2151 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2152 
2153 	err = 0;
2154 errout:
2155 	return err;
2156 }
2157 
2158 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
2159 {
2160 	struct fib6_config cfg;
2161 	int err;
2162 
2163 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2164 	if (err < 0)
2165 		return err;
2166 
2167 	return ip6_route_del(&cfg);
2168 }
2169 
2170 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
2171 {
2172 	struct fib6_config cfg;
2173 	int err;
2174 
2175 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2176 	if (err < 0)
2177 		return err;
2178 
2179 	return ip6_route_add(&cfg);
2180 }
2181 
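/*
 * Worst-case size of a single route message; it must cover everything
 * rt6_fill_node() can emit (see the WARN_ON() in inet6_rt_notify()).
 */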
2182 static inline size_t rt6_nlmsg_size(void)
2183 {
2184 	return NLMSG_ALIGN(sizeof(struct rtmsg))
2185 	       + nla_total_size(16) /* RTA_SRC */
2186 	       + nla_total_size(16) /* RTA_DST */
2187 	       + nla_total_size(16) /* RTA_GATEWAY */
2188 	       + nla_total_size(16) /* RTA_PREFSRC */
2189 	       + nla_total_size(4) /* RTA_TABLE */
2190 	       + nla_total_size(4) /* RTA_IIF */
2191 	       + nla_total_size(4) /* RTA_OIF */
2192 	       + nla_total_size(4) /* RTA_PRIORITY */
2193 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2194 	       + nla_total_size(sizeof(struct rta_cacheinfo));
2195 }
2196 
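/*
 * Build one route message describing @rt.  Returns the message length,
 * 1 when a prefix-only dump skips a non-prefix route, or -EMSGSIZE when
 * the skb has no room left.
 */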
2197 static int rt6_fill_node(struct net *net,
2198 			 struct sk_buff *skb, struct rt6_info *rt,
2199 			 struct in6_addr *dst, struct in6_addr *src,
2200 			 int iif, int type, u32 pid, u32 seq,
2201 			 int prefix, int nowait, unsigned int flags)
2202 {
2203 	struct rtmsg *rtm;
2204 	struct nlmsghdr *nlh;
2205 	long expires;
2206 	u32 table;
2207 
2208 	if (prefix) {	/* user wants prefix routes only */
2209 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2210 			/* success since this is not a prefix route */
2211 			return 1;
2212 		}
2213 	}
2214 
2215 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2216 	if (nlh == NULL)
2217 		return -EMSGSIZE;
2218 
2219 	rtm = nlmsg_data(nlh);
2220 	rtm->rtm_family = AF_INET6;
2221 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2222 	rtm->rtm_src_len = rt->rt6i_src.plen;
2223 	rtm->rtm_tos = 0;
2224 	if (rt->rt6i_table)
2225 		table = rt->rt6i_table->tb6_id;
2226 	else
2227 		table = RT6_TABLE_UNSPEC;
2228 	rtm->rtm_table = table;
2229 	NLA_PUT_U32(skb, RTA_TABLE, table);
2230 	if (rt->rt6i_flags&RTF_REJECT)
2231 		rtm->rtm_type = RTN_UNREACHABLE;
2232 	else if (rt->rt6i_flags&RTF_LOCAL)
2233 		rtm->rtm_type = RTN_LOCAL;
2234 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2235 		rtm->rtm_type = RTN_LOCAL;
2236 	else
2237 		rtm->rtm_type = RTN_UNICAST;
2238 	rtm->rtm_flags = 0;
2239 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2240 	rtm->rtm_protocol = rt->rt6i_protocol;
2241 	if (rt->rt6i_flags&RTF_DYNAMIC)
2242 		rtm->rtm_protocol = RTPROT_REDIRECT;
2243 	else if (rt->rt6i_flags & RTF_ADDRCONF)
2244 		rtm->rtm_protocol = RTPROT_KERNEL;
2245 	else if (rt->rt6i_flags&RTF_DEFAULT)
2246 		rtm->rtm_protocol = RTPROT_RA;
2247 
2248 	if (rt->rt6i_flags&RTF_CACHE)
2249 		rtm->rtm_flags |= RTM_F_CLONED;
2250 
2251 	if (dst) {
2252 		NLA_PUT(skb, RTA_DST, 16, dst);
2253 		rtm->rtm_dst_len = 128;
2254 	} else if (rtm->rtm_dst_len)
2255 		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2256 #ifdef CONFIG_IPV6_SUBTREES
2257 	if (src) {
2258 		NLA_PUT(skb, RTA_SRC, 16, src);
2259 		rtm->rtm_src_len = 128;
2260 	} else if (rtm->rtm_src_len)
2261 		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2262 #endif
2263 	if (iif) {
2264 #ifdef CONFIG_IPV6_MROUTE
2265 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2266 			int err = ip6mr_get_route(net, skb, rtm, nowait);
2267 			if (err <= 0) {
2268 				if (!nowait) {
2269 					if (err == 0)
2270 						return 0;
2271 					goto nla_put_failure;
2272 				} else {
2273 					if (err == -EMSGSIZE)
2274 						goto nla_put_failure;
2275 				}
2276 			}
2277 		} else
2278 #endif
2279 			NLA_PUT_U32(skb, RTA_IIF, iif);
2280 	} else if (dst) {
2281 		struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2282 		struct in6_addr saddr_buf;
2283 		if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2284 				       dst, 0, &saddr_buf) == 0)
2285 			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2286 	}
2287 
2288 	if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
2289 		goto nla_put_failure;
2290 
2291 	if (rt->dst.neighbour)
2292 		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2293 
2294 	if (rt->dst.dev)
2295 		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2296 
2297 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2298 
2299 	if (!(rt->rt6i_flags & RTF_EXPIRES))
2300 		expires = 0;
2301 	else if (rt->rt6i_expires - jiffies < INT_MAX)
2302 		expires = rt->rt6i_expires - jiffies;
2303 	else
2304 		expires = INT_MAX;
2305 
2306 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2307 			       expires, rt->dst.error) < 0)
2308 		goto nla_put_failure;
2309 
2310 	return nlmsg_end(skb, nlh);
2311 
2312 nla_put_failure:
2313 	nlmsg_cancel(skb, nlh);
2314 	return -EMSGSIZE;
2315 }
2316 
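/*
 * fib6 dump callback: emit one RTM_NEWROUTE per route, honoring the
 * RTM_F_PREFIX filter when the request header carries one.
 */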
2317 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2318 {
2319 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2320 	int prefix;
2321 
2322 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2323 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2324 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2325 	} else
2326 		prefix = 0;
2327 
2328 	return rt6_fill_node(arg->net,
2329 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2330 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2331 		     prefix, 0, NLM_F_MULTI);
2332 }
2333 
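/*
 * RTM_GETROUTE for a single destination: build a flow from the request,
 * look up the route and answer the sender with one RTM_NEWROUTE message.
 */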
2334 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2335 {
2336 	struct net *net = sock_net(in_skb->sk);
2337 	struct nlattr *tb[RTA_MAX+1];
2338 	struct rt6_info *rt;
2339 	struct sk_buff *skb;
2340 	struct rtmsg *rtm;
2341 	struct flowi fl;
2342 	int err, iif = 0;
2343 
2344 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2345 	if (err < 0)
2346 		goto errout;
2347 
2348 	err = -EINVAL;
2349 	memset(&fl, 0, sizeof(fl));
2350 
2351 	if (tb[RTA_SRC]) {
2352 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2353 			goto errout;
2354 
2355 		ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2356 	}
2357 
2358 	if (tb[RTA_DST]) {
2359 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2360 			goto errout;
2361 
2362 		ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2363 	}
2364 
2365 	if (tb[RTA_IIF])
2366 		iif = nla_get_u32(tb[RTA_IIF]);
2367 
2368 	if (tb[RTA_OIF])
2369 		fl.oif = nla_get_u32(tb[RTA_OIF]);
2370 
2371 	if (iif) {
2372 		struct net_device *dev;
2373 		dev = __dev_get_by_index(net, iif);
2374 		if (!dev) {
2375 			err = -ENODEV;
2376 			goto errout;
2377 		}
2378 	}
2379 
2380 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2381 	if (skb == NULL) {
2382 		err = -ENOBUFS;
2383 		goto errout;
2384 	}
2385 
2386 	/* Reserve room for dummy headers; this skb can pass
2387 	   through a good chunk of the routing engine.
2388 	 */
2389 	skb_reset_mac_header(skb);
2390 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2391 
2392 	rt = (struct rt6_info *) ip6_route_output(net, NULL, &fl);
2393 	skb_dst_set(skb, &rt->dst);
2394 
2395 	err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2396 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2397 			    nlh->nlmsg_seq, 0, 0, 0);
2398 	if (err < 0) {
2399 		kfree_skb(skb);
2400 		goto errout;
2401 	}
2402 
2403 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2404 errout:
2405 	return err;
2406 }
2407 
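/*
 * Notify RTNLGRP_IPV6_ROUTE listeners about a route change (@event is
 * typically RTM_NEWROUTE or RTM_DELROUTE).
 */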
2408 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2409 {
2410 	struct sk_buff *skb;
2411 	struct net *net = info->nl_net;
2412 	u32 seq;
2413 	int err;
2414 
2415 	err = -ENOBUFS;
2416 	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2417 
2418 	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2419 	if (skb == NULL)
2420 		goto errout;
2421 
2422 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2423 				event, info->pid, seq, 0, 0, 0);
2424 	if (err < 0) {
2425 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2426 		WARN_ON(err == -EMSGSIZE);
2427 		kfree_skb(skb);
2428 		goto errout;
2429 	}
2430 	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2431 		    info->nlh, gfp_any());
2432 	return;
2433 errout:
2434 	if (err < 0)
2435 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2436 }
2437 
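/*
 * When a namespace's loopback device registers, bind the special null
 * (and, with CONFIG_IPV6_MULTIPLE_TABLES, prohibit and blackhole) routes
 * to it.
 */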
2438 static int ip6_route_dev_notify(struct notifier_block *this,
2439 				unsigned long event, void *data)
2440 {
2441 	struct net_device *dev = (struct net_device *)data;
2442 	struct net *net = dev_net(dev);
2443 
2444 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2445 		net->ipv6.ip6_null_entry->dst.dev = dev;
2446 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2447 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2448 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2449 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2450 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2451 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2452 #endif
2453 	}
2454 
2455 	return NOTIFY_OK;
2456 }
2457 
2458 /*
2459  *	/proc
2460  */
2461 
2462 #ifdef CONFIG_PROC_FS
2463 
2464 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2465 
2466 struct rt6_proc_arg
2467 {
2468 	char *buffer;
2469 	int offset;
2470 	int length;
2471 	int skip;
2472 	int len;
2473 };
2474 
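/*
 * One line of /proc/net/ipv6_route: destination/plen, source/plen,
 * next hop, metric, refcount, use count, flags and device name.
 */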
2475 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2476 {
2477 	struct seq_file *m = p_arg;
2478 
2479 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2480 
2481 #ifdef CONFIG_IPV6_SUBTREES
2482 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2483 #else
2484 	seq_puts(m, "00000000000000000000000000000000 00 ");
2485 #endif
2486 
2487 	if (rt->rt6i_nexthop) {
2488 		seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2489 	} else {
2490 		seq_puts(m, "00000000000000000000000000000000");
2491 	}
2492 	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2493 		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2494 		   rt->dst.__use, rt->rt6i_flags,
2495 		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2496 	return 0;
2497 }
2498 
2499 static int ipv6_route_show(struct seq_file *m, void *v)
2500 {
2501 	struct net *net = (struct net *)m->private;
2502 	fib6_clean_all(net, rt6_info_route, 0, m);
2503 	return 0;
2504 }
2505 
2506 static int ipv6_route_open(struct inode *inode, struct file *file)
2507 {
2508 	return single_open_net(inode, file, ipv6_route_show);
2509 }
2510 
2511 static const struct file_operations ipv6_route_proc_fops = {
2512 	.owner		= THIS_MODULE,
2513 	.open		= ipv6_route_open,
2514 	.read		= seq_read,
2515 	.llseek		= seq_lseek,
2516 	.release	= single_release_net,
2517 };
2518 
2519 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2520 {
2521 	struct net *net = (struct net *)seq->private;
2522 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2523 		   net->ipv6.rt6_stats->fib_nodes,
2524 		   net->ipv6.rt6_stats->fib_route_nodes,
2525 		   net->ipv6.rt6_stats->fib_rt_alloc,
2526 		   net->ipv6.rt6_stats->fib_rt_entries,
2527 		   net->ipv6.rt6_stats->fib_rt_cache,
2528 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2529 		   net->ipv6.rt6_stats->fib_discarded_routes);
2530 
2531 	return 0;
2532 }
2533 
2534 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2535 {
2536 	return single_open_net(inode, file, rt6_stats_seq_show);
2537 }
2538 
2539 static const struct file_operations rt6_stats_seq_fops = {
2540 	.owner	 = THIS_MODULE,
2541 	.open	 = rt6_stats_seq_open,
2542 	.read	 = seq_read,
2543 	.llseek	 = seq_lseek,
2544 	.release = single_release_net,
2545 };
2546 #endif	/* CONFIG_PROC_FS */
2547 
2548 #ifdef CONFIG_SYSCTL
2549 
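/*
 * net.ipv6.route.flush is write-only: writing to it triggers a fib6
 * garbage-collection pass, reading it returns -EINVAL.
 */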
2550 static
2551 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2552 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2553 {
2554 	struct net *net = current->nsproxy->net_ns;
2555 	int delay = net->ipv6.sysctl.flush_delay;
2556 	if (write) {
2557 		proc_dointvec(ctl, write, buffer, lenp, ppos);
2558 		fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2559 		return 0;
2560 	} else
2561 		return -EINVAL;
2562 }
2563 
2564 ctl_table ipv6_route_table_template[] = {
2565 	{
2566 		.procname	=	"flush",
2567 		.data		=	&init_net.ipv6.sysctl.flush_delay,
2568 		.maxlen		=	sizeof(int),
2569 		.mode		=	0200,
2570 		.proc_handler	=	ipv6_sysctl_rtcache_flush
2571 	},
2572 	{
2573 		.procname	=	"gc_thresh",
2574 		.data		=	&ip6_dst_ops_template.gc_thresh,
2575 		.maxlen		=	sizeof(int),
2576 		.mode		=	0644,
2577 		.proc_handler	=	proc_dointvec,
2578 	},
2579 	{
2580 		.procname	=	"max_size",
2581 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
2582 		.maxlen		=	sizeof(int),
2583 		.mode		=	0644,
2584 		.proc_handler	=	proc_dointvec,
2585 	},
2586 	{
2587 		.procname	=	"gc_min_interval",
2588 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2589 		.maxlen		=	sizeof(int),
2590 		.mode		=	0644,
2591 		.proc_handler	=	proc_dointvec_jiffies,
2592 	},
2593 	{
2594 		.procname	=	"gc_timeout",
2595 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2596 		.maxlen		=	sizeof(int),
2597 		.mode		=	0644,
2598 		.proc_handler	=	proc_dointvec_jiffies,
2599 	},
2600 	{
2601 		.procname	=	"gc_interval",
2602 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
2603 		.maxlen		=	sizeof(int),
2604 		.mode		=	0644,
2605 		.proc_handler	=	proc_dointvec_jiffies,
2606 	},
2607 	{
2608 		.procname	=	"gc_elasticity",
2609 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2610 		.maxlen		=	sizeof(int),
2611 		.mode		=	0644,
2612 		.proc_handler	=	proc_dointvec,
2613 	},
2614 	{
2615 		.procname	=	"mtu_expires",
2616 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2617 		.maxlen		=	sizeof(int),
2618 		.mode		=	0644,
2619 		.proc_handler	=	proc_dointvec_jiffies,
2620 	},
2621 	{
2622 		.procname	=	"min_adv_mss",
2623 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
2624 		.maxlen		=	sizeof(int),
2625 		.mode		=	0644,
2626 		.proc_handler	=	proc_dointvec,
2627 	},
2628 	{
2629 		.procname	=	"gc_min_interval_ms",
2630 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2631 		.maxlen		=	sizeof(int),
2632 		.mode		=	0644,
2633 		.proc_handler	=	proc_dointvec_ms_jiffies,
2634 	},
2635 	{ }
2636 };
2637 
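/*
 * Clone the sysctl template for a namespace and repoint each .data field
 * at the per-namespace value; the indices below must match the order of
 * ipv6_route_table_template[].
 */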
2638 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2639 {
2640 	struct ctl_table *table;
2641 
2642 	table = kmemdup(ipv6_route_table_template,
2643 			sizeof(ipv6_route_table_template),
2644 			GFP_KERNEL);
2645 
2646 	if (table) {
2647 		table[0].data = &net->ipv6.sysctl.flush_delay;
2648 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2649 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2650 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2651 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2652 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2653 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2654 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2655 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2656 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2657 	}
2658 
2659 	return table;
2660 }
2661 #endif
2662 
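/*
 * Per-namespace setup: clone dst_ops and the special route templates,
 * pick the default sysctl values and create the /proc entries.
 */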
2663 static int __net_init ip6_route_net_init(struct net *net)
2664 {
2665 	int ret = -ENOMEM;
2666 
2667 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2668 	       sizeof(net->ipv6.ip6_dst_ops));
2669 
2670 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2671 		goto out_ip6_dst_ops;
2672 
2673 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2674 					   sizeof(*net->ipv6.ip6_null_entry),
2675 					   GFP_KERNEL);
2676 	if (!net->ipv6.ip6_null_entry)
2677 		goto out_ip6_dst_entries;
2678 	net->ipv6.ip6_null_entry->dst.path =
2679 		(struct dst_entry *)net->ipv6.ip6_null_entry;
2680 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2681 
2682 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2683 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2684 					       sizeof(*net->ipv6.ip6_prohibit_entry),
2685 					       GFP_KERNEL);
2686 	if (!net->ipv6.ip6_prohibit_entry)
2687 		goto out_ip6_null_entry;
2688 	net->ipv6.ip6_prohibit_entry->dst.path =
2689 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2690 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2691 
2692 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2693 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
2694 					       GFP_KERNEL);
2695 	if (!net->ipv6.ip6_blk_hole_entry)
2696 		goto out_ip6_prohibit_entry;
2697 	net->ipv6.ip6_blk_hole_entry->dst.path =
2698 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2699 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2700 #endif
2701 
2702 	net->ipv6.sysctl.flush_delay = 0;
2703 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
2704 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2705 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2706 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2707 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2708 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2709 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2710 
2711 #ifdef CONFIG_PROC_FS
2712 	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2713 	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2714 #endif
2715 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
2716 
2717 	ret = 0;
2718 out:
2719 	return ret;
2720 
2721 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2722 out_ip6_prohibit_entry:
2723 	kfree(net->ipv6.ip6_prohibit_entry);
2724 out_ip6_null_entry:
2725 	kfree(net->ipv6.ip6_null_entry);
2726 #endif
2727 out_ip6_dst_entries:
2728 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2729 out_ip6_dst_ops:
2730 	goto out;
2731 }
2732 
2733 static void __net_exit ip6_route_net_exit(struct net *net)
2734 {
2735 #ifdef CONFIG_PROC_FS
2736 	proc_net_remove(net, "ipv6_route");
2737 	proc_net_remove(net, "rt6_stats");
2738 #endif
2739 	kfree(net->ipv6.ip6_null_entry);
2740 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2741 	kfree(net->ipv6.ip6_prohibit_entry);
2742 	kfree(net->ipv6.ip6_blk_hole_entry);
2743 #endif
2744 }
2745 
2746 static struct pernet_operations ip6_route_net_ops = {
2747 	.init = ip6_route_net_init,
2748 	.exit = ip6_route_net_exit,
2749 };
2750 
2751 static struct notifier_block ip6_route_dev_notifier = {
2752 	.notifier_call = ip6_route_dev_notify,
2753 	.priority = 0,
2754 };
2755 
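/*
 * Subsystem init: the dst kmem cache and per-namespace state come first,
 * then fib6, xfrm6 and the policy rules, and finally the rtnetlink
 * handlers and the netdevice notifier.  The labels below unwind the
 * sequence in reverse on error.
 */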
2756 int __init ip6_route_init(void)
2757 {
2758 	int ret;
2759 
2760 	ret = -ENOMEM;
2761 	ip6_dst_ops_template.kmem_cachep =
2762 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2763 				  SLAB_HWCACHE_ALIGN, NULL);
2764 	if (!ip6_dst_ops_template.kmem_cachep)
2765 		goto out;
2766 
2767 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
2768 	if (ret)
2769 		goto out_kmem_cache;
2770 
2771 	ret = register_pernet_subsys(&ip6_route_net_ops);
2772 	if (ret)
2773 		goto out_dst_entries;
2774 
2775 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2776 
2777 	/* The loopback device is registered before this code runs, so the
2778 	 * loopback references in the rt6_info templates were not taken;
2779 	 * take them manually for init_net. */
2780 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2781 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2782 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2783 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2784 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2785 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2786 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2787 #endif
2788 	ret = fib6_init();
2789 	if (ret)
2790 		goto out_register_subsys;
2791 
2792 	ret = xfrm6_init();
2793 	if (ret)
2794 		goto out_fib6_init;
2795 
2796 	ret = fib6_rules_init();
2797 	if (ret)
2798 		goto xfrm6_init;
2799 
2800 	ret = -ENOBUFS;
2801 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2802 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2803 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2804 		goto fib6_rules_init;
2805 
2806 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2807 	if (ret)
2808 		goto fib6_rules_init;
2809 
2810 out:
2811 	return ret;
2812 
2813 fib6_rules_init:
2814 	fib6_rules_cleanup();
2815 xfrm6_init:
2816 	xfrm6_fini();
2817 out_fib6_init:
2818 	fib6_gc_cleanup();
2819 out_register_subsys:
2820 	unregister_pernet_subsys(&ip6_route_net_ops);
2821 out_dst_entries:
2822 	dst_entries_destroy(&ip6_dst_blackhole_ops);
2823 out_kmem_cache:
2824 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2825 	goto out;
2826 }
2827 
2828 void ip6_route_cleanup(void)
2829 {
2830 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
2831 	fib6_rules_cleanup();
2832 	xfrm6_fini();
2833 	fib6_gc_cleanup();
2834 	unregister_pernet_subsys(&ip6_route_net_ops);
2835 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2836 }
2837