xref: /linux/net/ipv6/route.c (revision 98f4a2c27c76e7eaf75c2f3f25487fabca62ef3d)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
/*
 * Debug verbosity for this file.  RDBG() and RT6_TRACE() only produce
 * output when RT6_DEBUG is 3 or higher; otherwise they expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
74 
75 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
76 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int	 ip6_default_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void		ip6_dst_destroy(struct dst_entry *);
81 static void		ip6_dst_ifdown(struct dst_entry *,
82 				       struct net_device *dev, int how);
83 static int		 ip6_dst_gc(struct dst_ops *ops);
84 
85 static int		ip6_pkt_discard(struct sk_buff *skb);
86 static int		ip6_pkt_discard_out(struct sk_buff *skb);
87 static void		ip6_link_failure(struct sk_buff *skb);
88 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89 
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Route Information helpers used by rt6_route_rcv() before their definitions. */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   int ifindex);
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   int ifindex,
					   unsigned int pref);
#endif
99 
100 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
101 {
102 	struct rt6_info *rt = (struct rt6_info *) dst;
103 	struct inet_peer *peer;
104 	u32 *p = NULL;
105 
106 	if (!rt->rt6i_peer)
107 		rt6_bind_peer(rt, 1);
108 
109 	peer = rt->rt6i_peer;
110 	if (peer) {
111 		u32 *old_p = __DST_METRICS_PTR(old);
112 		unsigned long prev, new;
113 
114 		p = peer->metrics;
115 		if (inet_metrics_new(peer))
116 			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
117 
118 		new = (unsigned long) p;
119 		prev = cmpxchg(&dst->_metrics, old, new);
120 
121 		if (prev != old) {
122 			p = __DST_METRICS_PTR(prev);
123 			if (prev & DST_METRICS_READ_ONLY)
124 				p = NULL;
125 		}
126 	}
127 	return p;
128 }
129 
130 static struct dst_ops ip6_dst_ops_template = {
131 	.family			=	AF_INET6,
132 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
133 	.gc			=	ip6_dst_gc,
134 	.gc_thresh		=	1024,
135 	.check			=	ip6_dst_check,
136 	.default_advmss		=	ip6_default_advmss,
137 	.default_mtu		=	ip6_default_mtu,
138 	.cow_metrics		=	ipv6_cow_metrics,
139 	.destroy		=	ip6_dst_destroy,
140 	.ifdown			=	ip6_dst_ifdown,
141 	.negative_advice	=	ip6_negative_advice,
142 	.link_failure		=	ip6_link_failure,
143 	.update_pmtu		=	ip6_rt_update_pmtu,
144 	.local_out		=	__ip6_local_out,
145 };
146 
/* default_mtu callback of the blackhole dst_ops: always reports 0. */
static unsigned int
ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
	return 0;
}
151 
152 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
153 {
154 }
155 
156 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
157 					 unsigned long old)
158 {
159 	return NULL;
160 }
161 
162 static struct dst_ops ip6_dst_blackhole_ops = {
163 	.family			=	AF_INET6,
164 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
165 	.destroy		=	ip6_dst_destroy,
166 	.check			=	ip6_dst_check,
167 	.default_mtu		=	ip6_blackhole_default_mtu,
168 	.default_advmss		=	ip6_default_advmss,
169 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
170 	.cow_metrics		=	ip6_rt_blackhole_cow_metrics,
171 };
172 
173 static const u32 ip6_template_metrics[RTAX_MAX] = {
174 	[RTAX_HOPLIMIT - 1] = 255,
175 };
176 
177 static struct rt6_info ip6_null_entry_template = {
178 	.dst = {
179 		.__refcnt	= ATOMIC_INIT(1),
180 		.__use		= 1,
181 		.obsolete	= -1,
182 		.error		= -ENETUNREACH,
183 		.input		= ip6_pkt_discard,
184 		.output		= ip6_pkt_discard_out,
185 	},
186 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
187 	.rt6i_protocol  = RTPROT_KERNEL,
188 	.rt6i_metric	= ~(u32) 0,
189 	.rt6i_ref	= ATOMIC_INIT(1),
190 };
191 
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" entry: a reject route whose dst carries
 * -EACCES and hands packets to the ip6_pkt_prohibit* handlers.
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.rt6i_protocol = RTPROT_KERNEL,
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric = ~(u32) 0,
	.rt6i_ref = ATOMIC_INIT(1),
	.dst = {
		.input = ip6_pkt_prohibit,
		.output = ip6_pkt_prohibit_out,
		.error = -EACCES,
		.obsolete = -1,
		.__use = 1,
		.__refcnt = ATOMIC_INIT(1),
	},
};

/*
 * Template for the "blackhole" entry: a reject route whose dst carries
 * -EINVAL and hands packets to dst_discard in both directions.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.rt6i_protocol = RTPROT_KERNEL,
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric = ~(u32) 0,
	.rt6i_ref = ATOMIC_INIT(1),
	.dst = {
		.input = dst_discard,
		.output = dst_discard,
		.error = -EINVAL,
		.obsolete = -1,
		.__use = 1,
		.__refcnt = ATOMIC_INIT(1),
	},
};

#endif
228 
229 /* allocate dst with ip6_dst_ops */
230 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
231 					     struct net_device *dev,
232 					     int flags)
233 {
234 	struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
235 
236 	memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
237 
238 	return rt;
239 }
240 
241 static void ip6_dst_destroy(struct dst_entry *dst)
242 {
243 	struct rt6_info *rt = (struct rt6_info *)dst;
244 	struct inet6_dev *idev = rt->rt6i_idev;
245 	struct inet_peer *peer = rt->rt6i_peer;
246 
247 	if (idev != NULL) {
248 		rt->rt6i_idev = NULL;
249 		in6_dev_put(idev);
250 	}
251 	if (peer) {
252 		rt->rt6i_peer = NULL;
253 		inet_putpeer(peer);
254 	}
255 }
256 
257 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
258 
259 static u32 rt6_peer_genid(void)
260 {
261 	return atomic_read(&__rt6_peer_genid);
262 }
263 
264 void rt6_bind_peer(struct rt6_info *rt, int create)
265 {
266 	struct inet_peer *peer;
267 
268 	peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
269 	if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
270 		inet_putpeer(peer);
271 	else
272 		rt->rt6i_peer_genid = rt6_peer_genid();
273 }
274 
275 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
276 			   int how)
277 {
278 	struct rt6_info *rt = (struct rt6_info *)dst;
279 	struct inet6_dev *idev = rt->rt6i_idev;
280 	struct net_device *loopback_dev =
281 		dev_net(dev)->loopback_dev;
282 
283 	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
284 		struct inet6_dev *loopback_idev =
285 			in6_dev_get(loopback_dev);
286 		if (loopback_idev != NULL) {
287 			rt->rt6i_idev = loopback_idev;
288 			in6_dev_put(idev);
289 		}
290 	}
291 }
292 
293 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
294 {
295 	return (rt->rt6i_flags & RTF_EXPIRES) &&
296 		time_after(jiffies, rt->rt6i_expires);
297 }
298 
299 static inline int rt6_need_strict(const struct in6_addr *daddr)
300 {
301 	return ipv6_addr_type(daddr) &
302 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
303 }
304 
305 /*
306  *	Route lookup. Any table->tb6_lock is implied.
307  */
308 
309 static inline struct rt6_info *rt6_device_match(struct net *net,
310 						    struct rt6_info *rt,
311 						    const struct in6_addr *saddr,
312 						    int oif,
313 						    int flags)
314 {
315 	struct rt6_info *local = NULL;
316 	struct rt6_info *sprt;
317 
318 	if (!oif && ipv6_addr_any(saddr))
319 		goto out;
320 
321 	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
322 		struct net_device *dev = sprt->rt6i_dev;
323 
324 		if (oif) {
325 			if (dev->ifindex == oif)
326 				return sprt;
327 			if (dev->flags & IFF_LOOPBACK) {
328 				if (sprt->rt6i_idev == NULL ||
329 				    sprt->rt6i_idev->dev->ifindex != oif) {
330 					if (flags & RT6_LOOKUP_F_IFACE && oif)
331 						continue;
332 					if (local && (!oif ||
333 						      local->rt6i_idev->dev->ifindex == oif))
334 						continue;
335 				}
336 				local = sprt;
337 			}
338 		} else {
339 			if (ipv6_chk_addr(net, saddr, dev,
340 					  flags & RT6_LOOKUP_F_IFACE))
341 				return sprt;
342 		}
343 	}
344 
345 	if (oif) {
346 		if (local)
347 			return local;
348 
349 		if (flags & RT6_LOOKUP_F_IFACE)
350 			return net->ipv6.ip6_null_entry;
351 	}
352 out:
353 	return rt;
354 }
355 
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the route's next-hop neighbour is not
 * in a NUD_VALID state and the last update is older than the device's
 * rtr_probe_interval, send a neighbour solicitation to the neighbour's
 * solicited-node multicast address.  The neighbour's "updated" stamp is
 * refreshed first, which rate-limits the probes.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int probe_due;

	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	read_lock_bh(&neigh->lock);
	probe_due = !(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (probe_due)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	if (!probe_due)
		return;

	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
390 
391 /*
392  * Default Router Selection (RFC 2461 6.3.6)
393  */
394 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
395 {
396 	struct net_device *dev = rt->rt6i_dev;
397 	if (!oif || dev->ifindex == oif)
398 		return 2;
399 	if ((dev->flags & IFF_LOOPBACK) &&
400 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
401 		return 1;
402 	return 0;
403 }
404 
405 static inline int rt6_check_neigh(struct rt6_info *rt)
406 {
407 	struct neighbour *neigh = rt->rt6i_nexthop;
408 	int m;
409 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
410 	    !(rt->rt6i_flags & RTF_GATEWAY))
411 		m = 1;
412 	else if (neigh) {
413 		read_lock_bh(&neigh->lock);
414 		if (neigh->nud_state & NUD_VALID)
415 			m = 2;
416 #ifdef CONFIG_IPV6_ROUTER_PREF
417 		else if (neigh->nud_state & NUD_FAILED)
418 			m = 0;
419 #endif
420 		else
421 			m = 1;
422 		read_unlock_bh(&neigh->lock);
423 	} else
424 		m = 0;
425 	return m;
426 }
427 
428 static int rt6_score_route(struct rt6_info *rt, int oif,
429 			   int strict)
430 {
431 	int m, n;
432 
433 	m = rt6_check_dev(rt, oif);
434 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
435 		return -1;
436 #ifdef CONFIG_IPV6_ROUTER_PREF
437 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
438 #endif
439 	n = rt6_check_neigh(rt);
440 	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
441 		return -1;
442 	return m;
443 }
444 
445 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
446 				   int *mpri, struct rt6_info *match)
447 {
448 	int m;
449 
450 	if (rt6_check_expired(rt))
451 		goto out;
452 
453 	m = rt6_score_route(rt, oif, strict);
454 	if (m < 0)
455 		goto out;
456 
457 	if (m > *mpri) {
458 		if (strict & RT6_LOOKUP_F_REACHABLE)
459 			rt6_probe(match);
460 		*mpri = m;
461 		match = rt;
462 	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
463 		rt6_probe(rt);
464 	}
465 
466 out:
467 	return match;
468 }
469 
470 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
471 				     struct rt6_info *rr_head,
472 				     u32 metric, int oif, int strict)
473 {
474 	struct rt6_info *rt, *match;
475 	int mpri = -1;
476 
477 	match = NULL;
478 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
479 	     rt = rt->dst.rt6_next)
480 		match = find_match(rt, oif, strict, &mpri, match);
481 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
482 	     rt = rt->dst.rt6_next)
483 		match = find_match(rt, oif, strict, &mpri, match);
484 
485 	return match;
486 }
487 
488 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
489 {
490 	struct rt6_info *match, *rt0;
491 	struct net *net;
492 
493 	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
494 		  __func__, fn->leaf, oif);
495 
496 	rt0 = fn->rr_ptr;
497 	if (!rt0)
498 		fn->rr_ptr = rt0 = fn->leaf;
499 
500 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
501 
502 	if (!match &&
503 	    (strict & RT6_LOOKUP_F_REACHABLE)) {
504 		struct rt6_info *next = rt0->dst.rt6_next;
505 
506 		/* no entries matched; do round-robin */
507 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
508 			next = fn->leaf;
509 
510 		if (next != rt0)
511 			fn->rr_ptr = next;
512 	}
513 
514 	RT6_TRACE("%s() => %p\n",
515 		  __func__, match);
516 
517 	net = dev_net(rt0->rt6i_dev);
518 	return match ? match : net->ipv6.ip6_null_entry;
519 }
520 
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received on @dev from @gwaddr.
 *
 * @opt/@len describe the raw option.  After validating the option
 * length, prefix length and preference, the matching route is
 *   - deleted when it exists and the advertised lifetime is zero,
 *   - created when it does not exist and the lifetime is non-zero,
 *   - otherwise updated with RTF_ROUTEINFO and the new preference.
 * The resulting route's expiry is then set from the lifetime (or
 * cleared for an infinite lifetime) and the reference is dropped.
 *
 * Returns 0 on success, -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	else if (lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
594 
/*
 * BACKTRACK(__net, saddr) - climb the fib6 tree after a failed match.
 *
 * Must be expanded inside a function that has the local variables
 * "rt" (current result) and "fn" (current node) and the labels "out"
 * and "restart".
 *
 * It only acts when rt is the namespace's null entry.  It then walks
 * upwards from fn:
 *   - at a node flagged RTN_TL_ROOT it gives up and jumps to "out";
 *   - if the parent has a subtree that is not fn itself, it looks up
 *     saddr in that subtree, otherwise it moves to the parent;
 *   - as soon as the node reached is flagged RTN_RTINFO it jumps to
 *     "restart" so the caller retries its selection on that node.
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == __net->ipv6.ip6_null_entry) {	\
		struct fib6_node *pn; \
		while (1) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while(0)
612 
613 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
614 					     struct fib6_table *table,
615 					     struct flowi6 *fl6, int flags)
616 {
617 	struct fib6_node *fn;
618 	struct rt6_info *rt;
619 
620 	read_lock_bh(&table->tb6_lock);
621 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
622 restart:
623 	rt = fn->leaf;
624 	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
625 	BACKTRACK(net, &fl6->saddr);
626 out:
627 	dst_use(&rt->dst, jiffies);
628 	read_unlock_bh(&table->tb6_lock);
629 	return rt;
630 
631 }
632 
633 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
634 			    const struct in6_addr *saddr, int oif, int strict)
635 {
636 	struct flowi6 fl6 = {
637 		.flowi6_oif = oif,
638 		.daddr = *daddr,
639 	};
640 	struct dst_entry *dst;
641 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
642 
643 	if (saddr) {
644 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
645 		flags |= RT6_LOOKUP_F_HAS_SADDR;
646 	}
647 
648 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
649 	if (dst->error == 0)
650 		return (struct rt6_info *) dst;
651 
652 	dst_release(dst);
653 
654 	return NULL;
655 }
656 
657 EXPORT_SYMBOL(rt6_lookup);
658 
/* ip6_ins_rt is called with table->tb6_lock NOT held; the lock is taken
   internally. It takes a new route entry; if the addition fails for any
   reason, the route is freed. In either case, if the caller does not hold
   a reference to the route, it may be destroyed.
 */
664 
665 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
666 {
667 	int err;
668 	struct fib6_table *table;
669 
670 	table = rt->rt6i_table;
671 	write_lock_bh(&table->tb6_lock);
672 	err = fib6_add(&table->tb6_root, rt, info);
673 	write_unlock_bh(&table->tb6_lock);
674 
675 	return err;
676 }
677 
678 int ip6_ins_rt(struct rt6_info *rt)
679 {
680 	struct nl_info info = {
681 		.nl_net = dev_net(rt->rt6i_dev),
682 	};
683 	return __ip6_ins_rt(rt, &info);
684 }
685 
686 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, const struct in6_addr *daddr,
687 				      const struct in6_addr *saddr)
688 {
689 	struct rt6_info *rt;
690 
691 	/*
692 	 *	Clone the route.
693 	 */
694 
695 	rt = ip6_rt_copy(ort);
696 
697 	if (rt) {
698 		struct neighbour *neigh;
699 		int attempts = !in_softirq();
700 
701 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
702 			if (rt->rt6i_dst.plen != 128 &&
703 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
704 				rt->rt6i_flags |= RTF_ANYCAST;
705 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
706 		}
707 
708 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
709 		rt->rt6i_dst.plen = 128;
710 		rt->rt6i_flags |= RTF_CACHE;
711 		rt->dst.flags |= DST_HOST;
712 
713 #ifdef CONFIG_IPV6_SUBTREES
714 		if (rt->rt6i_src.plen && saddr) {
715 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
716 			rt->rt6i_src.plen = 128;
717 		}
718 #endif
719 
720 	retry:
721 		neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
722 		if (IS_ERR(neigh)) {
723 			struct net *net = dev_net(rt->rt6i_dev);
724 			int saved_rt_min_interval =
725 				net->ipv6.sysctl.ip6_rt_gc_min_interval;
726 			int saved_rt_elasticity =
727 				net->ipv6.sysctl.ip6_rt_gc_elasticity;
728 
729 			if (attempts-- > 0) {
730 				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
731 				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
732 
733 				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
734 
735 				net->ipv6.sysctl.ip6_rt_gc_elasticity =
736 					saved_rt_elasticity;
737 				net->ipv6.sysctl.ip6_rt_gc_min_interval =
738 					saved_rt_min_interval;
739 				goto retry;
740 			}
741 
742 			if (net_ratelimit())
743 				printk(KERN_WARNING
744 				       "ipv6: Neighbour table overflow.\n");
745 			dst_free(&rt->dst);
746 			return NULL;
747 		}
748 		rt->rt6i_nexthop = neigh;
749 
750 	}
751 
752 	return rt;
753 }
754 
755 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, const struct in6_addr *daddr)
756 {
757 	struct rt6_info *rt = ip6_rt_copy(ort);
758 	if (rt) {
759 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
760 		rt->rt6i_dst.plen = 128;
761 		rt->rt6i_flags |= RTF_CACHE;
762 		rt->dst.flags |= DST_HOST;
763 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
764 	}
765 	return rt;
766 }
767 
/*
 * Core route resolution shared by ip6_pol_route_input() and
 * ip6_pol_route_output().
 *
 * @net:   network namespace
 * @table: FIB table to search
 * @oif:   interface index to match (the callers pass the flow's iif or oif)
 * @fl6:   flow being routed
 * @flags: RT6_LOOKUP_F_* flags; only RT6_LOOKUP_F_IFACE is used here
 *
 * Selects a route with rt6_select().  When the result is neither the
 * null entry nor already a cache entry (RTF_CACHE), a per-destination
 * copy is created (rt6_alloc_cow() or rt6_alloc_clone()) and inserted
 * into the table; if that insertion fails the lookup is redone, at most
 * three attempts in total.
 *
 * Every exit path takes a reference on the returned route and updates
 * its lastuse/__use counters.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
				      struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt, *nrt;
	int strict = 0;
	int attempts = 3;
	int err;
	/* Ask for a reachable router on the first pass unless forwarding is on. */
	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;

	strict |= flags & RT6_LOOKUP_F_IFACE;

relookup:
	read_lock_bh(&table->tb6_lock);

restart_2:
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);

restart:
	rt = rt6_select(fn, oif, strict | reachable);

	/* May jump to "restart" (with fn moved up) or to "out". */
	BACKTRACK(net, &fl6->saddr);
	if (rt == net->ipv6.ip6_null_entry ||
	    rt->rt6i_flags & RTF_CACHE)
		goto out;

	/* Pin the route so it survives dropping the table lock. */
	dst_hold(&rt->dst);
	read_unlock_bh(&table->tb6_lock);

	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
		nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
	else if (!(rt->dst.flags & DST_HOST))
		nrt = rt6_alloc_clone(rt, &fl6->daddr);
	else
		goto out2;	/* host route with a next hop: use it as is */

	dst_release(&rt->dst);
	/* Fall back to the null entry when the copy could not be allocated. */
	rt = nrt ? : net->ipv6.ip6_null_entry;

	dst_hold(&rt->dst);
	if (nrt) {
		err = ip6_ins_rt(nrt);
		if (!err)
			goto out2;
	}

	if (--attempts <= 0)
		goto out2;

	/*
	 * Race condition! In the gap, when table->tb6_lock was
	 * released someone could insert this route.  Relookup.
	 */
	dst_release(&rt->dst);
	goto relookup;

out:
	/* Retry once without the reachability requirement before settling. */
	if (reachable) {
		reachable = 0;
		goto restart_2;
	}
	dst_hold(&rt->dst);
	read_unlock_bh(&table->tb6_lock);
out2:
	rt->dst.lastuse = jiffies;
	rt->dst.__use++;

	return rt;
}
837 
838 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
839 					    struct flowi6 *fl6, int flags)
840 {
841 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
842 }
843 
844 void ip6_route_input(struct sk_buff *skb)
845 {
846 	const struct ipv6hdr *iph = ipv6_hdr(skb);
847 	struct net *net = dev_net(skb->dev);
848 	int flags = RT6_LOOKUP_F_HAS_SADDR;
849 	struct flowi6 fl6 = {
850 		.flowi6_iif = skb->dev->ifindex,
851 		.daddr = iph->daddr,
852 		.saddr = iph->saddr,
853 		.flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
854 		.flowi6_mark = skb->mark,
855 		.flowi6_proto = iph->nexthdr,
856 	};
857 
858 	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
859 		flags |= RT6_LOOKUP_F_IFACE;
860 
861 	skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
862 }
863 
864 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
865 					     struct flowi6 *fl6, int flags)
866 {
867 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
868 }
869 
870 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
871 				    struct flowi6 *fl6)
872 {
873 	int flags = 0;
874 
875 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
876 		flags |= RT6_LOOKUP_F_IFACE;
877 
878 	if (!ipv6_addr_any(&fl6->saddr))
879 		flags |= RT6_LOOKUP_F_HAS_SADDR;
880 	else if (sk)
881 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
882 
883 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
884 }
885 
886 EXPORT_SYMBOL(ip6_route_output);
887 
888 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
889 {
890 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
891 	struct dst_entry *new = NULL;
892 
893 	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
894 	if (rt) {
895 		memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
896 
897 		new = &rt->dst;
898 
899 		new->__use = 1;
900 		new->input = dst_discard;
901 		new->output = dst_discard;
902 
903 		dst_copy_metrics(new, &ort->dst);
904 		rt->rt6i_idev = ort->rt6i_idev;
905 		if (rt->rt6i_idev)
906 			in6_dev_hold(rt->rt6i_idev);
907 		rt->rt6i_expires = 0;
908 
909 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
910 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
911 		rt->rt6i_metric = 0;
912 
913 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
914 #ifdef CONFIG_IPV6_SUBTREES
915 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
916 #endif
917 
918 		dst_free(new);
919 	}
920 
921 	dst_release(dst_orig);
922 	return new ? new : ERR_PTR(-ENOMEM);
923 }
924 
925 /*
926  *	Destination cache support functions
927  */
928 
929 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
930 {
931 	struct rt6_info *rt;
932 
933 	rt = (struct rt6_info *) dst;
934 
935 	if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
936 		if (rt->rt6i_peer_genid != rt6_peer_genid()) {
937 			if (!rt->rt6i_peer)
938 				rt6_bind_peer(rt, 0);
939 			rt->rt6i_peer_genid = rt6_peer_genid();
940 		}
941 		return dst;
942 	}
943 	return NULL;
944 }
945 
946 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
947 {
948 	struct rt6_info *rt = (struct rt6_info *) dst;
949 
950 	if (rt) {
951 		if (rt->rt6i_flags & RTF_CACHE) {
952 			if (rt6_check_expired(rt)) {
953 				ip6_del_rt(rt);
954 				dst = NULL;
955 			}
956 		} else {
957 			dst_release(dst);
958 			dst = NULL;
959 		}
960 	}
961 	return dst;
962 }
963 
964 static void ip6_link_failure(struct sk_buff *skb)
965 {
966 	struct rt6_info *rt;
967 
968 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
969 
970 	rt = (struct rt6_info *) skb_dst(skb);
971 	if (rt) {
972 		if (rt->rt6i_flags&RTF_CACHE) {
973 			dst_set_expires(&rt->dst, 0);
974 			rt->rt6i_flags |= RTF_EXPIRES;
975 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
976 			rt->rt6i_node->fn_sernum = -1;
977 	}
978 }
979 
980 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
981 {
982 	struct rt6_info *rt6 = (struct rt6_info*)dst;
983 
984 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
985 		rt6->rt6i_flags |= RTF_MODIFIED;
986 		if (mtu < IPV6_MIN_MTU) {
987 			u32 features = dst_metric(dst, RTAX_FEATURES);
988 			mtu = IPV6_MIN_MTU;
989 			features |= RTAX_FEATURE_ALLFRAG;
990 			dst_metric_set(dst, RTAX_FEATURES, features);
991 		}
992 		dst_metric_set(dst, RTAX_MTU, mtu);
993 	}
994 }
995 
996 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
997 {
998 	struct net_device *dev = dst->dev;
999 	unsigned int mtu = dst_mtu(dst);
1000 	struct net *net = dev_net(dev);
1001 
1002 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1003 
1004 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1005 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1006 
1007 	/*
1008 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1009 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1010 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
1011 	 * rely only on pmtu discovery"
1012 	 */
1013 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1014 		mtu = IPV6_MAXPLEN;
1015 	return mtu;
1016 }
1017 
1018 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1019 {
1020 	unsigned int mtu = IPV6_MIN_MTU;
1021 	struct inet6_dev *idev;
1022 
1023 	rcu_read_lock();
1024 	idev = __in6_dev_get(dst->dev);
1025 	if (idev)
1026 		mtu = idev->cnf.mtu6;
1027 	rcu_read_unlock();
1028 
1029 	return mtu;
1030 }
1031 
/* Guards every access to icmp6_dst_gc_list. */
static DEFINE_SPINLOCK(icmp6_dst_lock);
/* List (linked through dst->next) of the dsts made by icmp6_dst_alloc(). */
static struct dst_entry *icmp6_dst_gc_list;
1034 
1035 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1036 				  struct neighbour *neigh,
1037 				  const struct in6_addr *addr)
1038 {
1039 	struct rt6_info *rt;
1040 	struct inet6_dev *idev = in6_dev_get(dev);
1041 	struct net *net = dev_net(dev);
1042 
1043 	if (unlikely(idev == NULL))
1044 		return NULL;
1045 
1046 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1047 	if (unlikely(rt == NULL)) {
1048 		in6_dev_put(idev);
1049 		goto out;
1050 	}
1051 
1052 	if (neigh)
1053 		neigh_hold(neigh);
1054 	else {
1055 		neigh = ndisc_get_neigh(dev, addr);
1056 		if (IS_ERR(neigh))
1057 			neigh = NULL;
1058 	}
1059 
1060 	rt->rt6i_idev     = idev;
1061 	rt->rt6i_nexthop  = neigh;
1062 	atomic_set(&rt->dst.__refcnt, 1);
1063 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1064 	rt->dst.output  = ip6_output;
1065 
1066 	spin_lock_bh(&icmp6_dst_lock);
1067 	rt->dst.next = icmp6_dst_gc_list;
1068 	icmp6_dst_gc_list = &rt->dst;
1069 	spin_unlock_bh(&icmp6_dst_lock);
1070 
1071 	fib6_force_start_gc(net);
1072 
1073 out:
1074 	return &rt->dst;
1075 }
1076 
1077 int icmp6_dst_gc(void)
1078 {
1079 	struct dst_entry *dst, **pprev;
1080 	int more = 0;
1081 
1082 	spin_lock_bh(&icmp6_dst_lock);
1083 	pprev = &icmp6_dst_gc_list;
1084 
1085 	while ((dst = *pprev) != NULL) {
1086 		if (!atomic_read(&dst->__refcnt)) {
1087 			*pprev = dst->next;
1088 			dst_free(dst);
1089 		} else {
1090 			pprev = &dst->next;
1091 			++more;
1092 		}
1093 	}
1094 
1095 	spin_unlock_bh(&icmp6_dst_lock);
1096 
1097 	return more;
1098 }
1099 
1100 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1101 			    void *arg)
1102 {
1103 	struct dst_entry *dst, **pprev;
1104 
1105 	spin_lock_bh(&icmp6_dst_lock);
1106 	pprev = &icmp6_dst_gc_list;
1107 	while ((dst = *pprev) != NULL) {
1108 		struct rt6_info *rt = (struct rt6_info *) dst;
1109 		if (func(rt, arg)) {
1110 			*pprev = dst->next;
1111 			dst_free(dst);
1112 		} else {
1113 			pprev = &dst->next;
1114 		}
1115 	}
1116 	spin_unlock_bh(&icmp6_dst_lock);
1117 }
1118 
1119 static int ip6_dst_gc(struct dst_ops *ops)
1120 {
1121 	unsigned long now = jiffies;
1122 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1123 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1124 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1125 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1126 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1127 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1128 	int entries;
1129 
1130 	entries = dst_entries_get_fast(ops);
1131 	if (time_after(rt_last_gc + rt_min_interval, now) &&
1132 	    entries <= rt_max_size)
1133 		goto out;
1134 
1135 	net->ipv6.ip6_rt_gc_expire++;
1136 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1137 	net->ipv6.ip6_rt_last_gc = now;
1138 	entries = dst_entries_get_slow(ops);
1139 	if (entries < ops->gc_thresh)
1140 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1141 out:
1142 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1143 	return entries > rt_max_size;
1144 }
1145 
/* Clean the host part of a prefix. This is not necessary in the radix
   tree, but it results in cleaner routing tables.

   Remove it only once everything else works!
 */
1151 
1152 int ip6_dst_hoplimit(struct dst_entry *dst)
1153 {
1154 	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1155 	if (hoplimit == 0) {
1156 		struct net_device *dev = dst->dev;
1157 		struct inet6_dev *idev;
1158 
1159 		rcu_read_lock();
1160 		idev = __in6_dev_get(dev);
1161 		if (idev)
1162 			hoplimit = idev->cnf.hop_limit;
1163 		else
1164 			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1165 		rcu_read_unlock();
1166 	}
1167 	return hoplimit;
1168 }
1169 EXPORT_SYMBOL(ip6_dst_hoplimit);
1170 
1171 /*
1172  *
1173  */
1174 
1175 int ip6_route_add(struct fib6_config *cfg)
1176 {
1177 	int err;
1178 	struct net *net = cfg->fc_nlinfo.nl_net;
1179 	struct rt6_info *rt = NULL;
1180 	struct net_device *dev = NULL;
1181 	struct inet6_dev *idev = NULL;
1182 	struct fib6_table *table;
1183 	int addr_type;
1184 
1185 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1186 		return -EINVAL;
1187 #ifndef CONFIG_IPV6_SUBTREES
1188 	if (cfg->fc_src_len)
1189 		return -EINVAL;
1190 #endif
1191 	if (cfg->fc_ifindex) {
1192 		err = -ENODEV;
1193 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1194 		if (!dev)
1195 			goto out;
1196 		idev = in6_dev_get(dev);
1197 		if (!idev)
1198 			goto out;
1199 	}
1200 
1201 	if (cfg->fc_metric == 0)
1202 		cfg->fc_metric = IP6_RT_PRIO_USER;
1203 
1204 	table = fib6_new_table(net, cfg->fc_table);
1205 	if (table == NULL) {
1206 		err = -ENOBUFS;
1207 		goto out;
1208 	}
1209 
1210 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1211 
1212 	if (rt == NULL) {
1213 		err = -ENOMEM;
1214 		goto out;
1215 	}
1216 
1217 	rt->dst.obsolete = -1;
1218 	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1219 				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1220 				0;
1221 
1222 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1223 		cfg->fc_protocol = RTPROT_BOOT;
1224 	rt->rt6i_protocol = cfg->fc_protocol;
1225 
1226 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1227 
1228 	if (addr_type & IPV6_ADDR_MULTICAST)
1229 		rt->dst.input = ip6_mc_input;
1230 	else if (cfg->fc_flags & RTF_LOCAL)
1231 		rt->dst.input = ip6_input;
1232 	else
1233 		rt->dst.input = ip6_forward;
1234 
1235 	rt->dst.output = ip6_output;
1236 
1237 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1238 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1239 	if (rt->rt6i_dst.plen == 128)
1240 	       rt->dst.flags |= DST_HOST;
1241 
1242 #ifdef CONFIG_IPV6_SUBTREES
1243 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1244 	rt->rt6i_src.plen = cfg->fc_src_len;
1245 #endif
1246 
1247 	rt->rt6i_metric = cfg->fc_metric;
1248 
1249 	/* We cannot add true routes via loopback here,
1250 	   they would result in kernel looping; promote them to reject routes
1251 	 */
1252 	if ((cfg->fc_flags & RTF_REJECT) ||
1253 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1254 					      && !(cfg->fc_flags&RTF_LOCAL))) {
1255 		/* hold loopback dev/idev if we haven't done so. */
1256 		if (dev != net->loopback_dev) {
1257 			if (dev) {
1258 				dev_put(dev);
1259 				in6_dev_put(idev);
1260 			}
1261 			dev = net->loopback_dev;
1262 			dev_hold(dev);
1263 			idev = in6_dev_get(dev);
1264 			if (!idev) {
1265 				err = -ENODEV;
1266 				goto out;
1267 			}
1268 		}
1269 		rt->dst.output = ip6_pkt_discard_out;
1270 		rt->dst.input = ip6_pkt_discard;
1271 		rt->dst.error = -ENETUNREACH;
1272 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1273 		goto install_route;
1274 	}
1275 
1276 	if (cfg->fc_flags & RTF_GATEWAY) {
1277 		const struct in6_addr *gw_addr;
1278 		int gwa_type;
1279 
1280 		gw_addr = &cfg->fc_gateway;
1281 		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1282 		gwa_type = ipv6_addr_type(gw_addr);
1283 
1284 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1285 			struct rt6_info *grt;
1286 
1287 			/* IPv6 strictly inhibits using not link-local
1288 			   addresses as nexthop address.
1289 			   Otherwise, router will not able to send redirects.
1290 			   It is very good, but in some (rare!) circumstances
1291 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1292 			   some exceptions. --ANK
1293 			 */
1294 			err = -EINVAL;
1295 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1296 				goto out;
1297 
1298 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1299 
1300 			err = -EHOSTUNREACH;
1301 			if (grt == NULL)
1302 				goto out;
1303 			if (dev) {
1304 				if (dev != grt->rt6i_dev) {
1305 					dst_release(&grt->dst);
1306 					goto out;
1307 				}
1308 			} else {
1309 				dev = grt->rt6i_dev;
1310 				idev = grt->rt6i_idev;
1311 				dev_hold(dev);
1312 				in6_dev_hold(grt->rt6i_idev);
1313 			}
1314 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1315 				err = 0;
1316 			dst_release(&grt->dst);
1317 
1318 			if (err)
1319 				goto out;
1320 		}
1321 		err = -EINVAL;
1322 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1323 			goto out;
1324 	}
1325 
1326 	err = -ENODEV;
1327 	if (dev == NULL)
1328 		goto out;
1329 
1330 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1331 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1332 			err = -EINVAL;
1333 			goto out;
1334 		}
1335 		ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1336 		rt->rt6i_prefsrc.plen = 128;
1337 	} else
1338 		rt->rt6i_prefsrc.plen = 0;
1339 
1340 	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1341 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1342 		if (IS_ERR(rt->rt6i_nexthop)) {
1343 			err = PTR_ERR(rt->rt6i_nexthop);
1344 			rt->rt6i_nexthop = NULL;
1345 			goto out;
1346 		}
1347 	}
1348 
1349 	rt->rt6i_flags = cfg->fc_flags;
1350 
1351 install_route:
1352 	if (cfg->fc_mx) {
1353 		struct nlattr *nla;
1354 		int remaining;
1355 
1356 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1357 			int type = nla_type(nla);
1358 
1359 			if (type) {
1360 				if (type > RTAX_MAX) {
1361 					err = -EINVAL;
1362 					goto out;
1363 				}
1364 
1365 				dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1366 			}
1367 		}
1368 	}
1369 
1370 	rt->dst.dev = dev;
1371 	rt->rt6i_idev = idev;
1372 	rt->rt6i_table = table;
1373 
1374 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1375 
1376 	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1377 
1378 out:
1379 	if (dev)
1380 		dev_put(dev);
1381 	if (idev)
1382 		in6_dev_put(idev);
1383 	if (rt)
1384 		dst_free(&rt->dst);
1385 	return err;
1386 }
1387 
1388 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1389 {
1390 	int err;
1391 	struct fib6_table *table;
1392 	struct net *net = dev_net(rt->rt6i_dev);
1393 
1394 	if (rt == net->ipv6.ip6_null_entry)
1395 		return -ENOENT;
1396 
1397 	table = rt->rt6i_table;
1398 	write_lock_bh(&table->tb6_lock);
1399 
1400 	err = fib6_del(rt, info);
1401 	dst_release(&rt->dst);
1402 
1403 	write_unlock_bh(&table->tb6_lock);
1404 
1405 	return err;
1406 }
1407 
1408 int ip6_del_rt(struct rt6_info *rt)
1409 {
1410 	struct nl_info info = {
1411 		.nl_net = dev_net(rt->rt6i_dev),
1412 	};
1413 	return __ip6_del_rt(rt, &info);
1414 }
1415 
1416 static int ip6_route_del(struct fib6_config *cfg)
1417 {
1418 	struct fib6_table *table;
1419 	struct fib6_node *fn;
1420 	struct rt6_info *rt;
1421 	int err = -ESRCH;
1422 
1423 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1424 	if (table == NULL)
1425 		return err;
1426 
1427 	read_lock_bh(&table->tb6_lock);
1428 
1429 	fn = fib6_locate(&table->tb6_root,
1430 			 &cfg->fc_dst, cfg->fc_dst_len,
1431 			 &cfg->fc_src, cfg->fc_src_len);
1432 
1433 	if (fn) {
1434 		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1435 			if (cfg->fc_ifindex &&
1436 			    (rt->rt6i_dev == NULL ||
1437 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1438 				continue;
1439 			if (cfg->fc_flags & RTF_GATEWAY &&
1440 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1441 				continue;
1442 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1443 				continue;
1444 			dst_hold(&rt->dst);
1445 			read_unlock_bh(&table->tb6_lock);
1446 
1447 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1448 		}
1449 	}
1450 	read_unlock_bh(&table->tb6_lock);
1451 
1452 	return err;
1453 }
1454 
1455 /*
1456  *	Handle redirects
1457  */
/*
 * Flow key used for redirect lookups: a regular flowi6 followed by the
 * address of the router that sent the redirect.
 */
struct ip6rd_flowi {
	/* Must stay the first member: __ip6_route_redirect() casts the
	 * flowi6 pointer it receives back to struct ip6rd_flowi.
	 */
	struct flowi6 fl6;
	/* Compared against rt6i_gateway of each candidate route. */
	struct in6_addr gateway;
};
1462 
1463 static struct rt6_info *__ip6_route_redirect(struct net *net,
1464 					     struct fib6_table *table,
1465 					     struct flowi6 *fl6,
1466 					     int flags)
1467 {
1468 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1469 	struct rt6_info *rt;
1470 	struct fib6_node *fn;
1471 
1472 	/*
1473 	 * Get the "current" route for this destination and
1474 	 * check if the redirect has come from approriate router.
1475 	 *
1476 	 * RFC 2461 specifies that redirects should only be
1477 	 * accepted if they come from the nexthop to the target.
1478 	 * Due to the way the routes are chosen, this notion
1479 	 * is a bit fuzzy and one might need to check all possible
1480 	 * routes.
1481 	 */
1482 
1483 	read_lock_bh(&table->tb6_lock);
1484 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1485 restart:
1486 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1487 		/*
1488 		 * Current route is on-link; redirect is always invalid.
1489 		 *
1490 		 * Seems, previous statement is not true. It could
1491 		 * be node, which looks for us as on-link (f.e. proxy ndisc)
1492 		 * But then router serving it might decide, that we should
1493 		 * know truth 8)8) --ANK (980726).
1494 		 */
1495 		if (rt6_check_expired(rt))
1496 			continue;
1497 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1498 			continue;
1499 		if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1500 			continue;
1501 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1502 			continue;
1503 		break;
1504 	}
1505 
1506 	if (!rt)
1507 		rt = net->ipv6.ip6_null_entry;
1508 	BACKTRACK(net, &fl6->saddr);
1509 out:
1510 	dst_hold(&rt->dst);
1511 
1512 	read_unlock_bh(&table->tb6_lock);
1513 
1514 	return rt;
1515 };
1516 
1517 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1518 					   const struct in6_addr *src,
1519 					   const struct in6_addr *gateway,
1520 					   struct net_device *dev)
1521 {
1522 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1523 	struct net *net = dev_net(dev);
1524 	struct ip6rd_flowi rdfl = {
1525 		.fl6 = {
1526 			.flowi6_oif = dev->ifindex,
1527 			.daddr = *dest,
1528 			.saddr = *src,
1529 		},
1530 	};
1531 
1532 	ipv6_addr_copy(&rdfl.gateway, gateway);
1533 
1534 	if (rt6_need_strict(dest))
1535 		flags |= RT6_LOOKUP_F_IFACE;
1536 
1537 	return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1538 						   flags, __ip6_route_redirect);
1539 }
1540 
1541 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1542 		  const struct in6_addr *saddr,
1543 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1544 {
1545 	struct rt6_info *rt, *nrt = NULL;
1546 	struct netevent_redirect netevent;
1547 	struct net *net = dev_net(neigh->dev);
1548 
1549 	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1550 
1551 	if (rt == net->ipv6.ip6_null_entry) {
1552 		if (net_ratelimit())
1553 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1554 			       "for redirect target\n");
1555 		goto out;
1556 	}
1557 
1558 	/*
1559 	 *	We have finally decided to accept it.
1560 	 */
1561 
1562 	neigh_update(neigh, lladdr, NUD_STALE,
1563 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1564 		     NEIGH_UPDATE_F_OVERRIDE|
1565 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1566 				     NEIGH_UPDATE_F_ISROUTER))
1567 		     );
1568 
1569 	/*
1570 	 * Redirect received -> path was valid.
1571 	 * Look, redirects are sent only in response to data packets,
1572 	 * so that this nexthop apparently is reachable. --ANK
1573 	 */
1574 	dst_confirm(&rt->dst);
1575 
1576 	/* Duplicate redirect: silently ignore. */
1577 	if (neigh == rt->dst.neighbour)
1578 		goto out;
1579 
1580 	nrt = ip6_rt_copy(rt);
1581 	if (nrt == NULL)
1582 		goto out;
1583 
1584 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1585 	if (on_link)
1586 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1587 
1588 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1589 	nrt->rt6i_dst.plen = 128;
1590 	nrt->dst.flags |= DST_HOST;
1591 
1592 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1593 	nrt->rt6i_nexthop = neigh_clone(neigh);
1594 
1595 	if (ip6_ins_rt(nrt))
1596 		goto out;
1597 
1598 	netevent.old = &rt->dst;
1599 	netevent.new = &nrt->dst;
1600 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1601 
1602 	if (rt->rt6i_flags&RTF_CACHE) {
1603 		ip6_del_rt(rt);
1604 		return;
1605 	}
1606 
1607 out:
1608 	dst_release(&rt->dst);
1609 }
1610 
1611 /*
1612  *	Handle ICMP "packet too big" messages
1613  *	i.e. Path MTU discovery
1614  */
1615 
1616 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1617 			     struct net *net, u32 pmtu, int ifindex)
1618 {
1619 	struct rt6_info *rt, *nrt;
1620 	int allfrag = 0;
1621 again:
1622 	rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1623 	if (rt == NULL)
1624 		return;
1625 
1626 	if (rt6_check_expired(rt)) {
1627 		ip6_del_rt(rt);
1628 		goto again;
1629 	}
1630 
1631 	if (pmtu >= dst_mtu(&rt->dst))
1632 		goto out;
1633 
1634 	if (pmtu < IPV6_MIN_MTU) {
1635 		/*
1636 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1637 		 * MTU (1280) and a fragment header should always be included
1638 		 * after a node receiving Too Big message reporting PMTU is
1639 		 * less than the IPv6 Minimum Link MTU.
1640 		 */
1641 		pmtu = IPV6_MIN_MTU;
1642 		allfrag = 1;
1643 	}
1644 
1645 	/* New mtu received -> path was valid.
1646 	   They are sent only in response to data packets,
1647 	   so that this nexthop apparently is reachable. --ANK
1648 	 */
1649 	dst_confirm(&rt->dst);
1650 
1651 	/* Host route. If it is static, it would be better
1652 	   not to override it, but add new one, so that
1653 	   when cache entry will expire old pmtu
1654 	   would return automatically.
1655 	 */
1656 	if (rt->rt6i_flags & RTF_CACHE) {
1657 		dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1658 		if (allfrag) {
1659 			u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1660 			features |= RTAX_FEATURE_ALLFRAG;
1661 			dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1662 		}
1663 		dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1664 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1665 		goto out;
1666 	}
1667 
1668 	/* Network route.
1669 	   Two cases are possible:
1670 	   1. It is connected route. Action: COW
1671 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1672 	 */
1673 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1674 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1675 	else
1676 		nrt = rt6_alloc_clone(rt, daddr);
1677 
1678 	if (nrt) {
1679 		dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1680 		if (allfrag) {
1681 			u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1682 			features |= RTAX_FEATURE_ALLFRAG;
1683 			dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1684 		}
1685 
1686 		/* According to RFC 1981, detecting PMTU increase shouldn't be
1687 		 * happened within 5 mins, the recommended timer is 10 mins.
1688 		 * Here this route expiration time is set to ip6_rt_mtu_expires
1689 		 * which is 10 mins. After 10 mins the decreased pmtu is expired
1690 		 * and detecting PMTU increase will be automatically happened.
1691 		 */
1692 		dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1693 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1694 
1695 		ip6_ins_rt(nrt);
1696 	}
1697 out:
1698 	dst_release(&rt->dst);
1699 }
1700 
1701 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1702 			struct net_device *dev, u32 pmtu)
1703 {
1704 	struct net *net = dev_net(dev);
1705 
1706 	/*
1707 	 * RFC 1981 states that a node "MUST reduce the size of the packets it
1708 	 * is sending along the path" that caused the Packet Too Big message.
1709 	 * Since it's not possible in the general case to determine which
1710 	 * interface was used to send the original packet, we update the MTU
1711 	 * on the interface that will be used to send future packets. We also
1712 	 * update the MTU on the interface that received the Packet Too Big in
1713 	 * case the original packet was forced out that interface with
1714 	 * SO_BINDTODEVICE or similar. This is the next best thing to the
1715 	 * correct behaviour, which would be to update the MTU on all
1716 	 * interfaces.
1717 	 */
1718 	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1719 	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1720 }
1721 
1722 /*
1723  *	Misc support functions
1724  */
1725 
1726 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1727 {
1728 	struct net *net = dev_net(ort->rt6i_dev);
1729 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1730 					    ort->dst.dev, 0);
1731 
1732 	if (rt) {
1733 		rt->dst.input = ort->dst.input;
1734 		rt->dst.output = ort->dst.output;
1735 
1736 		dst_copy_metrics(&rt->dst, &ort->dst);
1737 		rt->dst.error = ort->dst.error;
1738 		rt->rt6i_idev = ort->rt6i_idev;
1739 		if (rt->rt6i_idev)
1740 			in6_dev_hold(rt->rt6i_idev);
1741 		rt->dst.lastuse = jiffies;
1742 		rt->rt6i_expires = 0;
1743 
1744 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1745 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1746 		rt->rt6i_metric = 0;
1747 
1748 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1749 #ifdef CONFIG_IPV6_SUBTREES
1750 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1751 #endif
1752 		memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1753 		rt->rt6i_table = ort->rt6i_table;
1754 	}
1755 	return rt;
1756 }
1757 
1758 #ifdef CONFIG_IPV6_ROUTE_INFO
1759 static struct rt6_info *rt6_get_route_info(struct net *net,
1760 					   const struct in6_addr *prefix, int prefixlen,
1761 					   const struct in6_addr *gwaddr, int ifindex)
1762 {
1763 	struct fib6_node *fn;
1764 	struct rt6_info *rt = NULL;
1765 	struct fib6_table *table;
1766 
1767 	table = fib6_get_table(net, RT6_TABLE_INFO);
1768 	if (table == NULL)
1769 		return NULL;
1770 
1771 	write_lock_bh(&table->tb6_lock);
1772 	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1773 	if (!fn)
1774 		goto out;
1775 
1776 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1777 		if (rt->rt6i_dev->ifindex != ifindex)
1778 			continue;
1779 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1780 			continue;
1781 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1782 			continue;
1783 		dst_hold(&rt->dst);
1784 		break;
1785 	}
1786 out:
1787 	write_unlock_bh(&table->tb6_lock);
1788 	return rt;
1789 }
1790 
1791 static struct rt6_info *rt6_add_route_info(struct net *net,
1792 					   const struct in6_addr *prefix, int prefixlen,
1793 					   const struct in6_addr *gwaddr, int ifindex,
1794 					   unsigned pref)
1795 {
1796 	struct fib6_config cfg = {
1797 		.fc_table	= RT6_TABLE_INFO,
1798 		.fc_metric	= IP6_RT_PRIO_USER,
1799 		.fc_ifindex	= ifindex,
1800 		.fc_dst_len	= prefixlen,
1801 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1802 				  RTF_UP | RTF_PREF(pref),
1803 		.fc_nlinfo.pid = 0,
1804 		.fc_nlinfo.nlh = NULL,
1805 		.fc_nlinfo.nl_net = net,
1806 	};
1807 
1808 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1809 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1810 
1811 	/* We should treat it as a default route if prefix length is 0. */
1812 	if (!prefixlen)
1813 		cfg.fc_flags |= RTF_DEFAULT;
1814 
1815 	ip6_route_add(&cfg);
1816 
1817 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1818 }
1819 #endif
1820 
1821 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1822 {
1823 	struct rt6_info *rt;
1824 	struct fib6_table *table;
1825 
1826 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1827 	if (table == NULL)
1828 		return NULL;
1829 
1830 	write_lock_bh(&table->tb6_lock);
1831 	for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1832 		if (dev == rt->rt6i_dev &&
1833 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1834 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1835 			break;
1836 	}
1837 	if (rt)
1838 		dst_hold(&rt->dst);
1839 	write_unlock_bh(&table->tb6_lock);
1840 	return rt;
1841 }
1842 
1843 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1844 				     struct net_device *dev,
1845 				     unsigned int pref)
1846 {
1847 	struct fib6_config cfg = {
1848 		.fc_table	= RT6_TABLE_DFLT,
1849 		.fc_metric	= IP6_RT_PRIO_USER,
1850 		.fc_ifindex	= dev->ifindex,
1851 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1852 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1853 		.fc_nlinfo.pid = 0,
1854 		.fc_nlinfo.nlh = NULL,
1855 		.fc_nlinfo.nl_net = dev_net(dev),
1856 	};
1857 
1858 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1859 
1860 	ip6_route_add(&cfg);
1861 
1862 	return rt6_get_dflt_router(gwaddr, dev);
1863 }
1864 
1865 void rt6_purge_dflt_routers(struct net *net)
1866 {
1867 	struct rt6_info *rt;
1868 	struct fib6_table *table;
1869 
1870 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1871 	table = fib6_get_table(net, RT6_TABLE_DFLT);
1872 	if (table == NULL)
1873 		return;
1874 
1875 restart:
1876 	read_lock_bh(&table->tb6_lock);
1877 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1878 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1879 			dst_hold(&rt->dst);
1880 			read_unlock_bh(&table->tb6_lock);
1881 			ip6_del_rt(rt);
1882 			goto restart;
1883 		}
1884 	}
1885 	read_unlock_bh(&table->tb6_lock);
1886 }
1887 
1888 static void rtmsg_to_fib6_config(struct net *net,
1889 				 struct in6_rtmsg *rtmsg,
1890 				 struct fib6_config *cfg)
1891 {
1892 	memset(cfg, 0, sizeof(*cfg));
1893 
1894 	cfg->fc_table = RT6_TABLE_MAIN;
1895 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1896 	cfg->fc_metric = rtmsg->rtmsg_metric;
1897 	cfg->fc_expires = rtmsg->rtmsg_info;
1898 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1899 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1900 	cfg->fc_flags = rtmsg->rtmsg_flags;
1901 
1902 	cfg->fc_nlinfo.nl_net = net;
1903 
1904 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1905 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1906 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1907 }
1908 
1909 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1910 {
1911 	struct fib6_config cfg;
1912 	struct in6_rtmsg rtmsg;
1913 	int err;
1914 
1915 	switch(cmd) {
1916 	case SIOCADDRT:		/* Add a route */
1917 	case SIOCDELRT:		/* Delete a route */
1918 		if (!capable(CAP_NET_ADMIN))
1919 			return -EPERM;
1920 		err = copy_from_user(&rtmsg, arg,
1921 				     sizeof(struct in6_rtmsg));
1922 		if (err)
1923 			return -EFAULT;
1924 
1925 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1926 
1927 		rtnl_lock();
1928 		switch (cmd) {
1929 		case SIOCADDRT:
1930 			err = ip6_route_add(&cfg);
1931 			break;
1932 		case SIOCDELRT:
1933 			err = ip6_route_del(&cfg);
1934 			break;
1935 		default:
1936 			err = -EINVAL;
1937 		}
1938 		rtnl_unlock();
1939 
1940 		return err;
1941 	}
1942 
1943 	return -EINVAL;
1944 }
1945 
1946 /*
1947  *	Drop the packet on the floor
1948  */
1949 
1950 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1951 {
1952 	int type;
1953 	struct dst_entry *dst = skb_dst(skb);
1954 	switch (ipstats_mib_noroutes) {
1955 	case IPSTATS_MIB_INNOROUTES:
1956 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1957 		if (type == IPV6_ADDR_ANY) {
1958 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1959 				      IPSTATS_MIB_INADDRERRORS);
1960 			break;
1961 		}
1962 		/* FALLTHROUGH */
1963 	case IPSTATS_MIB_OUTNOROUTES:
1964 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1965 			      ipstats_mib_noroutes);
1966 		break;
1967 	}
1968 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1969 	kfree_skb(skb);
1970 	return 0;
1971 }
1972 
1973 static int ip6_pkt_discard(struct sk_buff *skb)
1974 {
1975 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1976 }
1977 
1978 static int ip6_pkt_discard_out(struct sk_buff *skb)
1979 {
1980 	skb->dev = skb_dst(skb)->dev;
1981 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1982 }
1983 
1984 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1985 
1986 static int ip6_pkt_prohibit(struct sk_buff *skb)
1987 {
1988 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1989 }
1990 
1991 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1992 {
1993 	skb->dev = skb_dst(skb)->dev;
1994 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1995 }
1996 
1997 #endif
1998 
1999 /*
2000  *	Allocate a dst for local (unicast / anycast) address.
2001  */
2002 
2003 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2004 				    const struct in6_addr *addr,
2005 				    int anycast)
2006 {
2007 	struct net *net = dev_net(idev->dev);
2008 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2009 					    net->loopback_dev, 0);
2010 	struct neighbour *neigh;
2011 
2012 	if (rt == NULL) {
2013 		if (net_ratelimit())
2014 			pr_warning("IPv6:  Maximum number of routes reached,"
2015 				   " consider increasing route/max_size.\n");
2016 		return ERR_PTR(-ENOMEM);
2017 	}
2018 
2019 	in6_dev_hold(idev);
2020 
2021 	rt->dst.flags |= DST_HOST;
2022 	rt->dst.input = ip6_input;
2023 	rt->dst.output = ip6_output;
2024 	rt->rt6i_idev = idev;
2025 	rt->dst.obsolete = -1;
2026 
2027 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2028 	if (anycast)
2029 		rt->rt6i_flags |= RTF_ANYCAST;
2030 	else
2031 		rt->rt6i_flags |= RTF_LOCAL;
2032 	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2033 	if (IS_ERR(neigh)) {
2034 		dst_free(&rt->dst);
2035 
2036 		return ERR_CAST(neigh);
2037 	}
2038 	rt->rt6i_nexthop = neigh;
2039 
2040 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2041 	rt->rt6i_dst.plen = 128;
2042 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2043 
2044 	atomic_set(&rt->dst.__refcnt, 1);
2045 
2046 	return rt;
2047 }
2048 
2049 int ip6_route_get_saddr(struct net *net,
2050 			struct rt6_info *rt,
2051 			const struct in6_addr *daddr,
2052 			unsigned int prefs,
2053 			struct in6_addr *saddr)
2054 {
2055 	struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2056 	int err = 0;
2057 	if (rt->rt6i_prefsrc.plen)
2058 		ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2059 	else
2060 		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2061 					 daddr, prefs, saddr);
2062 	return err;
2063 }
2064 
/* remove deleted ip from prefsrc entries */
/* Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
	struct net_device *dev;	/* only routes on this device; NULL matches any */
	struct net *net;	/* namespace whose null entry is skipped */
	struct in6_addr *addr;	/* address compared against each route's prefsrc */
};
2071 
2072 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2073 {
2074 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2075 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2076 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2077 
2078 	if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2079 	    rt != net->ipv6.ip6_null_entry &&
2080 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2081 		/* remove prefsrc entry */
2082 		rt->rt6i_prefsrc.plen = 0;
2083 	}
2084 	return 0;
2085 }
2086 
2087 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2088 {
2089 	struct net *net = dev_net(ifp->idev->dev);
2090 	struct arg_dev_net_ip adni = {
2091 		.dev = ifp->idev->dev,
2092 		.net = net,
2093 		.addr = &ifp->addr,
2094 	};
2095 	fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2096 }
2097 
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches every route */
	struct net *net;	/* namespace whose null entry is never selected */
};
2102 
2103 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2104 {
2105 	const struct arg_dev_net *adn = arg;
2106 	const struct net_device *dev = adn->dev;
2107 
2108 	if ((rt->rt6i_dev == dev || dev == NULL) &&
2109 	    rt != adn->net->ipv6.ip6_null_entry) {
2110 		RT6_TRACE("deleted by ifdown %p\n", rt);
2111 		return -1;
2112 	}
2113 	return 0;
2114 }
2115 
2116 void rt6_ifdown(struct net *net, struct net_device *dev)
2117 {
2118 	struct arg_dev_net adn = {
2119 		.dev = dev,
2120 		.net = net,
2121 	};
2122 
2123 	fib6_clean_all(net, fib6_ifdown, 0, &adn);
2124 	icmp6_clean_all(fib6_ifdown, &adn);
2125 }
2126 
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU value */
};
2132 
2133 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2134 {
2135 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2136 	struct inet6_dev *idev;
2137 
2138 	/* In IPv6 pmtu discovery is not optional,
2139 	   so that RTAX_MTU lock cannot disable it.
2140 	   We still use this lock to block changes
2141 	   caused by addrconf/ndisc.
2142 	*/
2143 
2144 	idev = __in6_dev_get(arg->dev);
2145 	if (idev == NULL)
2146 		return 0;
2147 
2148 	/* For administrative MTU increase, there is no way to discover
2149 	   IPv6 PMTU increase, so PMTU increase should be updated here.
2150 	   Since RFC 1981 doesn't include administrative MTU increase
2151 	   update PMTU increase is a MUST. (i.e. jumbo frame)
2152 	 */
2153 	/*
2154 	   If new MTU is less than route PMTU, this new MTU will be the
2155 	   lowest MTU in the path, update the route PMTU to reflect PMTU
2156 	   decreases; if new MTU is greater than route PMTU, and the
2157 	   old MTU is the lowest MTU in the path, update the route PMTU
2158 	   to reflect the increase. In this case if the other nodes' MTU
2159 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
2160 	   PMTU discouvery.
2161 	 */
2162 	if (rt->rt6i_dev == arg->dev &&
2163 	    !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2164 	    (dst_mtu(&rt->dst) >= arg->mtu ||
2165 	     (dst_mtu(&rt->dst) < arg->mtu &&
2166 	      dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2167 		dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2168 	}
2169 	return 0;
2170 }
2171 
2172 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2173 {
2174 	struct rt6_mtu_change_arg arg = {
2175 		.dev = dev,
2176 		.mtu = mtu,
2177 	};
2178 
2179 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2180 }
2181 
2182 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2183 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2184 	[RTA_OIF]               = { .type = NLA_U32 },
2185 	[RTA_IIF]		= { .type = NLA_U32 },
2186 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2187 	[RTA_METRICS]           = { .type = NLA_NESTED },
2188 };
2189 
2190 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2191 			      struct fib6_config *cfg)
2192 {
2193 	struct rtmsg *rtm;
2194 	struct nlattr *tb[RTA_MAX+1];
2195 	int err;
2196 
2197 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2198 	if (err < 0)
2199 		goto errout;
2200 
2201 	err = -EINVAL;
2202 	rtm = nlmsg_data(nlh);
2203 	memset(cfg, 0, sizeof(*cfg));
2204 
2205 	cfg->fc_table = rtm->rtm_table;
2206 	cfg->fc_dst_len = rtm->rtm_dst_len;
2207 	cfg->fc_src_len = rtm->rtm_src_len;
2208 	cfg->fc_flags = RTF_UP;
2209 	cfg->fc_protocol = rtm->rtm_protocol;
2210 
2211 	if (rtm->rtm_type == RTN_UNREACHABLE)
2212 		cfg->fc_flags |= RTF_REJECT;
2213 
2214 	if (rtm->rtm_type == RTN_LOCAL)
2215 		cfg->fc_flags |= RTF_LOCAL;
2216 
2217 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2218 	cfg->fc_nlinfo.nlh = nlh;
2219 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2220 
2221 	if (tb[RTA_GATEWAY]) {
2222 		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2223 		cfg->fc_flags |= RTF_GATEWAY;
2224 	}
2225 
2226 	if (tb[RTA_DST]) {
2227 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2228 
2229 		if (nla_len(tb[RTA_DST]) < plen)
2230 			goto errout;
2231 
2232 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2233 	}
2234 
2235 	if (tb[RTA_SRC]) {
2236 		int plen = (rtm->rtm_src_len + 7) >> 3;
2237 
2238 		if (nla_len(tb[RTA_SRC]) < plen)
2239 			goto errout;
2240 
2241 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2242 	}
2243 
2244 	if (tb[RTA_PREFSRC])
2245 		nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2246 
2247 	if (tb[RTA_OIF])
2248 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2249 
2250 	if (tb[RTA_PRIORITY])
2251 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2252 
2253 	if (tb[RTA_METRICS]) {
2254 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2255 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2256 	}
2257 
2258 	if (tb[RTA_TABLE])
2259 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2260 
2261 	err = 0;
2262 errout:
2263 	return err;
2264 }
2265 
2266 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2267 {
2268 	struct fib6_config cfg;
2269 	int err;
2270 
2271 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2272 	if (err < 0)
2273 		return err;
2274 
2275 	return ip6_route_del(&cfg);
2276 }
2277 
2278 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2279 {
2280 	struct fib6_config cfg;
2281 	int err;
2282 
2283 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2284 	if (err < 0)
2285 		return err;
2286 
2287 	return ip6_route_add(&cfg);
2288 }
2289 
2290 static inline size_t rt6_nlmsg_size(void)
2291 {
2292 	return NLMSG_ALIGN(sizeof(struct rtmsg))
2293 	       + nla_total_size(16) /* RTA_SRC */
2294 	       + nla_total_size(16) /* RTA_DST */
2295 	       + nla_total_size(16) /* RTA_GATEWAY */
2296 	       + nla_total_size(16) /* RTA_PREFSRC */
2297 	       + nla_total_size(4) /* RTA_TABLE */
2298 	       + nla_total_size(4) /* RTA_IIF */
2299 	       + nla_total_size(4) /* RTA_OIF */
2300 	       + nla_total_size(4) /* RTA_PRIORITY */
2301 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2302 	       + nla_total_size(sizeof(struct rta_cacheinfo));
2303 }
2304 
2305 static int rt6_fill_node(struct net *net,
2306 			 struct sk_buff *skb, struct rt6_info *rt,
2307 			 struct in6_addr *dst, struct in6_addr *src,
2308 			 int iif, int type, u32 pid, u32 seq,
2309 			 int prefix, int nowait, unsigned int flags)
2310 {
2311 	struct rtmsg *rtm;
2312 	struct nlmsghdr *nlh;
2313 	long expires;
2314 	u32 table;
2315 
2316 	if (prefix) {	/* user wants prefix routes only */
2317 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2318 			/* success since this is not a prefix route */
2319 			return 1;
2320 		}
2321 	}
2322 
2323 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2324 	if (nlh == NULL)
2325 		return -EMSGSIZE;
2326 
2327 	rtm = nlmsg_data(nlh);
2328 	rtm->rtm_family = AF_INET6;
2329 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2330 	rtm->rtm_src_len = rt->rt6i_src.plen;
2331 	rtm->rtm_tos = 0;
2332 	if (rt->rt6i_table)
2333 		table = rt->rt6i_table->tb6_id;
2334 	else
2335 		table = RT6_TABLE_UNSPEC;
2336 	rtm->rtm_table = table;
2337 	NLA_PUT_U32(skb, RTA_TABLE, table);
2338 	if (rt->rt6i_flags&RTF_REJECT)
2339 		rtm->rtm_type = RTN_UNREACHABLE;
2340 	else if (rt->rt6i_flags&RTF_LOCAL)
2341 		rtm->rtm_type = RTN_LOCAL;
2342 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2343 		rtm->rtm_type = RTN_LOCAL;
2344 	else
2345 		rtm->rtm_type = RTN_UNICAST;
2346 	rtm->rtm_flags = 0;
2347 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2348 	rtm->rtm_protocol = rt->rt6i_protocol;
2349 	if (rt->rt6i_flags&RTF_DYNAMIC)
2350 		rtm->rtm_protocol = RTPROT_REDIRECT;
2351 	else if (rt->rt6i_flags & RTF_ADDRCONF)
2352 		rtm->rtm_protocol = RTPROT_KERNEL;
2353 	else if (rt->rt6i_flags&RTF_DEFAULT)
2354 		rtm->rtm_protocol = RTPROT_RA;
2355 
2356 	if (rt->rt6i_flags&RTF_CACHE)
2357 		rtm->rtm_flags |= RTM_F_CLONED;
2358 
2359 	if (dst) {
2360 		NLA_PUT(skb, RTA_DST, 16, dst);
2361 		rtm->rtm_dst_len = 128;
2362 	} else if (rtm->rtm_dst_len)
2363 		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2364 #ifdef CONFIG_IPV6_SUBTREES
2365 	if (src) {
2366 		NLA_PUT(skb, RTA_SRC, 16, src);
2367 		rtm->rtm_src_len = 128;
2368 	} else if (rtm->rtm_src_len)
2369 		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2370 #endif
2371 	if (iif) {
2372 #ifdef CONFIG_IPV6_MROUTE
2373 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2374 			int err = ip6mr_get_route(net, skb, rtm, nowait);
2375 			if (err <= 0) {
2376 				if (!nowait) {
2377 					if (err == 0)
2378 						return 0;
2379 					goto nla_put_failure;
2380 				} else {
2381 					if (err == -EMSGSIZE)
2382 						goto nla_put_failure;
2383 				}
2384 			}
2385 		} else
2386 #endif
2387 			NLA_PUT_U32(skb, RTA_IIF, iif);
2388 	} else if (dst) {
2389 		struct in6_addr saddr_buf;
2390 		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2391 			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2392 	}
2393 
2394 	if (rt->rt6i_prefsrc.plen) {
2395 		struct in6_addr saddr_buf;
2396 		ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2397 		NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2398 	}
2399 
2400 	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2401 		goto nla_put_failure;
2402 
2403 	if (rt->dst.neighbour)
2404 		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2405 
2406 	if (rt->dst.dev)
2407 		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2408 
2409 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2410 
2411 	if (!(rt->rt6i_flags & RTF_EXPIRES))
2412 		expires = 0;
2413 	else if (rt->rt6i_expires - jiffies < INT_MAX)
2414 		expires = rt->rt6i_expires - jiffies;
2415 	else
2416 		expires = INT_MAX;
2417 
2418 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2419 			       expires, rt->dst.error) < 0)
2420 		goto nla_put_failure;
2421 
2422 	return nlmsg_end(skb, nlh);
2423 
2424 nla_put_failure:
2425 	nlmsg_cancel(skb, nlh);
2426 	return -EMSGSIZE;
2427 }
2428 
2429 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2430 {
2431 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2432 	int prefix;
2433 
2434 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2435 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2436 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2437 	} else
2438 		prefix = 0;
2439 
2440 	return rt6_fill_node(arg->net,
2441 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2442 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2443 		     prefix, 0, NLM_F_MULTI);
2444 }
2445 
2446 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2447 {
2448 	struct net *net = sock_net(in_skb->sk);
2449 	struct nlattr *tb[RTA_MAX+1];
2450 	struct rt6_info *rt;
2451 	struct sk_buff *skb;
2452 	struct rtmsg *rtm;
2453 	struct flowi6 fl6;
2454 	int err, iif = 0;
2455 
2456 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2457 	if (err < 0)
2458 		goto errout;
2459 
2460 	err = -EINVAL;
2461 	memset(&fl6, 0, sizeof(fl6));
2462 
2463 	if (tb[RTA_SRC]) {
2464 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2465 			goto errout;
2466 
2467 		ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2468 	}
2469 
2470 	if (tb[RTA_DST]) {
2471 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2472 			goto errout;
2473 
2474 		ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2475 	}
2476 
2477 	if (tb[RTA_IIF])
2478 		iif = nla_get_u32(tb[RTA_IIF]);
2479 
2480 	if (tb[RTA_OIF])
2481 		fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2482 
2483 	if (iif) {
2484 		struct net_device *dev;
2485 		dev = __dev_get_by_index(net, iif);
2486 		if (!dev) {
2487 			err = -ENODEV;
2488 			goto errout;
2489 		}
2490 	}
2491 
2492 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2493 	if (skb == NULL) {
2494 		err = -ENOBUFS;
2495 		goto errout;
2496 	}
2497 
2498 	/* Reserve room for dummy headers, this skb can pass
2499 	   through good chunk of routing engine.
2500 	 */
2501 	skb_reset_mac_header(skb);
2502 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2503 
2504 	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2505 	skb_dst_set(skb, &rt->dst);
2506 
2507 	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2508 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2509 			    nlh->nlmsg_seq, 0, 0, 0);
2510 	if (err < 0) {
2511 		kfree_skb(skb);
2512 		goto errout;
2513 	}
2514 
2515 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2516 errout:
2517 	return err;
2518 }
2519 
2520 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2521 {
2522 	struct sk_buff *skb;
2523 	struct net *net = info->nl_net;
2524 	u32 seq;
2525 	int err;
2526 
2527 	err = -ENOBUFS;
2528 	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2529 
2530 	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2531 	if (skb == NULL)
2532 		goto errout;
2533 
2534 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2535 				event, info->pid, seq, 0, 0, 0);
2536 	if (err < 0) {
2537 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2538 		WARN_ON(err == -EMSGSIZE);
2539 		kfree_skb(skb);
2540 		goto errout;
2541 	}
2542 	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2543 		    info->nlh, gfp_any());
2544 	return;
2545 errout:
2546 	if (err < 0)
2547 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2548 }
2549 
2550 static int ip6_route_dev_notify(struct notifier_block *this,
2551 				unsigned long event, void *data)
2552 {
2553 	struct net_device *dev = (struct net_device *)data;
2554 	struct net *net = dev_net(dev);
2555 
2556 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2557 		net->ipv6.ip6_null_entry->dst.dev = dev;
2558 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2559 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2560 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2561 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2562 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2563 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2564 #endif
2565 	}
2566 
2567 	return NOTIFY_OK;
2568 }
2569 
2570 /*
2571  *	/proc
2572  */
2573 
2574 #ifdef CONFIG_PROC_FS
2575 
/*
 * Buffer bookkeeping for /proc output.
 *
 * NOTE(review): the /proc handlers in this part of the file pass a
 * struct seq_file and do not reference this structure; it looks like a
 * leftover -- confirm before removing.  The field descriptions below
 * are inferred from the names only.
 */
struct rt6_proc_arg {
	char *buffer;	/* presumably the output buffer */
	int offset;	/* presumably the read offset requested */
	int length;	/* presumably the size of @buffer */
	int skip;	/* presumably bytes still to be skipped */
	int len;	/* presumably bytes produced so far */
};
2584 
2585 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2586 {
2587 	struct seq_file *m = p_arg;
2588 
2589 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2590 
2591 #ifdef CONFIG_IPV6_SUBTREES
2592 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2593 #else
2594 	seq_puts(m, "00000000000000000000000000000000 00 ");
2595 #endif
2596 
2597 	if (rt->rt6i_nexthop) {
2598 		seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2599 	} else {
2600 		seq_puts(m, "00000000000000000000000000000000");
2601 	}
2602 	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2603 		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2604 		   rt->dst.__use, rt->rt6i_flags,
2605 		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2606 	return 0;
2607 }
2608 
2609 static int ipv6_route_show(struct seq_file *m, void *v)
2610 {
2611 	struct net *net = (struct net *)m->private;
2612 	fib6_clean_all(net, rt6_info_route, 0, m);
2613 	return 0;
2614 }
2615 
2616 static int ipv6_route_open(struct inode *inode, struct file *file)
2617 {
2618 	return single_open_net(inode, file, ipv6_route_show);
2619 }
2620 
2621 static const struct file_operations ipv6_route_proc_fops = {
2622 	.owner		= THIS_MODULE,
2623 	.open		= ipv6_route_open,
2624 	.read		= seq_read,
2625 	.llseek		= seq_lseek,
2626 	.release	= single_release_net,
2627 };
2628 
2629 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2630 {
2631 	struct net *net = (struct net *)seq->private;
2632 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2633 		   net->ipv6.rt6_stats->fib_nodes,
2634 		   net->ipv6.rt6_stats->fib_route_nodes,
2635 		   net->ipv6.rt6_stats->fib_rt_alloc,
2636 		   net->ipv6.rt6_stats->fib_rt_entries,
2637 		   net->ipv6.rt6_stats->fib_rt_cache,
2638 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2639 		   net->ipv6.rt6_stats->fib_discarded_routes);
2640 
2641 	return 0;
2642 }
2643 
2644 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2645 {
2646 	return single_open_net(inode, file, rt6_stats_seq_show);
2647 }
2648 
2649 static const struct file_operations rt6_stats_seq_fops = {
2650 	.owner	 = THIS_MODULE,
2651 	.open	 = rt6_stats_seq_open,
2652 	.read	 = seq_read,
2653 	.llseek	 = seq_lseek,
2654 	.release = single_release_net,
2655 };
2656 #endif	/* CONFIG_PROC_FS */
2657 
2658 #ifdef CONFIG_SYSCTL
2659 
2660 static
2661 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2662 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2663 {
2664 	struct net *net;
2665 	int delay;
2666 	if (!write)
2667 		return -EINVAL;
2668 
2669 	net = (struct net *)ctl->extra1;
2670 	delay = net->ipv6.sysctl.flush_delay;
2671 	proc_dointvec(ctl, write, buffer, lenp, ppos);
2672 	fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2673 	return 0;
2674 }
2675 
2676 ctl_table ipv6_route_table_template[] = {
2677 	{
2678 		.procname	=	"flush",
2679 		.data		=	&init_net.ipv6.sysctl.flush_delay,
2680 		.maxlen		=	sizeof(int),
2681 		.mode		=	0200,
2682 		.proc_handler	=	ipv6_sysctl_rtcache_flush
2683 	},
2684 	{
2685 		.procname	=	"gc_thresh",
2686 		.data		=	&ip6_dst_ops_template.gc_thresh,
2687 		.maxlen		=	sizeof(int),
2688 		.mode		=	0644,
2689 		.proc_handler	=	proc_dointvec,
2690 	},
2691 	{
2692 		.procname	=	"max_size",
2693 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
2694 		.maxlen		=	sizeof(int),
2695 		.mode		=	0644,
2696 		.proc_handler	=	proc_dointvec,
2697 	},
2698 	{
2699 		.procname	=	"gc_min_interval",
2700 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2701 		.maxlen		=	sizeof(int),
2702 		.mode		=	0644,
2703 		.proc_handler	=	proc_dointvec_jiffies,
2704 	},
2705 	{
2706 		.procname	=	"gc_timeout",
2707 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2708 		.maxlen		=	sizeof(int),
2709 		.mode		=	0644,
2710 		.proc_handler	=	proc_dointvec_jiffies,
2711 	},
2712 	{
2713 		.procname	=	"gc_interval",
2714 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
2715 		.maxlen		=	sizeof(int),
2716 		.mode		=	0644,
2717 		.proc_handler	=	proc_dointvec_jiffies,
2718 	},
2719 	{
2720 		.procname	=	"gc_elasticity",
2721 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2722 		.maxlen		=	sizeof(int),
2723 		.mode		=	0644,
2724 		.proc_handler	=	proc_dointvec,
2725 	},
2726 	{
2727 		.procname	=	"mtu_expires",
2728 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2729 		.maxlen		=	sizeof(int),
2730 		.mode		=	0644,
2731 		.proc_handler	=	proc_dointvec_jiffies,
2732 	},
2733 	{
2734 		.procname	=	"min_adv_mss",
2735 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
2736 		.maxlen		=	sizeof(int),
2737 		.mode		=	0644,
2738 		.proc_handler	=	proc_dointvec,
2739 	},
2740 	{
2741 		.procname	=	"gc_min_interval_ms",
2742 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2743 		.maxlen		=	sizeof(int),
2744 		.mode		=	0644,
2745 		.proc_handler	=	proc_dointvec_ms_jiffies,
2746 	},
2747 	{ }
2748 };
2749 
2750 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2751 {
2752 	struct ctl_table *table;
2753 
2754 	table = kmemdup(ipv6_route_table_template,
2755 			sizeof(ipv6_route_table_template),
2756 			GFP_KERNEL);
2757 
2758 	if (table) {
2759 		table[0].data = &net->ipv6.sysctl.flush_delay;
2760 		table[0].extra1 = net;
2761 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2762 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2763 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2764 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2765 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2766 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2767 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2768 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2769 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2770 	}
2771 
2772 	return table;
2773 }
2774 #endif
2775 
2776 static int __net_init ip6_route_net_init(struct net *net)
2777 {
2778 	int ret = -ENOMEM;
2779 
2780 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2781 	       sizeof(net->ipv6.ip6_dst_ops));
2782 
2783 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2784 		goto out_ip6_dst_ops;
2785 
2786 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2787 					   sizeof(*net->ipv6.ip6_null_entry),
2788 					   GFP_KERNEL);
2789 	if (!net->ipv6.ip6_null_entry)
2790 		goto out_ip6_dst_entries;
2791 	net->ipv6.ip6_null_entry->dst.path =
2792 		(struct dst_entry *)net->ipv6.ip6_null_entry;
2793 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2794 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2795 			 ip6_template_metrics, true);
2796 
2797 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2798 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2799 					       sizeof(*net->ipv6.ip6_prohibit_entry),
2800 					       GFP_KERNEL);
2801 	if (!net->ipv6.ip6_prohibit_entry)
2802 		goto out_ip6_null_entry;
2803 	net->ipv6.ip6_prohibit_entry->dst.path =
2804 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2805 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2806 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2807 			 ip6_template_metrics, true);
2808 
2809 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2810 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
2811 					       GFP_KERNEL);
2812 	if (!net->ipv6.ip6_blk_hole_entry)
2813 		goto out_ip6_prohibit_entry;
2814 	net->ipv6.ip6_blk_hole_entry->dst.path =
2815 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2816 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2817 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2818 			 ip6_template_metrics, true);
2819 #endif
2820 
2821 	net->ipv6.sysctl.flush_delay = 0;
2822 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
2823 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2824 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2825 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2826 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2827 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2828 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2829 
2830 #ifdef CONFIG_PROC_FS
2831 	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2832 	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2833 #endif
2834 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
2835 
2836 	ret = 0;
2837 out:
2838 	return ret;
2839 
2840 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2841 out_ip6_prohibit_entry:
2842 	kfree(net->ipv6.ip6_prohibit_entry);
2843 out_ip6_null_entry:
2844 	kfree(net->ipv6.ip6_null_entry);
2845 #endif
2846 out_ip6_dst_entries:
2847 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2848 out_ip6_dst_ops:
2849 	goto out;
2850 }
2851 
2852 static void __net_exit ip6_route_net_exit(struct net *net)
2853 {
2854 #ifdef CONFIG_PROC_FS
2855 	proc_net_remove(net, "ipv6_route");
2856 	proc_net_remove(net, "rt6_stats");
2857 #endif
2858 	kfree(net->ipv6.ip6_null_entry);
2859 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2860 	kfree(net->ipv6.ip6_prohibit_entry);
2861 	kfree(net->ipv6.ip6_blk_hole_entry);
2862 #endif
2863 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2864 }
2865 
2866 static struct pernet_operations ip6_route_net_ops = {
2867 	.init = ip6_route_net_init,
2868 	.exit = ip6_route_net_exit,
2869 };
2870 
2871 static struct notifier_block ip6_route_dev_notifier = {
2872 	.notifier_call = ip6_route_dev_notify,
2873 	.priority = 0,
2874 };
2875 
2876 int __init ip6_route_init(void)
2877 {
2878 	int ret;
2879 
2880 	ret = -ENOMEM;
2881 	ip6_dst_ops_template.kmem_cachep =
2882 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2883 				  SLAB_HWCACHE_ALIGN, NULL);
2884 	if (!ip6_dst_ops_template.kmem_cachep)
2885 		goto out;
2886 
2887 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
2888 	if (ret)
2889 		goto out_kmem_cache;
2890 
2891 	ret = register_pernet_subsys(&ip6_route_net_ops);
2892 	if (ret)
2893 		goto out_dst_entries;
2894 
2895 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2896 
2897 	/* Registering of the loopback is done before this portion of code,
2898 	 * the loopback reference in rt6_info will not be taken, do it
2899 	 * manually for init_net */
2900 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2901 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2902   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2903 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2904 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2905 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2906 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2907   #endif
2908 	ret = fib6_init();
2909 	if (ret)
2910 		goto out_register_subsys;
2911 
2912 	ret = xfrm6_init();
2913 	if (ret)
2914 		goto out_fib6_init;
2915 
2916 	ret = fib6_rules_init();
2917 	if (ret)
2918 		goto xfrm6_init;
2919 
2920 	ret = -ENOBUFS;
2921 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2922 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2923 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2924 		goto fib6_rules_init;
2925 
2926 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2927 	if (ret)
2928 		goto fib6_rules_init;
2929 
2930 out:
2931 	return ret;
2932 
2933 fib6_rules_init:
2934 	fib6_rules_cleanup();
2935 xfrm6_init:
2936 	xfrm6_fini();
2937 out_fib6_init:
2938 	fib6_gc_cleanup();
2939 out_register_subsys:
2940 	unregister_pernet_subsys(&ip6_route_net_ops);
2941 out_dst_entries:
2942 	dst_entries_destroy(&ip6_dst_blackhole_ops);
2943 out_kmem_cache:
2944 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2945 	goto out;
2946 }
2947 
2948 void ip6_route_cleanup(void)
2949 {
2950 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
2951 	fib6_rules_cleanup();
2952 	xfrm6_fini();
2953 	fib6_gc_cleanup();
2954 	unregister_pernet_subsys(&ip6_route_net_ops);
2955 	dst_entries_destroy(&ip6_dst_blackhole_ops);
2956 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2957 }
2958