xref: /linux/net/ipv6/route.c (revision 12871a0bd67dd4db4418e1daafcd46e9d329ef10)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
66 
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74 
75 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
76 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int	 ip6_default_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void		ip6_dst_destroy(struct dst_entry *);
81 static void		ip6_dst_ifdown(struct dst_entry *,
82 				       struct net_device *dev, int how);
83 static int		 ip6_dst_gc(struct dst_ops *ops);
84 
85 static int		ip6_pkt_discard(struct sk_buff *skb);
86 static int		ip6_pkt_discard_out(struct sk_buff *skb);
87 static void		ip6_link_failure(struct sk_buff *skb);
88 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89 
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92 					   const struct in6_addr *prefix, int prefixlen,
93 					   const struct in6_addr *gwaddr, int ifindex,
94 					   unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96 					   const struct in6_addr *prefix, int prefixlen,
97 					   const struct in6_addr *gwaddr, int ifindex);
98 #endif
99 
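/*
 * Copy-on-write of dst metrics: route entries start out pointing at a
 * shared, read-only metrics array.  On the first write we bind the route
 * to its inet_peer, copy the old metrics into the peer's private array,
 * and swing dst->_metrics over with cmpxchg().  If another CPU won the
 * race and the previous value is still marked read-only, NULL is returned
 * so the caller does not scribble on the shared template.
 */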
100 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
101 {
102 	struct rt6_info *rt = (struct rt6_info *) dst;
103 	struct inet_peer *peer;
104 	u32 *p = NULL;
105 
106 	if (!rt->rt6i_peer)
107 		rt6_bind_peer(rt, 1);
108 
109 	peer = rt->rt6i_peer;
110 	if (peer) {
111 		u32 *old_p = __DST_METRICS_PTR(old);
112 		unsigned long prev, new;
113 
114 		p = peer->metrics;
115 		if (inet_metrics_new(peer))
116 			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
117 
118 		new = (unsigned long) p;
119 		prev = cmpxchg(&dst->_metrics, old, new);
120 
121 		if (prev != old) {
122 			p = __DST_METRICS_PTR(prev);
123 			if (prev & DST_METRICS_READ_ONLY)
124 				p = NULL;
125 		}
126 	}
127 	return p;
128 }
129 
130 static struct dst_ops ip6_dst_ops_template = {
131 	.family			=	AF_INET6,
132 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
133 	.gc			=	ip6_dst_gc,
134 	.gc_thresh		=	1024,
135 	.check			=	ip6_dst_check,
136 	.default_advmss		=	ip6_default_advmss,
137 	.default_mtu		=	ip6_default_mtu,
138 	.cow_metrics		=	ipv6_cow_metrics,
139 	.destroy		=	ip6_dst_destroy,
140 	.ifdown			=	ip6_dst_ifdown,
141 	.negative_advice	=	ip6_negative_advice,
142 	.link_failure		=	ip6_link_failure,
143 	.update_pmtu		=	ip6_rt_update_pmtu,
144 	.local_out		=	__ip6_local_out,
145 };
146 
147 static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
148 {
149 	return 0;
150 }
151 
152 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
153 {
154 }
155 
156 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
157 					 unsigned long old)
158 {
159 	return NULL;
160 }
161 
162 static struct dst_ops ip6_dst_blackhole_ops = {
163 	.family			=	AF_INET6,
164 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
165 	.destroy		=	ip6_dst_destroy,
166 	.check			=	ip6_dst_check,
167 	.default_mtu		=	ip6_blackhole_default_mtu,
168 	.default_advmss		=	ip6_default_advmss,
169 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
170 	.cow_metrics		=	ip6_rt_blackhole_cow_metrics,
171 };
172 
173 static const u32 ip6_template_metrics[RTAX_MAX] = {
174 	[RTAX_HOPLIMIT - 1] = 255,
175 };
176 
177 static struct rt6_info ip6_null_entry_template = {
178 	.dst = {
179 		.__refcnt	= ATOMIC_INIT(1),
180 		.__use		= 1,
181 		.obsolete	= -1,
182 		.error		= -ENETUNREACH,
183 		.input		= ip6_pkt_discard,
184 		.output		= ip6_pkt_discard_out,
185 	},
186 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
187 	.rt6i_protocol  = RTPROT_KERNEL,
188 	.rt6i_metric	= ~(u32) 0,
189 	.rt6i_ref	= ATOMIC_INIT(1),
190 };
191 
192 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
193 
194 static int ip6_pkt_prohibit(struct sk_buff *skb);
195 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
196 
197 static struct rt6_info ip6_prohibit_entry_template = {
198 	.dst = {
199 		.__refcnt	= ATOMIC_INIT(1),
200 		.__use		= 1,
201 		.obsolete	= -1,
202 		.error		= -EACCES,
203 		.input		= ip6_pkt_prohibit,
204 		.output		= ip6_pkt_prohibit_out,
205 	},
206 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
207 	.rt6i_protocol  = RTPROT_KERNEL,
208 	.rt6i_metric	= ~(u32) 0,
209 	.rt6i_ref	= ATOMIC_INIT(1),
210 };
211 
212 static struct rt6_info ip6_blk_hole_entry_template = {
213 	.dst = {
214 		.__refcnt	= ATOMIC_INIT(1),
215 		.__use		= 1,
216 		.obsolete	= -1,
217 		.error		= -EINVAL,
218 		.input		= dst_discard,
219 		.output		= dst_discard,
220 	},
221 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
222 	.rt6i_protocol  = RTPROT_KERNEL,
223 	.rt6i_metric	= ~(u32) 0,
224 	.rt6i_ref	= ATOMIC_INIT(1),
225 };
226 
227 #endif
228 
229 /* allocate dst with ip6_dst_ops */
230 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
231 					     struct net_device *dev)
232 {
233 	struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, 0);
234 
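	/* dst_alloc() initialized only the embedded dst_entry; zero the
	 * remaining rt6_info fields, which the size calculation assumes
	 * begin at rt6i_table.
	 */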
235 	if (rt != NULL)
		memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
236 
237 	return rt;
238 }
239 
240 static void ip6_dst_destroy(struct dst_entry *dst)
241 {
242 	struct rt6_info *rt = (struct rt6_info *)dst;
243 	struct inet6_dev *idev = rt->rt6i_idev;
244 	struct inet_peer *peer = rt->rt6i_peer;
245 
246 	if (idev != NULL) {
247 		rt->rt6i_idev = NULL;
248 		in6_dev_put(idev);
249 	}
250 	if (peer) {
251 		rt->rt6i_peer = NULL;
252 		inet_putpeer(peer);
253 	}
254 }
255 
256 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
257 
258 static u32 rt6_peer_genid(void)
259 {
260 	return atomic_read(&__rt6_peer_genid);
261 }
262 
263 void rt6_bind_peer(struct rt6_info *rt, int create)
264 {
265 	struct inet_peer *peer;
266 
267 	peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
268 	if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
269 		inet_putpeer(peer);
270 	else
271 		rt->rt6i_peer_genid = rt6_peer_genid();
272 }
273 
274 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
275 			   int how)
276 {
277 	struct rt6_info *rt = (struct rt6_info *)dst;
278 	struct inet6_dev *idev = rt->rt6i_idev;
279 	struct net_device *loopback_dev =
280 		dev_net(dev)->loopback_dev;
281 
282 	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
283 		struct inet6_dev *loopback_idev =
284 			in6_dev_get(loopback_dev);
285 		if (loopback_idev != NULL) {
286 			rt->rt6i_idev = loopback_idev;
287 			in6_dev_put(idev);
288 		}
289 	}
290 }
291 
292 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
293 {
294 	return (rt->rt6i_flags & RTF_EXPIRES) &&
295 		time_after(jiffies, rt->rt6i_expires);
296 }
297 
298 static inline int rt6_need_strict(const struct in6_addr *daddr)
299 {
300 	return ipv6_addr_type(daddr) &
301 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
302 }
303 
304 /*
305  *	Route lookup. The relevant table->tb6_lock is assumed to be held.
306  */
307 
308 static inline struct rt6_info *rt6_device_match(struct net *net,
309 						    struct rt6_info *rt,
310 						    const struct in6_addr *saddr,
311 						    int oif,
312 						    int flags)
313 {
314 	struct rt6_info *local = NULL;
315 	struct rt6_info *sprt;
316 
317 	if (!oif && ipv6_addr_any(saddr))
318 		goto out;
319 
320 	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
321 		struct net_device *dev = sprt->rt6i_dev;
322 
323 		if (oif) {
324 			if (dev->ifindex == oif)
325 				return sprt;
326 			if (dev->flags & IFF_LOOPBACK) {
327 				if (sprt->rt6i_idev == NULL ||
328 				    sprt->rt6i_idev->dev->ifindex != oif) {
329 					if (flags & RT6_LOOKUP_F_IFACE && oif)
330 						continue;
331 					if (local && (!oif ||
332 						      local->rt6i_idev->dev->ifindex == oif))
333 						continue;
334 				}
335 				local = sprt;
336 			}
337 		} else {
338 			if (ipv6_chk_addr(net, saddr, dev,
339 					  flags & RT6_LOOKUP_F_IFACE))
340 				return sprt;
341 		}
342 	}
343 
344 	if (oif) {
345 		if (local)
346 			return local;
347 
348 		if (flags & RT6_LOOKUP_F_IFACE)
349 			return net->ipv6.ip6_null_entry;
350 	}
351 out:
352 	return rt;
353 }
354 
355 #ifdef CONFIG_IPV6_ROUTER_PREF
356 static void rt6_probe(struct rt6_info *rt)
357 {
358 	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
359 	/*
360 	 * Okay, this does not seem to be appropriate
361 	 * for now, however, we need to check if it
362 	 * is really so; aka Router Reachability Probing.
363 	 *
364 	 * Router Reachability Probe MUST be rate-limited
365 	 * to no more than one per minute.
366 	 */
367 	if (!neigh || (neigh->nud_state & NUD_VALID))
368 		return;
369 	read_lock_bh(&neigh->lock);
370 	if (!(neigh->nud_state & NUD_VALID) &&
371 	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
372 		struct in6_addr mcaddr;
373 		struct in6_addr *target;
374 
375 		neigh->updated = jiffies;
376 		read_unlock_bh(&neigh->lock);
377 
378 		target = (struct in6_addr *)&neigh->primary_key;
379 		addrconf_addr_solict_mult(target, &mcaddr);
380 		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
381 	} else
382 		read_unlock_bh(&neigh->lock);
383 }
384 #else
385 static inline void rt6_probe(struct rt6_info *rt)
386 {
387 }
388 #endif
389 
390 /*
391  * Default Router Selection (RFC 2461 6.3.6)
392  */
393 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
394 {
395 	struct net_device *dev = rt->rt6i_dev;
396 	if (!oif || dev->ifindex == oif)
397 		return 2;
398 	if ((dev->flags & IFF_LOOPBACK) &&
399 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
400 		return 1;
401 	return 0;
402 }
403 
404 static inline int rt6_check_neigh(struct rt6_info *rt)
405 {
406 	struct neighbour *neigh = rt->rt6i_nexthop;
407 	int m;
408 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
409 	    !(rt->rt6i_flags & RTF_GATEWAY))
410 		m = 1;
411 	else if (neigh) {
412 		read_lock_bh(&neigh->lock);
413 		if (neigh->nud_state & NUD_VALID)
414 			m = 2;
415 #ifdef CONFIG_IPV6_ROUTER_PREF
416 		else if (neigh->nud_state & NUD_FAILED)
417 			m = 0;
418 #endif
419 		else
420 			m = 1;
421 		read_unlock_bh(&neigh->lock);
422 	} else
423 		m = 0;
424 	return m;
425 }
426 
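/*
 * Combine the per-route criteria into a single score: bits 0-1 encode how
 * well the outgoing device matches (rt6_check_dev), the higher bits encode
 * the RFC 4191 router preference, and the neighbour reachability check is
 * used as a gate rather than as part of the score.  A negative return
 * means the route must not be used under the given strictness flags.
 */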
427 static int rt6_score_route(struct rt6_info *rt, int oif,
428 			   int strict)
429 {
430 	int m, n;
431 
432 	m = rt6_check_dev(rt, oif);
433 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
434 		return -1;
435 #ifdef CONFIG_IPV6_ROUTER_PREF
436 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
437 #endif
438 	n = rt6_check_neigh(rt);
439 	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
440 		return -1;
441 	return m;
442 }
443 
444 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
445 				   int *mpri, struct rt6_info *match)
446 {
447 	int m;
448 
449 	if (rt6_check_expired(rt))
450 		goto out;
451 
452 	m = rt6_score_route(rt, oif, strict);
453 	if (m < 0)
454 		goto out;
455 
456 	if (m > *mpri) {
457 		if (strict & RT6_LOOKUP_F_REACHABLE)
458 			rt6_probe(match);
459 		*mpri = m;
460 		match = rt;
461 	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
462 		rt6_probe(rt);
463 	}
464 
465 out:
466 	return match;
467 }
468 
469 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
470 				     struct rt6_info *rr_head,
471 				     u32 metric, int oif, int strict)
472 {
473 	struct rt6_info *rt, *match;
474 	int mpri = -1;
475 
476 	match = NULL;
477 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
478 	     rt = rt->dst.rt6_next)
479 		match = find_match(rt, oif, strict, &mpri, match);
480 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
481 	     rt = rt->dst.rt6_next)
482 		match = find_match(rt, oif, strict, &mpri, match);
483 
484 	return match;
485 }
486 
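/*
 * Default Router Selection: walk all routes sharing rr_head's metric
 * (starting at the current round-robin pointer and wrapping back to the
 * leaf) and keep the best-scoring one.  If nothing scored as reachable
 * while RT6_LOOKUP_F_REACHABLE was requested, advance fn->rr_ptr so the
 * next lookup starts from a different router, per RFC 2461 6.3.6.
 */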
487 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
488 {
489 	struct rt6_info *match, *rt0;
490 	struct net *net;
491 
492 	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
493 		  __func__, fn->leaf, oif);
494 
495 	rt0 = fn->rr_ptr;
496 	if (!rt0)
497 		fn->rr_ptr = rt0 = fn->leaf;
498 
499 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
500 
501 	if (!match &&
502 	    (strict & RT6_LOOKUP_F_REACHABLE)) {
503 		struct rt6_info *next = rt0->dst.rt6_next;
504 
505 		/* no entries matched; do round-robin */
506 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
507 			next = fn->leaf;
508 
509 		if (next != rt0)
510 			fn->rr_ptr = next;
511 	}
512 
513 	RT6_TRACE("%s() => %p\n",
514 		  __func__, match);
515 
516 	net = dev_net(rt0->rt6i_dev);
517 	return match ? match : net->ipv6.ip6_null_entry;
518 }
519 
520 #ifdef CONFIG_IPV6_ROUTE_INFO
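/*
 * Process a Route Information option received in a Router Advertisement
 * (RFC 4191): validate the option length against the advertised prefix
 * length, then add, refresh or delete the corresponding RTF_ROUTEINFO
 * route according to the advertised lifetime and preference.
 */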
521 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
522 		  const struct in6_addr *gwaddr)
523 {
524 	struct net *net = dev_net(dev);
525 	struct route_info *rinfo = (struct route_info *) opt;
526 	struct in6_addr prefix_buf, *prefix;
527 	unsigned int pref;
528 	unsigned long lifetime;
529 	struct rt6_info *rt;
530 
531 	if (len < sizeof(struct route_info)) {
532 		return -EINVAL;
533 	}
534 
535 	/* Sanity check for prefix_len and length */
536 	if (rinfo->length > 3) {
537 		return -EINVAL;
538 	} else if (rinfo->prefix_len > 128) {
539 		return -EINVAL;
540 	} else if (rinfo->prefix_len > 64) {
541 		if (rinfo->length < 2) {
542 			return -EINVAL;
543 		}
544 	} else if (rinfo->prefix_len > 0) {
545 		if (rinfo->length < 1) {
546 			return -EINVAL;
547 		}
548 	}
549 
550 	pref = rinfo->route_pref;
551 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
552 		return -EINVAL;
553 
554 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
555 
556 	if (rinfo->length == 3)
557 		prefix = (struct in6_addr *)rinfo->prefix;
558 	else {
559 		/* this function is safe */
560 		ipv6_addr_prefix(&prefix_buf,
561 				 (struct in6_addr *)rinfo->prefix,
562 				 rinfo->prefix_len);
563 		prefix = &prefix_buf;
564 	}
565 
566 	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
567 				dev->ifindex);
568 
569 	if (rt && !lifetime) {
570 		ip6_del_rt(rt);
571 		rt = NULL;
572 	}
573 
574 	if (!rt && lifetime)
575 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
576 					pref);
577 	else if (rt)
578 		rt->rt6i_flags = RTF_ROUTEINFO |
579 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
580 
581 	if (rt) {
582 		if (!addrconf_finite_timeout(lifetime)) {
583 			rt->rt6i_flags &= ~RTF_EXPIRES;
584 		} else {
585 			rt->rt6i_expires = jiffies + HZ * lifetime;
586 			rt->rt6i_flags |= RTF_EXPIRES;
587 		}
588 		dst_release(&rt->dst);
589 	}
590 	return 0;
591 }
592 #endif
593 
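/*
 * BACKTRACK is shared by the lookup functions below.  When the current
 * match is the null entry, walk back up the fib6 tree: descend into a
 * parent's source-address subtree when one exists, otherwise move to the
 * parent itself, and jump back to the caller's "restart" label as soon as
 * a node carrying routes (RTN_RTINFO) is found; give up at the tree root.
 */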
594 #define BACKTRACK(__net, saddr)			\
595 do { \
596 	if (rt == __net->ipv6.ip6_null_entry) {	\
597 		struct fib6_node *pn; \
598 		while (1) { \
599 			if (fn->fn_flags & RTN_TL_ROOT) \
600 				goto out; \
601 			pn = fn->parent; \
602 			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
603 				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
604 			else \
605 				fn = pn; \
606 			if (fn->fn_flags & RTN_RTINFO) \
607 				goto restart; \
608 		} \
609 	} \
610 } while(0)
611 
612 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
613 					     struct fib6_table *table,
614 					     struct flowi6 *fl6, int flags)
615 {
616 	struct fib6_node *fn;
617 	struct rt6_info *rt;
618 
619 	read_lock_bh(&table->tb6_lock);
620 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
621 restart:
622 	rt = fn->leaf;
623 	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
624 	BACKTRACK(net, &fl6->saddr);
625 out:
626 	dst_use(&rt->dst, jiffies);
627 	read_unlock_bh(&table->tb6_lock);
628 	return rt;
629 
630 }
631 
632 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
633 			    const struct in6_addr *saddr, int oif, int strict)
634 {
635 	struct flowi6 fl6 = {
636 		.flowi6_oif = oif,
637 		.daddr = *daddr,
638 	};
639 	struct dst_entry *dst;
640 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
641 
642 	if (saddr) {
643 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
644 		flags |= RT6_LOOKUP_F_HAS_SADDR;
645 	}
646 
647 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
648 	if (dst->error == 0)
649 		return (struct rt6_info *) dst;
650 
651 	dst_release(dst);
652 
653 	return NULL;
654 }
655 
656 EXPORT_SYMBOL(rt6_lookup);
657 
658 /* ip6_ins_rt is called WITHOUT holding table->tb6_lock.
659    It takes a new route entry; if the addition fails for any reason the
660    route is freed. In any case, if the caller does not hold a reference,
661    the route may be destroyed.
662  */
663 
664 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
665 {
666 	int err;
667 	struct fib6_table *table;
668 
669 	table = rt->rt6i_table;
670 	write_lock_bh(&table->tb6_lock);
671 	err = fib6_add(&table->tb6_root, rt, info);
672 	write_unlock_bh(&table->tb6_lock);
673 
674 	return err;
675 }
676 
677 int ip6_ins_rt(struct rt6_info *rt)
678 {
679 	struct nl_info info = {
680 		.nl_net = dev_net(rt->rt6i_dev),
681 	};
682 	return __ip6_ins_rt(rt, &info);
683 }
684 
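/*
 * Copy-on-write a non-gateway route into a host (/128) RTF_CACHE clone
 * for this destination and bind it to a neighbour entry.  If the
 * neighbour lookup fails because the neighbour table is full, garbage
 * collect the dst cache with temporarily relaxed gc parameters and retry
 * once (only when not called from softirq context).
 */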
685 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, const struct in6_addr *daddr,
686 				      const struct in6_addr *saddr)
687 {
688 	struct rt6_info *rt;
689 
690 	/*
691 	 *	Clone the route.
692 	 */
693 
694 	rt = ip6_rt_copy(ort);
695 
696 	if (rt) {
697 		struct neighbour *neigh;
698 		int attempts = !in_softirq();
699 
700 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
701 			if (rt->rt6i_dst.plen != 128 &&
702 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
703 				rt->rt6i_flags |= RTF_ANYCAST;
704 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
705 		}
706 
707 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
708 		rt->rt6i_dst.plen = 128;
709 		rt->rt6i_flags |= RTF_CACHE;
710 		rt->dst.flags |= DST_HOST;
711 
712 #ifdef CONFIG_IPV6_SUBTREES
713 		if (rt->rt6i_src.plen && saddr) {
714 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
715 			rt->rt6i_src.plen = 128;
716 		}
717 #endif
718 
719 	retry:
720 		neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
721 		if (IS_ERR(neigh)) {
722 			struct net *net = dev_net(rt->rt6i_dev);
723 			int saved_rt_min_interval =
724 				net->ipv6.sysctl.ip6_rt_gc_min_interval;
725 			int saved_rt_elasticity =
726 				net->ipv6.sysctl.ip6_rt_gc_elasticity;
727 
728 			if (attempts-- > 0) {
729 				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
730 				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
731 
732 				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
733 
734 				net->ipv6.sysctl.ip6_rt_gc_elasticity =
735 					saved_rt_elasticity;
736 				net->ipv6.sysctl.ip6_rt_gc_min_interval =
737 					saved_rt_min_interval;
738 				goto retry;
739 			}
740 
741 			if (net_ratelimit())
742 				printk(KERN_WARNING
743 				       "ipv6: Neighbour table overflow.\n");
744 			dst_free(&rt->dst);
745 			return NULL;
746 		}
747 		rt->rt6i_nexthop = neigh;
748 
749 	}
750 
751 	return rt;
752 }
753 
754 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, const struct in6_addr *daddr)
755 {
756 	struct rt6_info *rt = ip6_rt_copy(ort);
757 	if (rt) {
758 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
759 		rt->rt6i_dst.plen = 128;
760 		rt->rt6i_flags |= RTF_CACHE;
761 		rt->dst.flags |= DST_HOST;
762 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
763 	}
764 	return rt;
765 }
766 
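/*
 * Core resolver shared by the input and output paths: look up the FIB,
 * pick the best router with rt6_select() (retrying without the
 * reachability requirement if the first, stricter pass found nothing),
 * and unless the result is already a cached host route, clone it into an
 * RTF_CACHE entry via rt6_alloc_cow()/rt6_alloc_clone() and insert it.
 * If the insert races with another CPU, drop the clone and start over
 * (up to three attempts).
 */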
767 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
768 				      struct flowi6 *fl6, int flags)
769 {
770 	struct fib6_node *fn;
771 	struct rt6_info *rt, *nrt;
772 	int strict = 0;
773 	int attempts = 3;
774 	int err;
775 	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
776 
777 	strict |= flags & RT6_LOOKUP_F_IFACE;
778 
779 relookup:
780 	read_lock_bh(&table->tb6_lock);
781 
782 restart_2:
783 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
784 
785 restart:
786 	rt = rt6_select(fn, oif, strict | reachable);
787 
788 	BACKTRACK(net, &fl6->saddr);
789 	if (rt == net->ipv6.ip6_null_entry ||
790 	    rt->rt6i_flags & RTF_CACHE)
791 		goto out;
792 
793 	dst_hold(&rt->dst);
794 	read_unlock_bh(&table->tb6_lock);
795 
796 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
797 		nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
798 	else if (!(rt->dst.flags & DST_HOST))
799 		nrt = rt6_alloc_clone(rt, &fl6->daddr);
800 	else
801 		goto out2;
802 
803 	dst_release(&rt->dst);
804 	rt = nrt ? : net->ipv6.ip6_null_entry;
805 
806 	dst_hold(&rt->dst);
807 	if (nrt) {
808 		err = ip6_ins_rt(nrt);
809 		if (!err)
810 			goto out2;
811 	}
812 
813 	if (--attempts <= 0)
814 		goto out2;
815 
816 	/*
817 	 * Race condition! In the gap, when table->tb6_lock was
818 	 * released someone could insert this route.  Relookup.
819 	 */
820 	dst_release(&rt->dst);
821 	goto relookup;
822 
823 out:
824 	if (reachable) {
825 		reachable = 0;
826 		goto restart_2;
827 	}
828 	dst_hold(&rt->dst);
829 	read_unlock_bh(&table->tb6_lock);
830 out2:
831 	rt->dst.lastuse = jiffies;
832 	rt->dst.__use++;
833 
834 	return rt;
835 }
836 
837 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
838 					    struct flowi6 *fl6, int flags)
839 {
840 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
841 }
842 
843 void ip6_route_input(struct sk_buff *skb)
844 {
845 	const struct ipv6hdr *iph = ipv6_hdr(skb);
846 	struct net *net = dev_net(skb->dev);
847 	int flags = RT6_LOOKUP_F_HAS_SADDR;
848 	struct flowi6 fl6 = {
849 		.flowi6_iif = skb->dev->ifindex,
850 		.daddr = iph->daddr,
851 		.saddr = iph->saddr,
852 		.flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
853 		.flowi6_mark = skb->mark,
854 		.flowi6_proto = iph->nexthdr,
855 	};
856 
857 	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
858 		flags |= RT6_LOOKUP_F_IFACE;
859 
860 	skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
861 }
862 
863 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
864 					     struct flowi6 *fl6, int flags)
865 {
866 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
867 }
868 
869 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
870 				    struct flowi6 *fl6)
871 {
872 	int flags = 0;
873 
874 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
875 		flags |= RT6_LOOKUP_F_IFACE;
876 
877 	if (!ipv6_addr_any(&fl6->saddr))
878 		flags |= RT6_LOOKUP_F_HAS_SADDR;
879 	else if (sk)
880 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
881 
882 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
883 }
884 
885 EXPORT_SYMBOL(ip6_route_output);
886 
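/*
 * Produce a standalone copy of a route whose input and output handlers
 * simply discard packets.  The copy keeps the addressing information of
 * the original but is never inserted into a table; callers (for instance
 * the XFRM lookup path) use it when a flow must be resolved without ever
 * transmitting on the result.
 */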
887 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
888 {
889 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
890 	struct dst_entry *new = NULL;
891 
892 	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
893 	if (rt) {
894 		memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
895 
896 		new = &rt->dst;
897 
898 		new->__use = 1;
899 		new->input = dst_discard;
900 		new->output = dst_discard;
901 
902 		dst_copy_metrics(new, &ort->dst);
903 		rt->rt6i_idev = ort->rt6i_idev;
904 		if (rt->rt6i_idev)
905 			in6_dev_hold(rt->rt6i_idev);
906 		rt->rt6i_expires = 0;
907 
908 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
909 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
910 		rt->rt6i_metric = 0;
911 
912 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
913 #ifdef CONFIG_IPV6_SUBTREES
914 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
915 #endif
916 
917 		dst_free(new);
918 	}
919 
920 	dst_release(dst_orig);
921 	return new ? new : ERR_PTR(-ENOMEM);
922 }
923 
924 /*
925  *	Destination cache support functions
926  */
927 
928 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
929 {
930 	struct rt6_info *rt;
931 
932 	rt = (struct rt6_info *) dst;
933 
934 	if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
935 		if (rt->rt6i_peer_genid != rt6_peer_genid()) {
936 			if (!rt->rt6i_peer)
937 				rt6_bind_peer(rt, 0);
938 			rt->rt6i_peer_genid = rt6_peer_genid();
939 		}
940 		return dst;
941 	}
942 	return NULL;
943 }
944 
945 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
946 {
947 	struct rt6_info *rt = (struct rt6_info *) dst;
948 
949 	if (rt) {
950 		if (rt->rt6i_flags & RTF_CACHE) {
951 			if (rt6_check_expired(rt)) {
952 				ip6_del_rt(rt);
953 				dst = NULL;
954 			}
955 		} else {
956 			dst_release(dst);
957 			dst = NULL;
958 		}
959 	}
960 	return dst;
961 }
962 
963 static void ip6_link_failure(struct sk_buff *skb)
964 {
965 	struct rt6_info *rt;
966 
967 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
968 
969 	rt = (struct rt6_info *) skb_dst(skb);
970 	if (rt) {
971 		if (rt->rt6i_flags&RTF_CACHE) {
972 			dst_set_expires(&rt->dst, 0);
973 			rt->rt6i_flags |= RTF_EXPIRES;
974 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
975 			rt->rt6i_node->fn_sernum = -1;
976 	}
977 }
978 
979 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
980 {
981 	struct rt6_info *rt6 = (struct rt6_info*)dst;
982 
983 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
984 		rt6->rt6i_flags |= RTF_MODIFIED;
985 		if (mtu < IPV6_MIN_MTU) {
986 			u32 features = dst_metric(dst, RTAX_FEATURES);
987 			mtu = IPV6_MIN_MTU;
988 			features |= RTAX_FEATURE_ALLFRAG;
989 			dst_metric_set(dst, RTAX_FEATURES, features);
990 		}
991 		dst_metric_set(dst, RTAX_MTU, mtu);
992 	}
993 }
994 
995 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
996 {
997 	struct net_device *dev = dst->dev;
998 	unsigned int mtu = dst_mtu(dst);
999 	struct net *net = dev_net(dev);
1000 
1001 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1002 
1003 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1004 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1005 
1006 	/*
1007 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1008 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1009 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
1010 	 * rely only on pmtu discovery"
1011 	 */
1012 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1013 		mtu = IPV6_MAXPLEN;
1014 	return mtu;
1015 }
1016 
1017 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1018 {
1019 	unsigned int mtu = IPV6_MIN_MTU;
1020 	struct inet6_dev *idev;
1021 
1022 	rcu_read_lock();
1023 	idev = __in6_dev_get(dst->dev);
1024 	if (idev)
1025 		mtu = idev->cnf.mtu6;
1026 	rcu_read_unlock();
1027 
1028 	return mtu;
1029 }
1030 
1031 static struct dst_entry *icmp6_dst_gc_list;
1032 static DEFINE_SPINLOCK(icmp6_dst_lock);
1033 
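/*
 * dst entries used for ndisc/ICMPv6 replies are not inserted into the
 * FIB; instead they are strung onto icmp6_dst_gc_list and reclaimed by
 * icmp6_dst_gc() once their reference count drops to zero.
 */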
1034 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1035 				  struct neighbour *neigh,
1036 				  const struct in6_addr *addr)
1037 {
1038 	struct rt6_info *rt;
1039 	struct inet6_dev *idev = in6_dev_get(dev);
1040 	struct net *net = dev_net(dev);
1041 
1042 	if (unlikely(idev == NULL))
1043 		return NULL;
1044 
1045 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev);
1046 	if (unlikely(rt == NULL)) {
1047 		in6_dev_put(idev);
1048 		goto out;
1049 	}
1050 
1051 	if (neigh)
1052 		neigh_hold(neigh);
1053 	else {
1054 		neigh = ndisc_get_neigh(dev, addr);
1055 		if (IS_ERR(neigh))
1056 			neigh = NULL;
1057 	}
1058 
1059 	rt->rt6i_idev     = idev;
1060 	rt->rt6i_nexthop  = neigh;
1061 	atomic_set(&rt->dst.__refcnt, 1);
1062 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1063 	rt->dst.output  = ip6_output;
1064 
1065 #if 0	/* there's no chance to use these for ndisc */
1066 	rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1067 				? DST_HOST
1068 				: 0;
1069 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1070 	rt->rt6i_dst.plen = 128;
1071 #endif
1072 
1073 	spin_lock_bh(&icmp6_dst_lock);
1074 	rt->dst.next = icmp6_dst_gc_list;
1075 	icmp6_dst_gc_list = &rt->dst;
1076 	spin_unlock_bh(&icmp6_dst_lock);
1077 
1078 	fib6_force_start_gc(net);
1079 
1080 out:
1081 	return &rt->dst;
1082 }
1083 
1084 int icmp6_dst_gc(void)
1085 {
1086 	struct dst_entry *dst, **pprev;
1087 	int more = 0;
1088 
1089 	spin_lock_bh(&icmp6_dst_lock);
1090 	pprev = &icmp6_dst_gc_list;
1091 
1092 	while ((dst = *pprev) != NULL) {
1093 		if (!atomic_read(&dst->__refcnt)) {
1094 			*pprev = dst->next;
1095 			dst_free(dst);
1096 		} else {
1097 			pprev = &dst->next;
1098 			++more;
1099 		}
1100 	}
1101 
1102 	spin_unlock_bh(&icmp6_dst_lock);
1103 
1104 	return more;
1105 }
1106 
1107 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1108 			    void *arg)
1109 {
1110 	struct dst_entry *dst, **pprev;
1111 
1112 	spin_lock_bh(&icmp6_dst_lock);
1113 	pprev = &icmp6_dst_gc_list;
1114 	while ((dst = *pprev) != NULL) {
1115 		struct rt6_info *rt = (struct rt6_info *) dst;
1116 		if (func(rt, arg)) {
1117 			*pprev = dst->next;
1118 			dst_free(dst);
1119 		} else {
1120 			pprev = &dst->next;
1121 		}
1122 	}
1123 	spin_unlock_bh(&icmp6_dst_lock);
1124 }
1125 
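/*
 * dst cache garbage collection: skip work entirely if the last run was
 * less than ip6_rt_gc_min_interval ago and the cache is still below
 * ip6_rt_max_size.  Otherwise run fib6_run_gc() with an adaptive expiry
 * (ip6_rt_gc_expire) that is bumped on every forced run, reset to half of
 * ip6_rt_gc_timeout once the cache falls back below gc_thresh, and
 * decayed by a 1/2^elasticity fraction on each call.
 */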
1126 static int ip6_dst_gc(struct dst_ops *ops)
1127 {
1128 	unsigned long now = jiffies;
1129 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1130 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1131 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1132 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1133 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1134 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1135 	int entries;
1136 
1137 	entries = dst_entries_get_fast(ops);
1138 	if (time_after(rt_last_gc + rt_min_interval, now) &&
1139 	    entries <= rt_max_size)
1140 		goto out;
1141 
1142 	net->ipv6.ip6_rt_gc_expire++;
1143 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1144 	net->ipv6.ip6_rt_last_gc = now;
1145 	entries = dst_entries_get_slow(ops);
1146 	if (entries < ops->gc_thresh)
1147 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1148 out:
1149 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1150 	return entries > rt_max_size;
1151 }
1152 
1153 /* Clean host part of a prefix. Not necessary in radix tree,
1154    but results in cleaner routing tables.
1155 
1156    Remove it only when everything works!
1157  */
1158 
1159 int ip6_dst_hoplimit(struct dst_entry *dst)
1160 {
1161 	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1162 	if (hoplimit == 0) {
1163 		struct net_device *dev = dst->dev;
1164 		struct inet6_dev *idev;
1165 
1166 		rcu_read_lock();
1167 		idev = __in6_dev_get(dev);
1168 		if (idev)
1169 			hoplimit = idev->cnf.hop_limit;
1170 		else
1171 			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1172 		rcu_read_unlock();
1173 	}
1174 	return hoplimit;
1175 }
1176 EXPORT_SYMBOL(ip6_dst_hoplimit);
1177 
1178 /*
1179  *
1180  */
1181 
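/*
 * Install a route described by a fib6_config: validate the prefix
 * lengths, resolve the output device (possibly via a gateway lookup),
 * promote routes via loopback to reject routes, apply any
 * netlink-supplied metrics and finally insert the entry into the
 * selected table with __ip6_ins_rt().
 */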
1182 int ip6_route_add(struct fib6_config *cfg)
1183 {
1184 	int err;
1185 	struct net *net = cfg->fc_nlinfo.nl_net;
1186 	struct rt6_info *rt = NULL;
1187 	struct net_device *dev = NULL;
1188 	struct inet6_dev *idev = NULL;
1189 	struct fib6_table *table;
1190 	int addr_type;
1191 
1192 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1193 		return -EINVAL;
1194 #ifndef CONFIG_IPV6_SUBTREES
1195 	if (cfg->fc_src_len)
1196 		return -EINVAL;
1197 #endif
1198 	if (cfg->fc_ifindex) {
1199 		err = -ENODEV;
1200 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1201 		if (!dev)
1202 			goto out;
1203 		idev = in6_dev_get(dev);
1204 		if (!idev)
1205 			goto out;
1206 	}
1207 
1208 	if (cfg->fc_metric == 0)
1209 		cfg->fc_metric = IP6_RT_PRIO_USER;
1210 
1211 	table = fib6_new_table(net, cfg->fc_table);
1212 	if (table == NULL) {
1213 		err = -ENOBUFS;
1214 		goto out;
1215 	}
1216 
1217 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL);
1218 
1219 	if (rt == NULL) {
1220 		err = -ENOMEM;
1221 		goto out;
1222 	}
1223 
1224 	rt->dst.obsolete = -1;
1225 	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1226 				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1227 				0;
1228 
1229 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1230 		cfg->fc_protocol = RTPROT_BOOT;
1231 	rt->rt6i_protocol = cfg->fc_protocol;
1232 
1233 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1234 
1235 	if (addr_type & IPV6_ADDR_MULTICAST)
1236 		rt->dst.input = ip6_mc_input;
1237 	else if (cfg->fc_flags & RTF_LOCAL)
1238 		rt->dst.input = ip6_input;
1239 	else
1240 		rt->dst.input = ip6_forward;
1241 
1242 	rt->dst.output = ip6_output;
1243 
1244 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1245 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1246 	if (rt->rt6i_dst.plen == 128)
1247 	       rt->dst.flags = DST_HOST;
1248 
1249 #ifdef CONFIG_IPV6_SUBTREES
1250 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1251 	rt->rt6i_src.plen = cfg->fc_src_len;
1252 #endif
1253 
1254 	rt->rt6i_metric = cfg->fc_metric;
1255 
1256 	/* We cannot add true routes via loopback here,
1257 	   they would result in kernel looping; promote them to reject routes
1258 	 */
1259 	if ((cfg->fc_flags & RTF_REJECT) ||
1260 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1261 					      && !(cfg->fc_flags&RTF_LOCAL))) {
1262 		/* hold loopback dev/idev if we haven't done so. */
1263 		if (dev != net->loopback_dev) {
1264 			if (dev) {
1265 				dev_put(dev);
1266 				in6_dev_put(idev);
1267 			}
1268 			dev = net->loopback_dev;
1269 			dev_hold(dev);
1270 			idev = in6_dev_get(dev);
1271 			if (!idev) {
1272 				err = -ENODEV;
1273 				goto out;
1274 			}
1275 		}
1276 		rt->dst.output = ip6_pkt_discard_out;
1277 		rt->dst.input = ip6_pkt_discard;
1278 		rt->dst.error = -ENETUNREACH;
1279 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1280 		goto install_route;
1281 	}
1282 
1283 	if (cfg->fc_flags & RTF_GATEWAY) {
1284 		const struct in6_addr *gw_addr;
1285 		int gwa_type;
1286 
1287 		gw_addr = &cfg->fc_gateway;
1288 		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1289 		gwa_type = ipv6_addr_type(gw_addr);
1290 
1291 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1292 			struct rt6_info *grt;
1293 
1294 			/* IPv6 strictly forbids using non-link-local
1295 			   addresses as nexthop addresses.
1296 			   Otherwise, the router will not be able to send redirects.
1297 			   That is a good thing, but in some (rare!) circumstances
1298 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1299 			   some exceptions. --ANK
1300 			 */
1301 			err = -EINVAL;
1302 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1303 				goto out;
1304 
1305 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1306 
1307 			err = -EHOSTUNREACH;
1308 			if (grt == NULL)
1309 				goto out;
1310 			if (dev) {
1311 				if (dev != grt->rt6i_dev) {
1312 					dst_release(&grt->dst);
1313 					goto out;
1314 				}
1315 			} else {
1316 				dev = grt->rt6i_dev;
1317 				idev = grt->rt6i_idev;
1318 				dev_hold(dev);
1319 				in6_dev_hold(grt->rt6i_idev);
1320 			}
1321 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1322 				err = 0;
1323 			dst_release(&grt->dst);
1324 
1325 			if (err)
1326 				goto out;
1327 		}
1328 		err = -EINVAL;
1329 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1330 			goto out;
1331 	}
1332 
1333 	err = -ENODEV;
1334 	if (dev == NULL)
1335 		goto out;
1336 
1337 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1338 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1339 			err = -EINVAL;
1340 			goto out;
1341 		}
1342 		ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1343 		rt->rt6i_prefsrc.plen = 128;
1344 	} else
1345 		rt->rt6i_prefsrc.plen = 0;
1346 
1347 	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1348 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1349 		if (IS_ERR(rt->rt6i_nexthop)) {
1350 			err = PTR_ERR(rt->rt6i_nexthop);
1351 			rt->rt6i_nexthop = NULL;
1352 			goto out;
1353 		}
1354 	}
1355 
1356 	rt->rt6i_flags = cfg->fc_flags;
1357 
1358 install_route:
1359 	if (cfg->fc_mx) {
1360 		struct nlattr *nla;
1361 		int remaining;
1362 
1363 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1364 			int type = nla_type(nla);
1365 
1366 			if (type) {
1367 				if (type > RTAX_MAX) {
1368 					err = -EINVAL;
1369 					goto out;
1370 				}
1371 
1372 				dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1373 			}
1374 		}
1375 	}
1376 
1377 	rt->dst.dev = dev;
1378 	rt->rt6i_idev = idev;
1379 	rt->rt6i_table = table;
1380 
1381 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1382 
1383 	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1384 
1385 out:
1386 	if (dev)
1387 		dev_put(dev);
1388 	if (idev)
1389 		in6_dev_put(idev);
1390 	if (rt)
1391 		dst_free(&rt->dst);
1392 	return err;
1393 }
1394 
1395 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1396 {
1397 	int err;
1398 	struct fib6_table *table;
1399 	struct net *net = dev_net(rt->rt6i_dev);
1400 
1401 	if (rt == net->ipv6.ip6_null_entry)
1402 		return -ENOENT;
1403 
1404 	table = rt->rt6i_table;
1405 	write_lock_bh(&table->tb6_lock);
1406 
1407 	err = fib6_del(rt, info);
1408 	dst_release(&rt->dst);
1409 
1410 	write_unlock_bh(&table->tb6_lock);
1411 
1412 	return err;
1413 }
1414 
1415 int ip6_del_rt(struct rt6_info *rt)
1416 {
1417 	struct nl_info info = {
1418 		.nl_net = dev_net(rt->rt6i_dev),
1419 	};
1420 	return __ip6_del_rt(rt, &info);
1421 }
1422 
1423 static int ip6_route_del(struct fib6_config *cfg)
1424 {
1425 	struct fib6_table *table;
1426 	struct fib6_node *fn;
1427 	struct rt6_info *rt;
1428 	int err = -ESRCH;
1429 
1430 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1431 	if (table == NULL)
1432 		return err;
1433 
1434 	read_lock_bh(&table->tb6_lock);
1435 
1436 	fn = fib6_locate(&table->tb6_root,
1437 			 &cfg->fc_dst, cfg->fc_dst_len,
1438 			 &cfg->fc_src, cfg->fc_src_len);
1439 
1440 	if (fn) {
1441 		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1442 			if (cfg->fc_ifindex &&
1443 			    (rt->rt6i_dev == NULL ||
1444 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1445 				continue;
1446 			if (cfg->fc_flags & RTF_GATEWAY &&
1447 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1448 				continue;
1449 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1450 				continue;
1451 			dst_hold(&rt->dst);
1452 			read_unlock_bh(&table->tb6_lock);
1453 
1454 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1455 		}
1456 	}
1457 	read_unlock_bh(&table->tb6_lock);
1458 
1459 	return err;
1460 }
1461 
1462 /*
1463  *	Handle redirects
1464  */
1465 struct ip6rd_flowi {
1466 	struct flowi6 fl6;
1467 	struct in6_addr gateway;
1468 };
1469 
1470 static struct rt6_info *__ip6_route_redirect(struct net *net,
1471 					     struct fib6_table *table,
1472 					     struct flowi6 *fl6,
1473 					     int flags)
1474 {
1475 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1476 	struct rt6_info *rt;
1477 	struct fib6_node *fn;
1478 
1479 	/*
1480 	 * Get the "current" route for this destination and
1481 	 * check if the redirect has come from an appropriate router.
1482 	 *
1483 	 * RFC 2461 specifies that redirects should only be
1484 	 * accepted if they come from the nexthop to the target.
1485 	 * Due to the way the routes are chosen, this notion
1486 	 * is a bit fuzzy and one might need to check all possible
1487 	 * routes.
1488 	 */
1489 
1490 	read_lock_bh(&table->tb6_lock);
1491 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1492 restart:
1493 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1494 		/*
1495 		 * Current route is on-link; redirect is always invalid.
1496 		 *
1497 		 * It seems the previous statement is not quite true: the destination
1498 		 * could be a node that appears on-link to us (e.g. due to proxy ndisc),
1499 		 * but the router serving it might then decide that we should
1500 		 * know the truth 8)8) --ANK (980726).
1501 		 */
1502 		if (rt6_check_expired(rt))
1503 			continue;
1504 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1505 			continue;
1506 		if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1507 			continue;
1508 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1509 			continue;
1510 		break;
1511 	}
1512 
1513 	if (!rt)
1514 		rt = net->ipv6.ip6_null_entry;
1515 	BACKTRACK(net, &fl6->saddr);
1516 out:
1517 	dst_hold(&rt->dst);
1518 
1519 	read_unlock_bh(&table->tb6_lock);
1520 
1521 	return rt;
1522 };
1523 
1524 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1525 					   const struct in6_addr *src,
1526 					   const struct in6_addr *gateway,
1527 					   struct net_device *dev)
1528 {
1529 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1530 	struct net *net = dev_net(dev);
1531 	struct ip6rd_flowi rdfl = {
1532 		.fl6 = {
1533 			.flowi6_oif = dev->ifindex,
1534 			.daddr = *dest,
1535 			.saddr = *src,
1536 		},
1537 	};
1538 
1539 	ipv6_addr_copy(&rdfl.gateway, gateway);
1540 
1541 	if (rt6_need_strict(dest))
1542 		flags |= RT6_LOOKUP_F_IFACE;
1543 
1544 	return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1545 						   flags, __ip6_route_redirect);
1546 }
1547 
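/*
 * Accept an ND redirect that passed __ip6_route_redirect(): update the
 * neighbour cache entry for the new first hop, clone the current route
 * into a dynamic RTF_CACHE host route pointing at that neighbour, notify
 * netevent listeners and, if the old route was itself a cache entry,
 * remove it.
 */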
1548 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1549 		  const struct in6_addr *saddr,
1550 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1551 {
1552 	struct rt6_info *rt, *nrt = NULL;
1553 	struct netevent_redirect netevent;
1554 	struct net *net = dev_net(neigh->dev);
1555 
1556 	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1557 
1558 	if (rt == net->ipv6.ip6_null_entry) {
1559 		if (net_ratelimit())
1560 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1561 			       "for redirect target\n");
1562 		goto out;
1563 	}
1564 
1565 	/*
1566 	 *	We have finally decided to accept it.
1567 	 */
1568 
1569 	neigh_update(neigh, lladdr, NUD_STALE,
1570 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1571 		     NEIGH_UPDATE_F_OVERRIDE|
1572 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1573 				     NEIGH_UPDATE_F_ISROUTER))
1574 		     );
1575 
1576 	/*
1577 	 * Redirect received -> path was valid.
1578 	 * Redirects are sent only in response to data packets,
1579 	 * so this nexthop apparently is reachable. --ANK
1580 	 */
1581 	dst_confirm(&rt->dst);
1582 
1583 	/* Duplicate redirect: silently ignore. */
1584 	if (neigh == rt->dst.neighbour)
1585 		goto out;
1586 
1587 	nrt = ip6_rt_copy(rt);
1588 	if (nrt == NULL)
1589 		goto out;
1590 
1591 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1592 	if (on_link)
1593 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1594 
1595 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1596 	nrt->rt6i_dst.plen = 128;
1597 	nrt->dst.flags |= DST_HOST;
1598 
1599 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1600 	nrt->rt6i_nexthop = neigh_clone(neigh);
1601 
1602 	if (ip6_ins_rt(nrt))
1603 		goto out;
1604 
1605 	netevent.old = &rt->dst;
1606 	netevent.new = &nrt->dst;
1607 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1608 
1609 	if (rt->rt6i_flags&RTF_CACHE) {
1610 		ip6_del_rt(rt);
1611 		return;
1612 	}
1613 
1614 out:
1615 	dst_release(&rt->dst);
1616 }
1617 
1618 /*
1619  *	Handle ICMP "packet too big" messages
1620  *	i.e. Path MTU discovery
1621  */
1622 
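/*
 * Apply a Packet Too Big report to one interface: look up the
 * destination, clamp the reported MTU to IPV6_MIN_MTU (setting ALLFRAG
 * when clamping), and either update an existing RTF_CACHE host route in
 * place or clone a new one, in both cases letting the lowered PMTU expire
 * after ip6_rt_mtu_expires.
 */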
1623 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1624 			     struct net *net, u32 pmtu, int ifindex)
1625 {
1626 	struct rt6_info *rt, *nrt;
1627 	int allfrag = 0;
1628 again:
1629 	rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1630 	if (rt == NULL)
1631 		return;
1632 
1633 	if (rt6_check_expired(rt)) {
1634 		ip6_del_rt(rt);
1635 		goto again;
1636 	}
1637 
1638 	if (pmtu >= dst_mtu(&rt->dst))
1639 		goto out;
1640 
1641 	if (pmtu < IPV6_MIN_MTU) {
1642 		/*
1643 		 * According to RFC 2460, the PMTU is set to the IPv6 Minimum Link
1644 		 * MTU (1280) and a fragment header should always be included
1645 		 * once a node has received a Too Big message reporting a PMTU
1646 		 * less than the IPv6 Minimum Link MTU.
1647 		 */
1648 		pmtu = IPV6_MIN_MTU;
1649 		allfrag = 1;
1650 	}
1651 
1652 	/* New mtu received -> path was valid.
1653 	   Too Big messages are sent only in response to data packets,
1654 	   so this nexthop apparently is reachable. --ANK
1655 	 */
1656 	dst_confirm(&rt->dst);
1657 
1658 	/* Host route. If it is static, it would be better
1659 	   not to override it, but to add a new one, so that
1660 	   when the cache entry expires the old pmtu
1661 	   is restored automatically.
1662 	 */
1663 	if (rt->rt6i_flags & RTF_CACHE) {
1664 		dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1665 		if (allfrag) {
1666 			u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1667 			features |= RTAX_FEATURE_ALLFRAG;
1668 			dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1669 		}
1670 		dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1671 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1672 		goto out;
1673 	}
1674 
1675 	/* Network route.
1676 	   Two cases are possible:
1677 	   1. It is a connected route. Action: COW.
1678 	   2. It is a gatewayed or NONEXTHOP route. Action: clone it.
1679 	 */
1680 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1681 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1682 	else
1683 		nrt = rt6_alloc_clone(rt, daddr);
1684 
1685 	if (nrt) {
1686 		dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1687 		if (allfrag) {
1688 			u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1689 			features |= RTAX_FEATURE_ALLFRAG;
1690 			dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1691 		}
1692 
1693 		/* According to RFC 1981, detection of a PMTU increase should not
1694 		 * happen within 5 minutes; the recommended timer is 10 minutes.
1695 		 * Here the route expiration time is set to ip6_rt_mtu_expires,
1696 		 * which is 10 minutes.  After that the decreased pmtu expires
1697 		 * and a PMTU increase is detected automatically.
1698 		 */
1699 		dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1700 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1701 
1702 		ip6_ins_rt(nrt);
1703 	}
1704 out:
1705 	dst_release(&rt->dst);
1706 }
1707 
1708 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1709 			struct net_device *dev, u32 pmtu)
1710 {
1711 	struct net *net = dev_net(dev);
1712 
1713 	/*
1714 	 * RFC 1981 states that a node "MUST reduce the size of the packets it
1715 	 * is sending along the path" that caused the Packet Too Big message.
1716 	 * Since it's not possible in the general case to determine which
1717 	 * interface was used to send the original packet, we update the MTU
1718 	 * on the interface that will be used to send future packets. We also
1719 	 * update the MTU on the interface that received the Packet Too Big in
1720 	 * case the original packet was forced out that interface with
1721 	 * SO_BINDTODEVICE or similar. This is the next best thing to the
1722 	 * correct behaviour, which would be to update the MTU on all
1723 	 * interfaces.
1724 	 */
1725 	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1726 	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1727 }
1728 
1729 /*
1730  *	Misc support functions
1731  */
1732 
1733 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1734 {
1735 	struct net *net = dev_net(ort->rt6i_dev);
1736 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1737 					    ort->dst.dev);
1738 
1739 	if (rt) {
1740 		rt->dst.input = ort->dst.input;
1741 		rt->dst.output = ort->dst.output;
1742 
1743 		dst_copy_metrics(&rt->dst, &ort->dst);
1744 		rt->dst.error = ort->dst.error;
1745 		rt->rt6i_idev = ort->rt6i_idev;
1746 		if (rt->rt6i_idev)
1747 			in6_dev_hold(rt->rt6i_idev);
1748 		rt->dst.lastuse = jiffies;
1749 		rt->rt6i_expires = 0;
1750 
1751 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1752 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1753 		rt->rt6i_metric = 0;
1754 
1755 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1756 #ifdef CONFIG_IPV6_SUBTREES
1757 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1758 #endif
1759 		memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1760 		rt->rt6i_table = ort->rt6i_table;
1761 	}
1762 	return rt;
1763 }
1764 
1765 #ifdef CONFIG_IPV6_ROUTE_INFO
1766 static struct rt6_info *rt6_get_route_info(struct net *net,
1767 					   const struct in6_addr *prefix, int prefixlen,
1768 					   const struct in6_addr *gwaddr, int ifindex)
1769 {
1770 	struct fib6_node *fn;
1771 	struct rt6_info *rt = NULL;
1772 	struct fib6_table *table;
1773 
1774 	table = fib6_get_table(net, RT6_TABLE_INFO);
1775 	if (table == NULL)
1776 		return NULL;
1777 
1778 	write_lock_bh(&table->tb6_lock);
1779 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
1780 	if (!fn)
1781 		goto out;
1782 
1783 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1784 		if (rt->rt6i_dev->ifindex != ifindex)
1785 			continue;
1786 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1787 			continue;
1788 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1789 			continue;
1790 		dst_hold(&rt->dst);
1791 		break;
1792 	}
1793 out:
1794 	write_unlock_bh(&table->tb6_lock);
1795 	return rt;
1796 }
1797 
1798 static struct rt6_info *rt6_add_route_info(struct net *net,
1799 					   const struct in6_addr *prefix, int prefixlen,
1800 					   const struct in6_addr *gwaddr, int ifindex,
1801 					   unsigned pref)
1802 {
1803 	struct fib6_config cfg = {
1804 		.fc_table	= RT6_TABLE_INFO,
1805 		.fc_metric	= IP6_RT_PRIO_USER,
1806 		.fc_ifindex	= ifindex,
1807 		.fc_dst_len	= prefixlen,
1808 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1809 				  RTF_UP | RTF_PREF(pref),
1810 		.fc_nlinfo.pid = 0,
1811 		.fc_nlinfo.nlh = NULL,
1812 		.fc_nlinfo.nl_net = net,
1813 	};
1814 
1815 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1816 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1817 
1818 	/* We should treat it as a default route if prefix length is 0. */
1819 	if (!prefixlen)
1820 		cfg.fc_flags |= RTF_DEFAULT;
1821 
1822 	ip6_route_add(&cfg);
1823 
1824 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1825 }
1826 #endif
1827 
1828 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1829 {
1830 	struct rt6_info *rt;
1831 	struct fib6_table *table;
1832 
1833 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1834 	if (table == NULL)
1835 		return NULL;
1836 
1837 	write_lock_bh(&table->tb6_lock);
1838 	for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1839 		if (dev == rt->rt6i_dev &&
1840 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1841 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1842 			break;
1843 	}
1844 	if (rt)
1845 		dst_hold(&rt->dst);
1846 	write_unlock_bh(&table->tb6_lock);
1847 	return rt;
1848 }
1849 
1850 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1851 				     struct net_device *dev,
1852 				     unsigned int pref)
1853 {
1854 	struct fib6_config cfg = {
1855 		.fc_table	= RT6_TABLE_DFLT,
1856 		.fc_metric	= IP6_RT_PRIO_USER,
1857 		.fc_ifindex	= dev->ifindex,
1858 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1859 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1860 		.fc_nlinfo.pid = 0,
1861 		.fc_nlinfo.nlh = NULL,
1862 		.fc_nlinfo.nl_net = dev_net(dev),
1863 	};
1864 
1865 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1866 
1867 	ip6_route_add(&cfg);
1868 
1869 	return rt6_get_dflt_router(gwaddr, dev);
1870 }
1871 
1872 void rt6_purge_dflt_routers(struct net *net)
1873 {
1874 	struct rt6_info *rt;
1875 	struct fib6_table *table;
1876 
1877 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1878 	table = fib6_get_table(net, RT6_TABLE_DFLT);
1879 	if (table == NULL)
1880 		return;
1881 
1882 restart:
1883 	read_lock_bh(&table->tb6_lock);
1884 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1885 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1886 			dst_hold(&rt->dst);
1887 			read_unlock_bh(&table->tb6_lock);
1888 			ip6_del_rt(rt);
1889 			goto restart;
1890 		}
1891 	}
1892 	read_unlock_bh(&table->tb6_lock);
1893 }
1894 
1895 static void rtmsg_to_fib6_config(struct net *net,
1896 				 struct in6_rtmsg *rtmsg,
1897 				 struct fib6_config *cfg)
1898 {
1899 	memset(cfg, 0, sizeof(*cfg));
1900 
1901 	cfg->fc_table = RT6_TABLE_MAIN;
1902 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1903 	cfg->fc_metric = rtmsg->rtmsg_metric;
1904 	cfg->fc_expires = rtmsg->rtmsg_info;
1905 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1906 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1907 	cfg->fc_flags = rtmsg->rtmsg_flags;
1908 
1909 	cfg->fc_nlinfo.nl_net = net;
1910 
1911 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1912 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1913 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1914 }
1915 
1916 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1917 {
1918 	struct fib6_config cfg;
1919 	struct in6_rtmsg rtmsg;
1920 	int err;
1921 
1922 	switch(cmd) {
1923 	case SIOCADDRT:		/* Add a route */
1924 	case SIOCDELRT:		/* Delete a route */
1925 		if (!capable(CAP_NET_ADMIN))
1926 			return -EPERM;
1927 		err = copy_from_user(&rtmsg, arg,
1928 				     sizeof(struct in6_rtmsg));
1929 		if (err)
1930 			return -EFAULT;
1931 
1932 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1933 
1934 		rtnl_lock();
1935 		switch (cmd) {
1936 		case SIOCADDRT:
1937 			err = ip6_route_add(&cfg);
1938 			break;
1939 		case SIOCDELRT:
1940 			err = ip6_route_del(&cfg);
1941 			break;
1942 		default:
1943 			err = -EINVAL;
1944 		}
1945 		rtnl_unlock();
1946 
1947 		return err;
1948 	}
1949 
1950 	return -EINVAL;
1951 }
1952 
1953 /*
1954  *	Drop the packet on the floor
1955  */
1956 
1957 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1958 {
1959 	int type;
1960 	struct dst_entry *dst = skb_dst(skb);
1961 	switch (ipstats_mib_noroutes) {
1962 	case IPSTATS_MIB_INNOROUTES:
1963 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1964 		if (type == IPV6_ADDR_ANY) {
1965 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1966 				      IPSTATS_MIB_INADDRERRORS);
1967 			break;
1968 		}
1969 		/* FALLTHROUGH */
1970 	case IPSTATS_MIB_OUTNOROUTES:
1971 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1972 			      ipstats_mib_noroutes);
1973 		break;
1974 	}
1975 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1976 	kfree_skb(skb);
1977 	return 0;
1978 }
1979 
1980 static int ip6_pkt_discard(struct sk_buff *skb)
1981 {
1982 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1983 }
1984 
1985 static int ip6_pkt_discard_out(struct sk_buff *skb)
1986 {
1987 	skb->dev = skb_dst(skb)->dev;
1988 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1989 }
1990 
1991 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1992 
1993 static int ip6_pkt_prohibit(struct sk_buff *skb)
1994 {
1995 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1996 }
1997 
1998 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1999 {
2000 	skb->dev = skb_dst(skb)->dev;
2001 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2002 }
2003 
2004 #endif
2005 
2006 /*
2007  *	Allocate a dst for a local (unicast / anycast) address.
2008  */
2009 
2010 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2011 				    const struct in6_addr *addr,
2012 				    int anycast)
2013 {
2014 	struct net *net = dev_net(idev->dev);
2015 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2016 					    net->loopback_dev);
2017 	struct neighbour *neigh;
2018 
2019 	if (rt == NULL) {
2020 		if (net_ratelimit())
2021 			pr_warning("IPv6: Maximum number of routes reached,"
2022 				   " consider increasing route/max_size.\n");
2023 		return ERR_PTR(-ENOMEM);
2024 	}
2025 
2026 	in6_dev_hold(idev);
2027 
2028 	rt->dst.flags = DST_HOST;
2029 	rt->dst.input = ip6_input;
2030 	rt->dst.output = ip6_output;
2031 	rt->rt6i_idev = idev;
2032 	rt->dst.obsolete = -1;
2033 
2034 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2035 	if (anycast)
2036 		rt->rt6i_flags |= RTF_ANYCAST;
2037 	else
2038 		rt->rt6i_flags |= RTF_LOCAL;
2039 	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2040 	if (IS_ERR(neigh)) {
2041 		dst_free(&rt->dst);
2042 
2043 		return ERR_CAST(neigh);
2044 	}
2045 	rt->rt6i_nexthop = neigh;
2046 
2047 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2048 	rt->rt6i_dst.plen = 128;
2049 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2050 
2051 	atomic_set(&rt->dst.__refcnt, 1);
2052 
2053 	return rt;
2054 }
2055 
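/* Pick a source address for @daddr: prefer the route's configured
 * prefsrc if any, otherwise fall back to standard source address
 * selection on the route's interface (if known).
 */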
2056 int ip6_route_get_saddr(struct net *net,
2057 			struct rt6_info *rt,
2058 			const struct in6_addr *daddr,
2059 			unsigned int prefs,
2060 			struct in6_addr *saddr)
2061 {
2062 	struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2063 	int err = 0;
2064 	if (rt->rt6i_prefsrc.plen)
2065 		ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2066 	else
2067 		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2068 					 daddr, prefs, saddr);
2069 	return err;
2070 }
2071 
2072 /* remove deleted ip from prefsrc entries */
2073 /* Remove a deleted IP address from prefsrc entries. */
2074 	struct net_device *dev;
2075 	struct net *net;
2076 	struct in6_addr *addr;
2077 };
2078 
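/* fib6_clean_all() callback: clear the prefsrc of any route that still
 * points at the address being removed.
 */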
2079 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2080 {
2081 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2082 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2083 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2084 
2085 	if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2086 	    rt != net->ipv6.ip6_null_entry &&
2087 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2088 		/* remove prefsrc entry */
2089 		rt->rt6i_prefsrc.plen = 0;
2090 	}
2091 	return 0;
2092 }
2093 
2094 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2095 {
2096 	struct net *net = dev_net(ifp->idev->dev);
2097 	struct arg_dev_net_ip adni = {
2098 		.dev = ifp->idev->dev,
2099 		.net = net,
2100 		.addr = &ifp->addr,
2101 	};
2102 	fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2103 }
2104 
2105 struct arg_dev_net {
2106 	struct net_device *dev;
2107 	struct net *net;
2108 };
2109 
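/* fib6_clean_all()/icmp6_clean_all() callback: returning -1 deletes the
 * route.  Used to purge every route through @dev (or through any device
 * when dev == NULL), except the null entry.
 */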
2110 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2111 {
2112 	const struct arg_dev_net *adn = arg;
2113 	const struct net_device *dev = adn->dev;
2114 
2115 	if ((rt->rt6i_dev == dev || dev == NULL) &&
2116 	    rt != adn->net->ipv6.ip6_null_entry) {
2117 		RT6_TRACE("deleted by ifdown %p\n", rt);
2118 		return -1;
2119 	}
2120 	return 0;
2121 }
2122 
2123 void rt6_ifdown(struct net *net, struct net_device *dev)
2124 {
2125 	struct arg_dev_net adn = {
2126 		.dev = dev,
2127 		.net = net,
2128 	};
2129 
2130 	fib6_clean_all(net, fib6_ifdown, 0, &adn);
2131 	icmp6_clean_all(fib6_ifdown, &adn);
2132 }
2133 
2134 struct rt6_mtu_change_arg
2135 {
2136 	struct net_device *dev;
2137 	unsigned mtu;
2138 };
2139 
2140 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2141 {
2142 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2143 	struct inet6_dev *idev;
2144 
2145 	/* In IPv6 PMTU discovery is not optional,
2146 	   so the RTAX_MTU lock cannot disable it.
2147 	   We still use this lock to block changes
2148 	   caused by addrconf/ndisc.
2149 	*/
2150 
2151 	idev = __in6_dev_get(arg->dev);
2152 	if (idev == NULL)
2153 		return 0;
2154 
2155 	/* For an administrative MTU increase there is no way to discover
2156 	   an IPv6 PMTU increase, so the PMTU must be raised here.
2157 	   Since RFC 1981 doesn't cover administrative MTU increases,
2158 	   updating the PMTU on increase is a MUST (e.g. jumbo frames).
2159 	 */
2160 	/*
2161 	   If the new MTU is less than the route PMTU, the new MTU will be
2162 	   the lowest MTU on the path; update the route PMTU to reflect the
2163 	   decrease.  If the new MTU is greater than the route PMTU, and the
2164 	   old MTU was the lowest MTU on the path, update the route PMTU to
2165 	   reflect the increase.  In that case, if another node on the path
2166 	   still has a lower MTU, its Packet Too Big message will trigger
2167 	   PMTU discovery again.
2168 	 */
2169 	if (rt->rt6i_dev == arg->dev &&
2170 	    !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2171 	    (dst_mtu(&rt->dst) >= arg->mtu ||
2172 	     (dst_mtu(&rt->dst) < arg->mtu &&
2173 	      dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2174 		dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2175 	}
2176 	return 0;
2177 }
2178 
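/* Propagate a device MTU change to all routes using that device. */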
2179 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2180 {
2181 	struct rt6_mtu_change_arg arg = {
2182 		.dev = dev,
2183 		.mtu = mtu,
2184 	};
2185 
2186 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2187 }
2188 
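/* Netlink attribute policy for RTM_NEWROUTE/RTM_DELROUTE/RTM_GETROUTE. */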
2189 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2190 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2191 	[RTA_OIF]               = { .type = NLA_U32 },
2192 	[RTA_IIF]		= { .type = NLA_U32 },
2193 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2194 	[RTA_METRICS]           = { .type = NLA_NESTED },
2195 };
2196 
2197 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2198 			      struct fib6_config *cfg)
2199 {
2200 	struct rtmsg *rtm;
2201 	struct nlattr *tb[RTA_MAX+1];
2202 	int err;
2203 
2204 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2205 	if (err < 0)
2206 		goto errout;
2207 
2208 	err = -EINVAL;
2209 	rtm = nlmsg_data(nlh);
2210 	memset(cfg, 0, sizeof(*cfg));
2211 
2212 	cfg->fc_table = rtm->rtm_table;
2213 	cfg->fc_dst_len = rtm->rtm_dst_len;
2214 	cfg->fc_src_len = rtm->rtm_src_len;
2215 	cfg->fc_flags = RTF_UP;
2216 	cfg->fc_protocol = rtm->rtm_protocol;
2217 
2218 	if (rtm->rtm_type == RTN_UNREACHABLE)
2219 		cfg->fc_flags |= RTF_REJECT;
2220 
2221 	if (rtm->rtm_type == RTN_LOCAL)
2222 		cfg->fc_flags |= RTF_LOCAL;
2223 
2224 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2225 	cfg->fc_nlinfo.nlh = nlh;
2226 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2227 
2228 	if (tb[RTA_GATEWAY]) {
2229 		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2230 		cfg->fc_flags |= RTF_GATEWAY;
2231 	}
2232 
2233 	if (tb[RTA_DST]) {
2234 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2235 
2236 		if (nla_len(tb[RTA_DST]) < plen)
2237 			goto errout;
2238 
2239 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2240 	}
2241 
2242 	if (tb[RTA_SRC]) {
2243 		int plen = (rtm->rtm_src_len + 7) >> 3;
2244 
2245 		if (nla_len(tb[RTA_SRC]) < plen)
2246 			goto errout;
2247 
2248 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2249 	}
2250 
2251 	if (tb[RTA_PREFSRC])
2252 		nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2253 
2254 	if (tb[RTA_OIF])
2255 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2256 
2257 	if (tb[RTA_PRIORITY])
2258 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2259 
2260 	if (tb[RTA_METRICS]) {
2261 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2262 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2263 	}
2264 
2265 	if (tb[RTA_TABLE])
2266 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2267 
2268 	err = 0;
2269 errout:
2270 	return err;
2271 }
2272 
2273 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2274 {
2275 	struct fib6_config cfg;
2276 	int err;
2277 
2278 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2279 	if (err < 0)
2280 		return err;
2281 
2282 	return ip6_route_del(&cfg);
2283 }
2284 
2285 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2286 {
2287 	struct fib6_config cfg;
2288 	int err;
2289 
2290 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2291 	if (err < 0)
2292 		return err;
2293 
2294 	return ip6_route_add(&cfg);
2295 }
2296 
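/* Worst-case size of an RTM_NEWROUTE message, used to size the
 * notification skb in inet6_rt_notify().
 */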
2297 static inline size_t rt6_nlmsg_size(void)
2298 {
2299 	return NLMSG_ALIGN(sizeof(struct rtmsg))
2300 	       + nla_total_size(16) /* RTA_SRC */
2301 	       + nla_total_size(16) /* RTA_DST */
2302 	       + nla_total_size(16) /* RTA_GATEWAY */
2303 	       + nla_total_size(16) /* RTA_PREFSRC */
2304 	       + nla_total_size(4) /* RTA_TABLE */
2305 	       + nla_total_size(4) /* RTA_IIF */
2306 	       + nla_total_size(4) /* RTA_OIF */
2307 	       + nla_total_size(4) /* RTA_PRIORITY */
2308 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2309 	       + nla_total_size(sizeof(struct rta_cacheinfo));
2310 }
2311 
2312 static int rt6_fill_node(struct net *net,
2313 			 struct sk_buff *skb, struct rt6_info *rt,
2314 			 struct in6_addr *dst, struct in6_addr *src,
2315 			 int iif, int type, u32 pid, u32 seq,
2316 			 int prefix, int nowait, unsigned int flags)
2317 {
2318 	struct rtmsg *rtm;
2319 	struct nlmsghdr *nlh;
2320 	long expires;
2321 	u32 table;
2322 
2323 	if (prefix) {	/* user wants prefix routes only */
2324 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2325 			/* success since this is not a prefix route */
2326 			return 1;
2327 		}
2328 	}
2329 
2330 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2331 	if (nlh == NULL)
2332 		return -EMSGSIZE;
2333 
2334 	rtm = nlmsg_data(nlh);
2335 	rtm->rtm_family = AF_INET6;
2336 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2337 	rtm->rtm_src_len = rt->rt6i_src.plen;
2338 	rtm->rtm_tos = 0;
2339 	if (rt->rt6i_table)
2340 		table = rt->rt6i_table->tb6_id;
2341 	else
2342 		table = RT6_TABLE_UNSPEC;
2343 	rtm->rtm_table = table;
2344 	NLA_PUT_U32(skb, RTA_TABLE, table);
2345 	if (rt->rt6i_flags&RTF_REJECT)
2346 		rtm->rtm_type = RTN_UNREACHABLE;
2347 	else if (rt->rt6i_flags&RTF_LOCAL)
2348 		rtm->rtm_type = RTN_LOCAL;
2349 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2350 		rtm->rtm_type = RTN_LOCAL;
2351 	else
2352 		rtm->rtm_type = RTN_UNICAST;
2353 	rtm->rtm_flags = 0;
2354 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2355 	rtm->rtm_protocol = rt->rt6i_protocol;
2356 	if (rt->rt6i_flags&RTF_DYNAMIC)
2357 		rtm->rtm_protocol = RTPROT_REDIRECT;
2358 	else if (rt->rt6i_flags & RTF_ADDRCONF)
2359 		rtm->rtm_protocol = RTPROT_KERNEL;
2360 	else if (rt->rt6i_flags&RTF_DEFAULT)
2361 		rtm->rtm_protocol = RTPROT_RA;
2362 
2363 	if (rt->rt6i_flags&RTF_CACHE)
2364 		rtm->rtm_flags |= RTM_F_CLONED;
2365 
2366 	if (dst) {
2367 		NLA_PUT(skb, RTA_DST, 16, dst);
2368 		rtm->rtm_dst_len = 128;
2369 	} else if (rtm->rtm_dst_len)
2370 		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2371 #ifdef CONFIG_IPV6_SUBTREES
2372 	if (src) {
2373 		NLA_PUT(skb, RTA_SRC, 16, src);
2374 		rtm->rtm_src_len = 128;
2375 	} else if (rtm->rtm_src_len)
2376 		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2377 #endif
2378 	if (iif) {
2379 #ifdef CONFIG_IPV6_MROUTE
2380 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2381 			int err = ip6mr_get_route(net, skb, rtm, nowait);
2382 			if (err <= 0) {
2383 				if (!nowait) {
2384 					if (err == 0)
2385 						return 0;
2386 					goto nla_put_failure;
2387 				} else {
2388 					if (err == -EMSGSIZE)
2389 						goto nla_put_failure;
2390 				}
2391 			}
2392 		} else
2393 #endif
2394 			NLA_PUT_U32(skb, RTA_IIF, iif);
2395 	} else if (dst) {
2396 		struct in6_addr saddr_buf;
2397 		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2398 			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2399 	}
2400 
2401 	if (rt->rt6i_prefsrc.plen) {
2402 		struct in6_addr saddr_buf;
2403 		ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2404 		NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2405 	}
2406 
2407 	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2408 		goto nla_put_failure;
2409 
2410 	if (rt->dst.neighbour)
2411 		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2412 
2413 	if (rt->dst.dev)
2414 		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2415 
2416 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2417 
2418 	if (!(rt->rt6i_flags & RTF_EXPIRES))
2419 		expires = 0;
2420 	else if (rt->rt6i_expires - jiffies < INT_MAX)
2421 		expires = rt->rt6i_expires - jiffies;
2422 	else
2423 		expires = INT_MAX;
2424 
2425 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2426 			       expires, rt->dst.error) < 0)
2427 		goto nla_put_failure;
2428 
2429 	return nlmsg_end(skb, nlh);
2430 
2431 nla_put_failure:
2432 	nlmsg_cancel(skb, nlh);
2433 	return -EMSGSIZE;
2434 }
2435 
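/* Per-route callback for RTM_GETROUTE dumps: emit one RTM_NEWROUTE
 * message per FIB entry, honouring the RTM_F_PREFIX filter.
 */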
2436 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2437 {
2438 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2439 	int prefix;
2440 
2441 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2442 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2443 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2444 	} else
2445 		prefix = 0;
2446 
2447 	return rt6_fill_node(arg->net,
2448 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2449 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2450 		     prefix, 0, NLM_F_MULTI);
2451 }
2452 
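/* RTM_GETROUTE handler: resolve the requested flow through
 * ip6_route_output() and return the result to the requester as a
 * single RTM_NEWROUTE message.
 */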
2453 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2454 {
2455 	struct net *net = sock_net(in_skb->sk);
2456 	struct nlattr *tb[RTA_MAX+1];
2457 	struct rt6_info *rt;
2458 	struct sk_buff *skb;
2459 	struct rtmsg *rtm;
2460 	struct flowi6 fl6;
2461 	int err, iif = 0;
2462 
2463 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2464 	if (err < 0)
2465 		goto errout;
2466 
2467 	err = -EINVAL;
2468 	memset(&fl6, 0, sizeof(fl6));
2469 
2470 	if (tb[RTA_SRC]) {
2471 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2472 			goto errout;
2473 
2474 		ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2475 	}
2476 
2477 	if (tb[RTA_DST]) {
2478 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2479 			goto errout;
2480 
2481 		ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2482 	}
2483 
2484 	if (tb[RTA_IIF])
2485 		iif = nla_get_u32(tb[RTA_IIF]);
2486 
2487 	if (tb[RTA_OIF])
2488 		fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2489 
2490 	if (iif) {
2491 		struct net_device *dev;
2492 		dev = __dev_get_by_index(net, iif);
2493 		if (!dev) {
2494 			err = -ENODEV;
2495 			goto errout;
2496 		}
2497 	}
2498 
2499 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2500 	if (skb == NULL) {
2501 		err = -ENOBUFS;
2502 		goto errout;
2503 	}
2504 
2505 	/* Reserve room for dummy headers; this skb can pass
2506 	   through a good chunk of the routing engine.
2507 	 */
2508 	skb_reset_mac_header(skb);
2509 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2510 
2511 	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2512 	skb_dst_set(skb, &rt->dst);
2513 
2514 	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2515 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2516 			    nlh->nlmsg_seq, 0, 0, 0);
2517 	if (err < 0) {
2518 		kfree_skb(skb);
2519 		goto errout;
2520 	}
2521 
2522 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2523 errout:
2524 	return err;
2525 }
2526 
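/* Broadcast a route change to RTNLGRP_IPV6_ROUTE listeners. */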
2527 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2528 {
2529 	struct sk_buff *skb;
2530 	struct net *net = info->nl_net;
2531 	u32 seq;
2532 	int err;
2533 
2534 	err = -ENOBUFS;
2535 	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2536 
2537 	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2538 	if (skb == NULL)
2539 		goto errout;
2540 
2541 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2542 				event, info->pid, seq, 0, 0, 0);
2543 	if (err < 0) {
2544 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2545 		WARN_ON(err == -EMSGSIZE);
2546 		kfree_skb(skb);
2547 		goto errout;
2548 	}
2549 	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2550 		    info->nlh, gfp_any());
2551 	return;
2552 errout:
2553 	if (err < 0)
2554 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2555 }
2556 
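/* When the loopback device registers in a namespace, attach it to that
 * namespace's null (and, with policy routing, prohibit/blackhole)
 * template routes.
 */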
2557 static int ip6_route_dev_notify(struct notifier_block *this,
2558 				unsigned long event, void *data)
2559 {
2560 	struct net_device *dev = (struct net_device *)data;
2561 	struct net *net = dev_net(dev);
2562 
2563 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2564 		net->ipv6.ip6_null_entry->dst.dev = dev;
2565 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2566 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2567 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2568 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2569 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2570 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2571 #endif
2572 	}
2573 
2574 	return NOTIFY_OK;
2575 }
2576 
2577 /*
2578  *	/proc
2579  */
2580 
2581 #ifdef CONFIG_PROC_FS
2582 
2583 struct rt6_proc_arg
2584 {
2585 	char *buffer;
2586 	int offset;
2587 	int length;
2588 	int skip;
2589 	int len;
2590 };
2591 
2592 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2593 {
2594 	struct seq_file *m = p_arg;
2595 
2596 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2597 
2598 #ifdef CONFIG_IPV6_SUBTREES
2599 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2600 #else
2601 	seq_puts(m, "00000000000000000000000000000000 00 ");
2602 #endif
2603 
2604 	if (rt->rt6i_nexthop) {
2605 		seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2606 	} else {
2607 		seq_puts(m, "00000000000000000000000000000000");
2608 	}
2609 	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2610 		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2611 		   rt->dst.__use, rt->rt6i_flags,
2612 		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2613 	return 0;
2614 }
2615 
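/* Backs /proc/net/ipv6_route: one line per FIB entry, printed by
 * rt6_info_route() above.
 */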
2616 static int ipv6_route_show(struct seq_file *m, void *v)
2617 {
2618 	struct net *net = (struct net *)m->private;
2619 	fib6_clean_all(net, rt6_info_route, 0, m);
2620 	return 0;
2621 }
2622 
2623 static int ipv6_route_open(struct inode *inode, struct file *file)
2624 {
2625 	return single_open_net(inode, file, ipv6_route_show);
2626 }
2627 
2628 static const struct file_operations ipv6_route_proc_fops = {
2629 	.owner		= THIS_MODULE,
2630 	.open		= ipv6_route_open,
2631 	.read		= seq_read,
2632 	.llseek		= seq_lseek,
2633 	.release	= single_release_net,
2634 };
2635 
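/* Backs /proc/net/rt6_stats: FIB node and route entry counters. */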
2636 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2637 {
2638 	struct net *net = (struct net *)seq->private;
2639 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2640 		   net->ipv6.rt6_stats->fib_nodes,
2641 		   net->ipv6.rt6_stats->fib_route_nodes,
2642 		   net->ipv6.rt6_stats->fib_rt_alloc,
2643 		   net->ipv6.rt6_stats->fib_rt_entries,
2644 		   net->ipv6.rt6_stats->fib_rt_cache,
2645 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2646 		   net->ipv6.rt6_stats->fib_discarded_routes);
2647 
2648 	return 0;
2649 }
2650 
2651 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2652 {
2653 	return single_open_net(inode, file, rt6_stats_seq_show);
2654 }
2655 
2656 static const struct file_operations rt6_stats_seq_fops = {
2657 	.owner	 = THIS_MODULE,
2658 	.open	 = rt6_stats_seq_open,
2659 	.read	 = seq_read,
2660 	.llseek	 = seq_lseek,
2661 	.release = single_release_net,
2662 };
2663 #endif	/* CONFIG_PROC_FS */
2664 
2665 #ifdef CONFIG_SYSCTL
2666 
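/* Handler for the write-only "flush" sysctl: writing a delay forces an
 * immediate garbage-collection pass over the routing cache, e.g.
 * (assuming the usual net.ipv6.route sysctl path):
 *
 *	echo 1 > /proc/sys/net/ipv6/route/flush
 */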
2667 static
2668 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2669 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2670 {
2671 	struct net *net;
2672 	int delay;
2673 	if (!write)
2674 		return -EINVAL;
2675 
2676 	net = (struct net *)ctl->extra1;
2677 	delay = net->ipv6.sysctl.flush_delay;
2678 	proc_dointvec(ctl, write, buffer, lenp, ppos);
2679 	fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2680 	return 0;
2681 }
2682 
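/* Template for the per-namespace route sysctls; ipv6_route_sysctl_init()
 * below duplicates it and repoints each .data at the namespace's own
 * fields.
 */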
2683 ctl_table ipv6_route_table_template[] = {
2684 	{
2685 		.procname	=	"flush",
2686 		.data		=	&init_net.ipv6.sysctl.flush_delay,
2687 		.maxlen		=	sizeof(int),
2688 		.mode		=	0200,
2689 		.proc_handler	=	ipv6_sysctl_rtcache_flush
2690 	},
2691 	{
2692 		.procname	=	"gc_thresh",
2693 		.data		=	&ip6_dst_ops_template.gc_thresh,
2694 		.maxlen		=	sizeof(int),
2695 		.mode		=	0644,
2696 		.proc_handler	=	proc_dointvec,
2697 	},
2698 	{
2699 		.procname	=	"max_size",
2700 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
2701 		.maxlen		=	sizeof(int),
2702 		.mode		=	0644,
2703 		.proc_handler	=	proc_dointvec,
2704 	},
2705 	{
2706 		.procname	=	"gc_min_interval",
2707 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2708 		.maxlen		=	sizeof(int),
2709 		.mode		=	0644,
2710 		.proc_handler	=	proc_dointvec_jiffies,
2711 	},
2712 	{
2713 		.procname	=	"gc_timeout",
2714 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2715 		.maxlen		=	sizeof(int),
2716 		.mode		=	0644,
2717 		.proc_handler	=	proc_dointvec_jiffies,
2718 	},
2719 	{
2720 		.procname	=	"gc_interval",
2721 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
2722 		.maxlen		=	sizeof(int),
2723 		.mode		=	0644,
2724 		.proc_handler	=	proc_dointvec_jiffies,
2725 	},
2726 	{
2727 		.procname	=	"gc_elasticity",
2728 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2729 		.maxlen		=	sizeof(int),
2730 		.mode		=	0644,
2731 		.proc_handler	=	proc_dointvec,
2732 	},
2733 	{
2734 		.procname	=	"mtu_expires",
2735 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2736 		.maxlen		=	sizeof(int),
2737 		.mode		=	0644,
2738 		.proc_handler	=	proc_dointvec_jiffies,
2739 	},
2740 	{
2741 		.procname	=	"min_adv_mss",
2742 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
2743 		.maxlen		=	sizeof(int),
2744 		.mode		=	0644,
2745 		.proc_handler	=	proc_dointvec,
2746 	},
2747 	{
2748 		.procname	=	"gc_min_interval_ms",
2749 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2750 		.maxlen		=	sizeof(int),
2751 		.mode		=	0644,
2752 		.proc_handler	=	proc_dointvec_ms_jiffies,
2753 	},
2754 	{ }
2755 };
2756 
2757 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2758 {
2759 	struct ctl_table *table;
2760 
2761 	table = kmemdup(ipv6_route_table_template,
2762 			sizeof(ipv6_route_table_template),
2763 			GFP_KERNEL);
2764 
2765 	if (table) {
2766 		table[0].data = &net->ipv6.sysctl.flush_delay;
2767 		table[0].extra1 = net;
2768 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2769 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2770 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2771 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2772 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2773 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2774 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2775 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2776 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2777 	}
2778 
2779 	return table;
2780 }
2781 #endif
2782 
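/* Per-network-namespace init: clone the dst_ops and the special
 * null/prohibit/blackhole routes from their templates and set the
 * default sysctl values.
 */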
2783 static int __net_init ip6_route_net_init(struct net *net)
2784 {
2785 	int ret = -ENOMEM;
2786 
2787 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2788 	       sizeof(net->ipv6.ip6_dst_ops));
2789 
2790 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2791 		goto out_ip6_dst_ops;
2792 
2793 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2794 					   sizeof(*net->ipv6.ip6_null_entry),
2795 					   GFP_KERNEL);
2796 	if (!net->ipv6.ip6_null_entry)
2797 		goto out_ip6_dst_entries;
2798 	net->ipv6.ip6_null_entry->dst.path =
2799 		(struct dst_entry *)net->ipv6.ip6_null_entry;
2800 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2801 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2802 			 ip6_template_metrics, true);
2803 
2804 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2805 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2806 					       sizeof(*net->ipv6.ip6_prohibit_entry),
2807 					       GFP_KERNEL);
2808 	if (!net->ipv6.ip6_prohibit_entry)
2809 		goto out_ip6_null_entry;
2810 	net->ipv6.ip6_prohibit_entry->dst.path =
2811 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2812 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2813 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2814 			 ip6_template_metrics, true);
2815 
2816 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2817 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
2818 					       GFP_KERNEL);
2819 	if (!net->ipv6.ip6_blk_hole_entry)
2820 		goto out_ip6_prohibit_entry;
2821 	net->ipv6.ip6_blk_hole_entry->dst.path =
2822 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2823 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2824 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2825 			 ip6_template_metrics, true);
2826 #endif
2827 
2828 	net->ipv6.sysctl.flush_delay = 0;
2829 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
2830 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2831 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2832 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2833 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2834 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2835 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2836 
2837 #ifdef CONFIG_PROC_FS
2838 	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2839 	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2840 #endif
2841 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
2842 
2843 	ret = 0;
2844 out:
2845 	return ret;
2846 
2847 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2848 out_ip6_prohibit_entry:
2849 	kfree(net->ipv6.ip6_prohibit_entry);
2850 out_ip6_null_entry:
2851 	kfree(net->ipv6.ip6_null_entry);
2852 #endif
2853 out_ip6_dst_entries:
2854 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2855 out_ip6_dst_ops:
2856 	goto out;
2857 }
2858 
2859 static void __net_exit ip6_route_net_exit(struct net *net)
2860 {
2861 #ifdef CONFIG_PROC_FS
2862 	proc_net_remove(net, "ipv6_route");
2863 	proc_net_remove(net, "rt6_stats");
2864 #endif
2865 	kfree(net->ipv6.ip6_null_entry);
2866 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2867 	kfree(net->ipv6.ip6_prohibit_entry);
2868 	kfree(net->ipv6.ip6_blk_hole_entry);
2869 #endif
2870 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2871 }
2872 
2873 static struct pernet_operations ip6_route_net_ops = {
2874 	.init = ip6_route_net_init,
2875 	.exit = ip6_route_net_exit,
2876 };
2877 
2878 static struct notifier_block ip6_route_dev_notifier = {
2879 	.notifier_call = ip6_route_dev_notify,
2880 	.priority = 0,
2881 };
2882 
2883 int __init ip6_route_init(void)
2884 {
2885 	int ret;
2886 
2887 	ret = -ENOMEM;
2888 	ip6_dst_ops_template.kmem_cachep =
2889 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2890 				  SLAB_HWCACHE_ALIGN, NULL);
2891 	if (!ip6_dst_ops_template.kmem_cachep)
2892 		goto out;
2893 
2894 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
2895 	if (ret)
2896 		goto out_kmem_cache;
2897 
2898 	ret = register_pernet_subsys(&ip6_route_net_ops);
2899 	if (ret)
2900 		goto out_dst_entries;
2901 
2902 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2903 
2904 	/* The loopback device is registered before this code runs, so the
2905 	 * loopback reference in rt6_info is not taken there; take it
2906 	 * manually for init_net. */
2907 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2908 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2909 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2910 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2911 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2912 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2913 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2914 #endif
2915 	ret = fib6_init();
2916 	if (ret)
2917 		goto out_register_subsys;
2918 
2919 	ret = xfrm6_init();
2920 	if (ret)
2921 		goto out_fib6_init;
2922 
2923 	ret = fib6_rules_init();
2924 	if (ret)
2925 		goto xfrm6_init;
2926 
2927 	ret = -ENOBUFS;
2928 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2929 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2930 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2931 		goto fib6_rules_init;
2932 
2933 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2934 	if (ret)
2935 		goto fib6_rules_init;
2936 
2937 out:
2938 	return ret;
2939 
2940 fib6_rules_init:
2941 	fib6_rules_cleanup();
2942 xfrm6_init:
2943 	xfrm6_fini();
2944 out_fib6_init:
2945 	fib6_gc_cleanup();
2946 out_register_subsys:
2947 	unregister_pernet_subsys(&ip6_route_net_ops);
2948 out_dst_entries:
2949 	dst_entries_destroy(&ip6_dst_blackhole_ops);
2950 out_kmem_cache:
2951 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2952 	goto out;
2953 }
2954 
2955 void ip6_route_cleanup(void)
2956 {
2957 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
2958 	fib6_rules_cleanup();
2959 	xfrm6_fini();
2960 	fib6_gc_cleanup();
2961 	unregister_pernet_subsys(&ip6_route_net_ops);
2962 	dst_entries_destroy(&ip6_dst_blackhole_ops);
2963 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2964 }
2965