xref: /linux/net/ipv6/route.c (revision de2fe5e07d58424bc286fff3fd3c1b0bf933cd58)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15 
16 /*	Changes:
17  *
18  *	YOSHIFUJI Hideaki @USAGI
19  *		reworked default router selection.
20  *		- respect outgoing interface
21  *		- select from (probably) reachable routers (i.e.
22  *		routers in REACHABLE, STALE, DELAY or PROBE states).
23  *		- always select the same router if it is (probably)
24  *		reachable.  otherwise, round-robin the list.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/config.h>
29 #include <linux/errno.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/init.h>
39 #include <linux/netlink.h>
40 #include <linux/if_arp.h>
41 
42 #ifdef 	CONFIG_PROC_FS
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #endif
46 
47 #include <net/snmp.h>
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
53 #include <net/tcp.h>
54 #include <linux/rtnetlink.h>
55 #include <net/dst.h>
56 #include <net/xfrm.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
/*
 * Debug verbosity for this file.  At level 3 or above RDBG() and
 * RT6_TRACE() expand to printk() calls; below that they compile away.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { } while (0)
#endif

/*
 * When non-zero, the lookup paths clone routes that already have a
 * nexthop (via rt6_alloc_clone()) instead of using them directly.
 */
#define CLONE_OFFLINK_ROUTE 0

/* Bits of the "strict" argument taken by rt6_select()/rt6_score_route(). */
#define RT6_SELECT_F_IFACE	0x1	/* candidate must match the interface */
#define RT6_SELECT_F_REACHABLE	0x2	/* candidate's neighbour must be valid */
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
85 static int ip6_rt_mtu_expires = 10*60*HZ;
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87 
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void		ip6_dst_destroy(struct dst_entry *);
92 static void		ip6_dst_ifdown(struct dst_entry *,
93 				       struct net_device *dev, int how);
94 static int		 ip6_dst_gc(void);
95 
96 static int		ip6_pkt_discard(struct sk_buff *skb);
97 static int		ip6_pkt_discard_out(struct sk_buff *skb);
98 static void		ip6_link_failure(struct sk_buff *skb);
99 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100 
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103 					   struct in6_addr *gwaddr, int ifindex,
104 					   unsigned pref);
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106 					   struct in6_addr *gwaddr, int ifindex);
107 #endif
108 
109 static struct dst_ops ip6_dst_ops = {
110 	.family			=	AF_INET6,
111 	.protocol		=	__constant_htons(ETH_P_IPV6),
112 	.gc			=	ip6_dst_gc,
113 	.gc_thresh		=	1024,
114 	.check			=	ip6_dst_check,
115 	.destroy		=	ip6_dst_destroy,
116 	.ifdown			=	ip6_dst_ifdown,
117 	.negative_advice	=	ip6_negative_advice,
118 	.link_failure		=	ip6_link_failure,
119 	.update_pmtu		=	ip6_rt_update_pmtu,
120 	.entry_size		=	sizeof(struct rt6_info),
121 };
122 
123 struct rt6_info ip6_null_entry = {
124 	.u = {
125 		.dst = {
126 			.__refcnt	= ATOMIC_INIT(1),
127 			.__use		= 1,
128 			.dev		= &loopback_dev,
129 			.obsolete	= -1,
130 			.error		= -ENETUNREACH,
131 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
132 			.input		= ip6_pkt_discard,
133 			.output		= ip6_pkt_discard_out,
134 			.ops		= &ip6_dst_ops,
135 			.path		= (struct dst_entry*)&ip6_null_entry,
136 		}
137 	},
138 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
139 	.rt6i_metric	= ~(u32) 0,
140 	.rt6i_ref	= ATOMIC_INIT(1),
141 };
142 
143 struct fib6_node ip6_routing_table = {
144 	.leaf		= &ip6_null_entry,
145 	.fn_flags	= RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
146 };
147 
/* Reader/writer lock guarding the whole IPv6 FIB. */

DEFINE_RWLOCK(rt6_lock);
151 
152 
153 /* allocate dst with ip6_dst_ops */
154 static __inline__ struct rt6_info *ip6_dst_alloc(void)
155 {
156 	return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
157 }
158 
159 static void ip6_dst_destroy(struct dst_entry *dst)
160 {
161 	struct rt6_info *rt = (struct rt6_info *)dst;
162 	struct inet6_dev *idev = rt->rt6i_idev;
163 
164 	if (idev != NULL) {
165 		rt->rt6i_idev = NULL;
166 		in6_dev_put(idev);
167 	}
168 }
169 
170 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
171 			   int how)
172 {
173 	struct rt6_info *rt = (struct rt6_info *)dst;
174 	struct inet6_dev *idev = rt->rt6i_idev;
175 
176 	if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
177 		struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
178 		if (loopback_idev != NULL) {
179 			rt->rt6i_idev = loopback_idev;
180 			in6_dev_put(idev);
181 		}
182 	}
183 }
184 
185 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
186 {
187 	return (rt->rt6i_flags & RTF_EXPIRES &&
188 		time_after(jiffies, rt->rt6i_expires));
189 }
190 
191 /*
192  *	Route lookup. Any rt6_lock is implied.
193  */
194 
195 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
196 						    int oif,
197 						    int strict)
198 {
199 	struct rt6_info *local = NULL;
200 	struct rt6_info *sprt;
201 
202 	if (oif) {
203 		for (sprt = rt; sprt; sprt = sprt->u.next) {
204 			struct net_device *dev = sprt->rt6i_dev;
205 			if (dev->ifindex == oif)
206 				return sprt;
207 			if (dev->flags & IFF_LOOPBACK) {
208 				if (sprt->rt6i_idev == NULL ||
209 				    sprt->rt6i_idev->dev->ifindex != oif) {
210 					if (strict && oif)
211 						continue;
212 					if (local && (!oif ||
213 						      local->rt6i_idev->dev->ifindex == oif))
214 						continue;
215 				}
216 				local = sprt;
217 			}
218 		}
219 
220 		if (local)
221 			return local;
222 
223 		if (strict)
224 			return &ip6_null_entry;
225 	}
226 	return rt;
227 }
228 
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the nexthop neighbour of @rt is not in
 * a NUD_VALID state and the per-interface probe interval has elapsed
 * since neigh->updated, send a neighbour solicitation to the nexthop's
 * solicited-node multicast address.  @rt may be NULL.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one
 * per minute.  The rate-limit timestamp (neigh->updated) is modified
 * here, so the neighbour lock is taken for writing; with only a read
 * lock two concurrent callers could both pass the interval check and
 * both send a probe.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;

	/* Cheap unlocked pre-check; repeated under the lock below. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	write_lock_bh(&neigh->lock);
	if ((neigh->nud_state & NUD_VALID) ||
	    !time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		write_unlock_bh(&neigh->lock);
		return;
	}
	neigh->updated = jiffies;
	write_unlock_bh(&neigh->lock);

	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router probing is compiled out: nothing to do. */
static inline void rt6_probe(struct rt6_info *rt)
{
	return;
}
#endif
264 
265 /*
266  * Default Router Selection (RFC 2461 6.3.6)
267  */
268 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
269 {
270 	struct net_device *dev = rt->rt6i_dev;
271 	if (!oif || dev->ifindex == oif)
272 		return 2;
273 	if ((dev->flags & IFF_LOOPBACK) &&
274 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
275 		return 1;
276 	return 0;
277 }
278 
279 static int inline rt6_check_neigh(struct rt6_info *rt)
280 {
281 	struct neighbour *neigh = rt->rt6i_nexthop;
282 	int m = 0;
283 	if (neigh) {
284 		read_lock_bh(&neigh->lock);
285 		if (neigh->nud_state & NUD_VALID)
286 			m = 1;
287 		read_unlock_bh(&neigh->lock);
288 	}
289 	return m;
290 }
291 
292 static int rt6_score_route(struct rt6_info *rt, int oif,
293 			   int strict)
294 {
295 	int m = rt6_check_dev(rt, oif);
296 	if (!m && (strict & RT6_SELECT_F_IFACE))
297 		return -1;
298 #ifdef CONFIG_IPV6_ROUTER_PREF
299 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
300 #endif
301 	if (rt6_check_neigh(rt))
302 		m |= 16;
303 	else if (strict & RT6_SELECT_F_REACHABLE)
304 		return -1;
305 	return m;
306 }
307 
308 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
309 				   int strict)
310 {
311 	struct rt6_info *match = NULL, *last = NULL;
312 	struct rt6_info *rt, *rt0 = *head;
313 	u32 metric;
314 	int mpri = -1;
315 
316 	RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
317 		  __FUNCTION__, head, head ? *head : NULL, oif);
318 
319 	for (rt = rt0, metric = rt0->rt6i_metric;
320 	     rt && rt->rt6i_metric == metric;
321 	     rt = rt->u.next) {
322 		int m;
323 
324 		if (rt6_check_expired(rt))
325 			continue;
326 
327 		last = rt;
328 
329 		m = rt6_score_route(rt, oif, strict);
330 		if (m < 0)
331 			continue;
332 
333 		if (m > mpri) {
334 			rt6_probe(match);
335 			match = rt;
336 			mpri = m;
337 		} else {
338 			rt6_probe(rt);
339 		}
340 	}
341 
342 	if (!match &&
343 	    (strict & RT6_SELECT_F_REACHABLE) &&
344 	    last && last != rt0) {
345 		/* no entries matched; do round-robin */
346 		*head = rt0->u.next;
347 		rt0->u.next = last->u.next;
348 		last->u.next = rt0;
349 	}
350 
351 	RT6_TRACE("%s() => %p, score=%d\n",
352 		  __FUNCTION__, match, mpri);
353 
354 	return (match ? match : &ip6_null_entry);
355 }
356 
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received from router @gwaddr on
 * @dev.
 *
 * @opt/@len: the raw option and its length in bytes.
 *
 * The matching route-info route is looked up; a zero lifetime deletes
 * it, a non-zero lifetime creates it if missing or refreshes its
 * preference, and the expiry is (re)armed unless the lifetime is
 * infinite (0xffffffff).
 *
 * Returns 0 on success, -EINVAL if the option is malformed or carries
 * the reserved preference value (RFC 4191 2.3: such an option MUST be
 * ignored).
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	/* Compare as signed so that a negative length is rejected too. */
	if (len < (int)sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2)
			return -EINVAL;
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1)
			return -EINVAL;
	}

	/* Reserved preference value: ignore the whole option. */
	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	/* The lifetime arrives in network byte order. */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt, NULL, NULL, NULL);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		/* Drop the reference held on rt since the get/add above. */
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
434 
435 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
436 			    int oif, int strict)
437 {
438 	struct fib6_node *fn;
439 	struct rt6_info *rt;
440 
441 	read_lock_bh(&rt6_lock);
442 	fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
443 	rt = rt6_device_match(fn->leaf, oif, strict);
444 	dst_hold(&rt->u.dst);
445 	rt->u.dst.__use++;
446 	read_unlock_bh(&rt6_lock);
447 
448 	rt->u.dst.lastuse = jiffies;
449 	if (rt->u.dst.error == 0)
450 		return rt;
451 	dst_release(&rt->u.dst);
452 	return NULL;
453 }
454 
455 /* ip6_ins_rt is called with FREE rt6_lock.
456    It takes new route entry, the addition fails by any reason the
457    route is freed. In any case, if caller does not hold it, it may
458    be destroyed.
459  */
460 
461 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
462 		void *_rtattr, struct netlink_skb_parms *req)
463 {
464 	int err;
465 
466 	write_lock_bh(&rt6_lock);
467 	err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
468 	write_unlock_bh(&rt6_lock);
469 
470 	return err;
471 }
472 
473 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
474 				      struct in6_addr *saddr)
475 {
476 	struct rt6_info *rt;
477 
478 	/*
479 	 *	Clone the route.
480 	 */
481 
482 	rt = ip6_rt_copy(ort);
483 
484 	if (rt) {
485 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
486 			if (rt->rt6i_dst.plen != 128 &&
487 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
488 				rt->rt6i_flags |= RTF_ANYCAST;
489 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
490 		}
491 
492 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
493 		rt->rt6i_dst.plen = 128;
494 		rt->rt6i_flags |= RTF_CACHE;
495 		rt->u.dst.flags |= DST_HOST;
496 
497 #ifdef CONFIG_IPV6_SUBTREES
498 		if (rt->rt6i_src.plen && saddr) {
499 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
500 			rt->rt6i_src.plen = 128;
501 		}
502 #endif
503 
504 		rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
505 
506 	}
507 
508 	return rt;
509 }
510 
511 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
512 {
513 	struct rt6_info *rt = ip6_rt_copy(ort);
514 	if (rt) {
515 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
516 		rt->rt6i_dst.plen = 128;
517 		rt->rt6i_flags |= RTF_CACHE;
518 		if (rt->rt6i_flags & RTF_REJECT)
519 			rt->u.dst.error = ort->u.dst.error;
520 		rt->u.dst.flags |= DST_HOST;
521 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
522 	}
523 	return rt;
524 }
525 
/*
 * Used by the lookup functions after rt6_select(): when the selection
 * produced ip6_null_entry, climb from the current node "fn" towards the
 * root.  Reaching a root node jumps to the caller's "out" label;
 * reaching a node that carries route info jumps to "restart" to select
 * again there.
 *
 * Relies on the caller's locals "rt" and "fn" and labels "out" and
 * "restart".  Wrapped in do { } while (0) so it behaves as a single
 * statement (safe next to if/else); invoke it as "BACKTRACK();".
 */
#define BACKTRACK() \
do { \
	if (rt == &ip6_null_entry) { \
		while ((fn = fn->parent) != NULL) { \
			if (fn->fn_flags & RTN_ROOT) \
				goto out; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
536 
537 
538 void ip6_route_input(struct sk_buff *skb)
539 {
540 	struct fib6_node *fn;
541 	struct rt6_info *rt, *nrt;
542 	int strict;
543 	int attempts = 3;
544 	int err;
545 	int reachable = RT6_SELECT_F_REACHABLE;
546 
547 	strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
548 
549 relookup:
550 	read_lock_bh(&rt6_lock);
551 
552 restart_2:
553 	fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
554 			 &skb->nh.ipv6h->saddr);
555 
556 restart:
557 	rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
558 	BACKTRACK();
559 	if (rt == &ip6_null_entry ||
560 	    rt->rt6i_flags & RTF_CACHE)
561 		goto out;
562 
563 	dst_hold(&rt->u.dst);
564 	read_unlock_bh(&rt6_lock);
565 
566 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
567 		nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
568 	else {
569 #if CLONE_OFFLINK_ROUTE
570 		nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
571 #else
572 		goto out2;
573 #endif
574 	}
575 
576 	dst_release(&rt->u.dst);
577 	rt = nrt ? : &ip6_null_entry;
578 
579 	dst_hold(&rt->u.dst);
580 	if (nrt) {
581 		err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
582 		if (!err)
583 			goto out2;
584 	}
585 
586 	if (--attempts <= 0)
587 		goto out2;
588 
589 	/*
590 	 * Race condition! In the gap, when rt6_lock was
591 	 * released someone could insert this route.  Relookup.
592 	 */
593 	dst_release(&rt->u.dst);
594 	goto relookup;
595 
596 out:
597 	if (reachable) {
598 		reachable = 0;
599 		goto restart_2;
600 	}
601 	dst_hold(&rt->u.dst);
602 	read_unlock_bh(&rt6_lock);
603 out2:
604 	rt->u.dst.lastuse = jiffies;
605 	rt->u.dst.__use++;
606 	skb->dst = (struct dst_entry *) rt;
607 	return;
608 }
609 
610 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
611 {
612 	struct fib6_node *fn;
613 	struct rt6_info *rt, *nrt;
614 	int strict;
615 	int attempts = 3;
616 	int err;
617 	int reachable = RT6_SELECT_F_REACHABLE;
618 
619 	strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
620 
621 relookup:
622 	read_lock_bh(&rt6_lock);
623 
624 restart_2:
625 	fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
626 
627 restart:
628 	rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
629 	BACKTRACK();
630 	if (rt == &ip6_null_entry ||
631 	    rt->rt6i_flags & RTF_CACHE)
632 		goto out;
633 
634 	dst_hold(&rt->u.dst);
635 	read_unlock_bh(&rt6_lock);
636 
637 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
638 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
639 	else {
640 #if CLONE_OFFLINK_ROUTE
641 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
642 #else
643 		goto out2;
644 #endif
645 	}
646 
647 	dst_release(&rt->u.dst);
648 	rt = nrt ? : &ip6_null_entry;
649 
650 	dst_hold(&rt->u.dst);
651 	if (nrt) {
652 		err = ip6_ins_rt(nrt, NULL, NULL, NULL);
653 		if (!err)
654 			goto out2;
655 	}
656 
657 	if (--attempts <= 0)
658 		goto out2;
659 
660 	/*
661 	 * Race condition! In the gap, when rt6_lock was
662 	 * released someone could insert this route.  Relookup.
663 	 */
664 	dst_release(&rt->u.dst);
665 	goto relookup;
666 
667 out:
668 	if (reachable) {
669 		reachable = 0;
670 		goto restart_2;
671 	}
672 	dst_hold(&rt->u.dst);
673 	read_unlock_bh(&rt6_lock);
674 out2:
675 	rt->u.dst.lastuse = jiffies;
676 	rt->u.dst.__use++;
677 	return &rt->u.dst;
678 }
679 
680 
681 /*
682  *	Destination cache support functions
683  */
684 
685 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
686 {
687 	struct rt6_info *rt;
688 
689 	rt = (struct rt6_info *) dst;
690 
691 	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
692 		return dst;
693 
694 	return NULL;
695 }
696 
697 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
698 {
699 	struct rt6_info *rt = (struct rt6_info *) dst;
700 
701 	if (rt) {
702 		if (rt->rt6i_flags & RTF_CACHE)
703 			ip6_del_rt(rt, NULL, NULL, NULL);
704 		else
705 			dst_release(dst);
706 	}
707 	return NULL;
708 }
709 
710 static void ip6_link_failure(struct sk_buff *skb)
711 {
712 	struct rt6_info *rt;
713 
714 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
715 
716 	rt = (struct rt6_info *) skb->dst;
717 	if (rt) {
718 		if (rt->rt6i_flags&RTF_CACHE) {
719 			dst_set_expires(&rt->u.dst, 0);
720 			rt->rt6i_flags |= RTF_EXPIRES;
721 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
722 			rt->rt6i_node->fn_sernum = -1;
723 	}
724 }
725 
726 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
727 {
728 	struct rt6_info *rt6 = (struct rt6_info*)dst;
729 
730 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
731 		rt6->rt6i_flags |= RTF_MODIFIED;
732 		if (mtu < IPV6_MIN_MTU) {
733 			mtu = IPV6_MIN_MTU;
734 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
735 		}
736 		dst->metrics[RTAX_MTU-1] = mtu;
737 	}
738 }
739 
/*
 * List of dst entries created by ndisc_dst_alloc(), linked through
 * dst->next and reaped by ndisc_dst_gc().  Protected by rt6_lock.
 */
static struct dst_entry *ndisc_dst_gc_list;

/* Defined further down; needed by ndisc_dst_alloc(). */
static int ipv6_get_mtu(struct net_device *dev);
743 
744 static inline unsigned int ipv6_advmss(unsigned int mtu)
745 {
746 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
747 
748 	if (mtu < ip6_rt_min_advmss)
749 		mtu = ip6_rt_min_advmss;
750 
751 	/*
752 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
753 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
754 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
755 	 * rely only on pmtu discovery"
756 	 */
757 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
758 		mtu = IPV6_MAXPLEN;
759 	return mtu;
760 }
761 
762 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
763 				  struct neighbour *neigh,
764 				  struct in6_addr *addr,
765 				  int (*output)(struct sk_buff *))
766 {
767 	struct rt6_info *rt;
768 	struct inet6_dev *idev = in6_dev_get(dev);
769 
770 	if (unlikely(idev == NULL))
771 		return NULL;
772 
773 	rt = ip6_dst_alloc();
774 	if (unlikely(rt == NULL)) {
775 		in6_dev_put(idev);
776 		goto out;
777 	}
778 
779 	dev_hold(dev);
780 	if (neigh)
781 		neigh_hold(neigh);
782 	else
783 		neigh = ndisc_get_neigh(dev, addr);
784 
785 	rt->rt6i_dev	  = dev;
786 	rt->rt6i_idev     = idev;
787 	rt->rt6i_nexthop  = neigh;
788 	atomic_set(&rt->u.dst.__refcnt, 1);
789 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
790 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
791 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
792 	rt->u.dst.output  = output;
793 
794 #if 0	/* there's no chance to use these for ndisc */
795 	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
796 				? DST_HOST
797 				: 0;
798 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
799 	rt->rt6i_dst.plen = 128;
800 #endif
801 
802 	write_lock_bh(&rt6_lock);
803 	rt->u.dst.next = ndisc_dst_gc_list;
804 	ndisc_dst_gc_list = &rt->u.dst;
805 	write_unlock_bh(&rt6_lock);
806 
807 	fib6_force_start_gc();
808 
809 out:
810 	return (struct dst_entry *)rt;
811 }
812 
813 int ndisc_dst_gc(int *more)
814 {
815 	struct dst_entry *dst, *next, **pprev;
816 	int freed;
817 
818 	next = NULL;
819 	pprev = &ndisc_dst_gc_list;
820 	freed = 0;
821 	while ((dst = *pprev) != NULL) {
822 		if (!atomic_read(&dst->__refcnt)) {
823 			*pprev = dst->next;
824 			dst_free(dst);
825 			freed++;
826 		} else {
827 			pprev = &dst->next;
828 			(*more)++;
829 		}
830 	}
831 
832 	return freed;
833 }
834 
835 static int ip6_dst_gc(void)
836 {
837 	static unsigned expire = 30*HZ;
838 	static unsigned long last_gc;
839 	unsigned long now = jiffies;
840 
841 	if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
842 	    atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
843 		goto out;
844 
845 	expire++;
846 	fib6_run_gc(expire);
847 	last_gc = now;
848 	if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
849 		expire = ip6_rt_gc_timeout>>1;
850 
851 out:
852 	expire -= expire>>ip6_rt_gc_elasticity;
853 	return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
854 }
855 
856 /* Clean host part of a prefix. Not necessary in radix tree,
857    but results in cleaner routing tables.
858 
859    Remove it only when all the things will work!
860  */
861 
862 static int ipv6_get_mtu(struct net_device *dev)
863 {
864 	int mtu = IPV6_MIN_MTU;
865 	struct inet6_dev *idev;
866 
867 	idev = in6_dev_get(dev);
868 	if (idev) {
869 		mtu = idev->cnf.mtu6;
870 		in6_dev_put(idev);
871 	}
872 	return mtu;
873 }
874 
875 int ipv6_get_hoplimit(struct net_device *dev)
876 {
877 	int hoplimit = ipv6_devconf.hop_limit;
878 	struct inet6_dev *idev;
879 
880 	idev = in6_dev_get(dev);
881 	if (idev) {
882 		hoplimit = idev->cnf.hop_limit;
883 		in6_dev_put(idev);
884 	}
885 	return hoplimit;
886 }
887 
888 /*
889  *
890  */
891 
892 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
893 		void *_rtattr, struct netlink_skb_parms *req)
894 {
895 	int err;
896 	struct rtmsg *r;
897 	struct rtattr **rta;
898 	struct rt6_info *rt = NULL;
899 	struct net_device *dev = NULL;
900 	struct inet6_dev *idev = NULL;
901 	int addr_type;
902 
903 	rta = (struct rtattr **) _rtattr;
904 
905 	if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
906 		return -EINVAL;
907 #ifndef CONFIG_IPV6_SUBTREES
908 	if (rtmsg->rtmsg_src_len)
909 		return -EINVAL;
910 #endif
911 	if (rtmsg->rtmsg_ifindex) {
912 		err = -ENODEV;
913 		dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
914 		if (!dev)
915 			goto out;
916 		idev = in6_dev_get(dev);
917 		if (!idev)
918 			goto out;
919 	}
920 
921 	if (rtmsg->rtmsg_metric == 0)
922 		rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
923 
924 	rt = ip6_dst_alloc();
925 
926 	if (rt == NULL) {
927 		err = -ENOMEM;
928 		goto out;
929 	}
930 
931 	rt->u.dst.obsolete = -1;
932 	rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
933 	if (nlh && (r = NLMSG_DATA(nlh))) {
934 		rt->rt6i_protocol = r->rtm_protocol;
935 	} else {
936 		rt->rt6i_protocol = RTPROT_BOOT;
937 	}
938 
939 	addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
940 
941 	if (addr_type & IPV6_ADDR_MULTICAST)
942 		rt->u.dst.input = ip6_mc_input;
943 	else
944 		rt->u.dst.input = ip6_forward;
945 
946 	rt->u.dst.output = ip6_output;
947 
948 	ipv6_addr_prefix(&rt->rt6i_dst.addr,
949 			 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
950 	rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
951 	if (rt->rt6i_dst.plen == 128)
952 	       rt->u.dst.flags = DST_HOST;
953 
954 #ifdef CONFIG_IPV6_SUBTREES
955 	ipv6_addr_prefix(&rt->rt6i_src.addr,
956 			 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
957 	rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
958 #endif
959 
960 	rt->rt6i_metric = rtmsg->rtmsg_metric;
961 
962 	/* We cannot add true routes via loopback here,
963 	   they would result in kernel looping; promote them to reject routes
964 	 */
965 	if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
966 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
967 		/* hold loopback dev/idev if we haven't done so. */
968 		if (dev != &loopback_dev) {
969 			if (dev) {
970 				dev_put(dev);
971 				in6_dev_put(idev);
972 			}
973 			dev = &loopback_dev;
974 			dev_hold(dev);
975 			idev = in6_dev_get(dev);
976 			if (!idev) {
977 				err = -ENODEV;
978 				goto out;
979 			}
980 		}
981 		rt->u.dst.output = ip6_pkt_discard_out;
982 		rt->u.dst.input = ip6_pkt_discard;
983 		rt->u.dst.error = -ENETUNREACH;
984 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
985 		goto install_route;
986 	}
987 
988 	if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
989 		struct in6_addr *gw_addr;
990 		int gwa_type;
991 
992 		gw_addr = &rtmsg->rtmsg_gateway;
993 		ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
994 		gwa_type = ipv6_addr_type(gw_addr);
995 
996 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
997 			struct rt6_info *grt;
998 
999 			/* IPv6 strictly inhibits using not link-local
1000 			   addresses as nexthop address.
1001 			   Otherwise, router will not able to send redirects.
1002 			   It is very good, but in some (rare!) circumstances
1003 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1004 			   some exceptions. --ANK
1005 			 */
1006 			err = -EINVAL;
1007 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1008 				goto out;
1009 
1010 			grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1011 
1012 			err = -EHOSTUNREACH;
1013 			if (grt == NULL)
1014 				goto out;
1015 			if (dev) {
1016 				if (dev != grt->rt6i_dev) {
1017 					dst_release(&grt->u.dst);
1018 					goto out;
1019 				}
1020 			} else {
1021 				dev = grt->rt6i_dev;
1022 				idev = grt->rt6i_idev;
1023 				dev_hold(dev);
1024 				in6_dev_hold(grt->rt6i_idev);
1025 			}
1026 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1027 				err = 0;
1028 			dst_release(&grt->u.dst);
1029 
1030 			if (err)
1031 				goto out;
1032 		}
1033 		err = -EINVAL;
1034 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1035 			goto out;
1036 	}
1037 
1038 	err = -ENODEV;
1039 	if (dev == NULL)
1040 		goto out;
1041 
1042 	if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1043 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1044 		if (IS_ERR(rt->rt6i_nexthop)) {
1045 			err = PTR_ERR(rt->rt6i_nexthop);
1046 			rt->rt6i_nexthop = NULL;
1047 			goto out;
1048 		}
1049 	}
1050 
1051 	rt->rt6i_flags = rtmsg->rtmsg_flags;
1052 
1053 install_route:
1054 	if (rta && rta[RTA_METRICS-1]) {
1055 		int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1056 		struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1057 
1058 		while (RTA_OK(attr, attrlen)) {
1059 			unsigned flavor = attr->rta_type;
1060 			if (flavor) {
1061 				if (flavor > RTAX_MAX) {
1062 					err = -EINVAL;
1063 					goto out;
1064 				}
1065 				rt->u.dst.metrics[flavor-1] =
1066 					*(u32 *)RTA_DATA(attr);
1067 			}
1068 			attr = RTA_NEXT(attr, attrlen);
1069 		}
1070 	}
1071 
1072 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1073 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1074 	if (!rt->u.dst.metrics[RTAX_MTU-1])
1075 		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1076 	if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1077 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1078 	rt->u.dst.dev = dev;
1079 	rt->rt6i_idev = idev;
1080 	return ip6_ins_rt(rt, nlh, _rtattr, req);
1081 
1082 out:
1083 	if (dev)
1084 		dev_put(dev);
1085 	if (idev)
1086 		in6_dev_put(idev);
1087 	if (rt)
1088 		dst_free((struct dst_entry *) rt);
1089 	return err;
1090 }
1091 
1092 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1093 {
1094 	int err;
1095 
1096 	write_lock_bh(&rt6_lock);
1097 
1098 	err = fib6_del(rt, nlh, _rtattr, req);
1099 	dst_release(&rt->u.dst);
1100 
1101 	write_unlock_bh(&rt6_lock);
1102 
1103 	return err;
1104 }
1105 
1106 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1107 {
1108 	struct fib6_node *fn;
1109 	struct rt6_info *rt;
1110 	int err = -ESRCH;
1111 
1112 	read_lock_bh(&rt6_lock);
1113 
1114 	fn = fib6_locate(&ip6_routing_table,
1115 			 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1116 			 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1117 
1118 	if (fn) {
1119 		for (rt = fn->leaf; rt; rt = rt->u.next) {
1120 			if (rtmsg->rtmsg_ifindex &&
1121 			    (rt->rt6i_dev == NULL ||
1122 			     rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1123 				continue;
1124 			if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1125 			    !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1126 				continue;
1127 			if (rtmsg->rtmsg_metric &&
1128 			    rtmsg->rtmsg_metric != rt->rt6i_metric)
1129 				continue;
1130 			dst_hold(&rt->u.dst);
1131 			read_unlock_bh(&rt6_lock);
1132 
1133 			return ip6_del_rt(rt, nlh, _rtattr, req);
1134 		}
1135 	}
1136 	read_unlock_bh(&rt6_lock);
1137 
1138 	return err;
1139 }
1140 
1141 /*
1142  *	Handle redirects
1143  */
1144 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1145 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1146 {
1147 	struct rt6_info *rt, *nrt = NULL;
1148 	int strict;
1149 	struct fib6_node *fn;
1150 
1151 	/*
1152 	 * Get the "current" route for this destination and
	 * check if the redirect has come from the appropriate router.
1154 	 *
1155 	 * RFC 2461 specifies that redirects should only be
1156 	 * accepted if they come from the nexthop to the target.
1157 	 * Due to the way the routes are chosen, this notion
1158 	 * is a bit fuzzy and one might need to check all possible
1159 	 * routes.
1160 	 */
1161 	strict = ipv6_addr_type(dest) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL);
1162 
1163 	read_lock_bh(&rt6_lock);
1164 	fn = fib6_lookup(&ip6_routing_table, dest, NULL);
1165 restart:
1166 	for (rt = fn->leaf; rt; rt = rt->u.next) {
1167 		/*
1168 		 * Current route is on-link; redirect is always invalid.
1169 		 *
1170 		 * Seems, previous statement is not true. It could
1171 		 * be node, which looks for us as on-link (f.e. proxy ndisc)
1172 		 * But then router serving it might decide, that we should
1173 		 * know truth 8)8) --ANK (980726).
1174 		 */
1175 		if (rt6_check_expired(rt))
1176 			continue;
1177 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1178 			continue;
1179 		if (neigh->dev != rt->rt6i_dev)
1180 			continue;
1181 		if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1182 			continue;
1183 		break;
1184 	}
1185 	if (rt)
1186 		dst_hold(&rt->u.dst);
1187 	else if (strict) {
1188 		while ((fn = fn->parent) != NULL) {
1189 			if (fn->fn_flags & RTN_ROOT)
1190 				break;
1191 			if (fn->fn_flags & RTN_RTINFO)
1192 				goto restart;
1193 		}
1194 	}
1195 	read_unlock_bh(&rt6_lock);
1196 
1197 	if (!rt) {
1198 		if (net_ratelimit())
1199 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1200 			       "for redirect target\n");
1201 		return;
1202 	}
1203 
1204 	/*
1205 	 *	We have finally decided to accept it.
1206 	 */
1207 
1208 	neigh_update(neigh, lladdr, NUD_STALE,
1209 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1210 		     NEIGH_UPDATE_F_OVERRIDE|
1211 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1212 				     NEIGH_UPDATE_F_ISROUTER))
1213 		     );
1214 
1215 	/*
1216 	 * Redirect received -> path was valid.
1217 	 * Look, redirects are sent only in response to data packets,
1218 	 * so that this nexthop apparently is reachable. --ANK
1219 	 */
1220 	dst_confirm(&rt->u.dst);
1221 
1222 	/* Duplicate redirect: silently ignore. */
1223 	if (neigh == rt->u.dst.neighbour)
1224 		goto out;
1225 
1226 	nrt = ip6_rt_copy(rt);
1227 	if (nrt == NULL)
1228 		goto out;
1229 
1230 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1231 	if (on_link)
1232 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1233 
1234 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1235 	nrt->rt6i_dst.plen = 128;
1236 	nrt->u.dst.flags |= DST_HOST;
1237 
1238 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1239 	nrt->rt6i_nexthop = neigh_clone(neigh);
1240 	/* Reset pmtu, it may be better */
1241 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1242 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1243 
1244 	if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1245 		goto out;
1246 
1247 	if (rt->rt6i_flags&RTF_CACHE) {
1248 		ip6_del_rt(rt, NULL, NULL, NULL);
1249 		return;
1250 	}
1251 
1252 out:
1253         dst_release(&rt->u.dst);
1254 	return;
1255 }
1256 
1257 /*
1258  *	Handle ICMP "packet too big" messages
1259  *	i.e. Path MTU discovery
1260  */
1261 
1262 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1263 			struct net_device *dev, u32 pmtu)
1264 {
1265 	struct rt6_info *rt, *nrt;
1266 	int allfrag = 0;
1267 
1268 	rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1269 	if (rt == NULL)
1270 		return;
1271 
1272 	if (pmtu >= dst_mtu(&rt->u.dst))
1273 		goto out;
1274 
1275 	if (pmtu < IPV6_MIN_MTU) {
1276 		/*
1277 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1278 		 * MTU (1280) and a fragment header should always be included
1279 		 * after a node receiving Too Big message reporting PMTU is
1280 		 * less than the IPv6 Minimum Link MTU.
1281 		 */
1282 		pmtu = IPV6_MIN_MTU;
1283 		allfrag = 1;
1284 	}
1285 
1286 	/* New mtu received -> path was valid.
1287 	   They are sent only in response to data packets,
1288 	   so that this nexthop apparently is reachable. --ANK
1289 	 */
1290 	dst_confirm(&rt->u.dst);
1291 
1292 	/* Host route. If it is static, it would be better
1293 	   not to override it, but add new one, so that
1294 	   when cache entry will expire old pmtu
1295 	   would return automatically.
1296 	 */
1297 	if (rt->rt6i_flags & RTF_CACHE) {
1298 		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1299 		if (allfrag)
1300 			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1301 		dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1302 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1303 		goto out;
1304 	}
1305 
1306 	/* Network route.
1307 	   Two cases are possible:
1308 	   1. It is connected route. Action: COW
1309 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1310 	 */
1311 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1312 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1313 	else
1314 		nrt = rt6_alloc_clone(rt, daddr);
1315 
1316 	if (nrt) {
1317 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1318 		if (allfrag)
1319 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1320 
1321 		/* According to RFC 1981, detecting PMTU increase shouldn't be
1322 		 * happened within 5 mins, the recommended timer is 10 mins.
1323 		 * Here this route expiration time is set to ip6_rt_mtu_expires
1324 		 * which is 10 mins. After 10 mins the decreased pmtu is expired
1325 		 * and detecting PMTU increase will be automatically happened.
1326 		 */
1327 		dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1328 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1329 
1330 		ip6_ins_rt(nrt, NULL, NULL, NULL);
1331 	}
1332 out:
1333 	dst_release(&rt->u.dst);
1334 }
1335 
1336 /*
1337  *	Misc support functions
1338  */
1339 
1340 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1341 {
1342 	struct rt6_info *rt = ip6_dst_alloc();
1343 
1344 	if (rt) {
1345 		rt->u.dst.input = ort->u.dst.input;
1346 		rt->u.dst.output = ort->u.dst.output;
1347 
1348 		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1349 		rt->u.dst.dev = ort->u.dst.dev;
1350 		if (rt->u.dst.dev)
1351 			dev_hold(rt->u.dst.dev);
1352 		rt->rt6i_idev = ort->rt6i_idev;
1353 		if (rt->rt6i_idev)
1354 			in6_dev_hold(rt->rt6i_idev);
1355 		rt->u.dst.lastuse = jiffies;
1356 		rt->rt6i_expires = 0;
1357 
1358 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1359 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1360 		rt->rt6i_metric = 0;
1361 
1362 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1363 #ifdef CONFIG_IPV6_SUBTREES
1364 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1365 #endif
1366 	}
1367 	return rt;
1368 }
1369 
1370 #ifdef CONFIG_IPV6_ROUTE_INFO
1371 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1372 					   struct in6_addr *gwaddr, int ifindex)
1373 {
1374 	struct fib6_node *fn;
1375 	struct rt6_info *rt = NULL;
1376 
1377 	write_lock_bh(&rt6_lock);
1378 	fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1379 	if (!fn)
1380 		goto out;
1381 
1382 	for (rt = fn->leaf; rt; rt = rt->u.next) {
1383 		if (rt->rt6i_dev->ifindex != ifindex)
1384 			continue;
1385 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1386 			continue;
1387 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1388 			continue;
1389 		dst_hold(&rt->u.dst);
1390 		break;
1391 	}
1392 out:
1393 	write_unlock_bh(&rt6_lock);
1394 	return rt;
1395 }
1396 
1397 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1398 					   struct in6_addr *gwaddr, int ifindex,
1399 					   unsigned pref)
1400 {
1401 	struct in6_rtmsg rtmsg;
1402 
1403 	memset(&rtmsg, 0, sizeof(rtmsg));
1404 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1405 	ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1406 	rtmsg.rtmsg_dst_len = prefixlen;
1407 	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1408 	rtmsg.rtmsg_metric = 1024;
1409 	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1410 	/* We should treat it as a default route if prefix length is 0. */
1411 	if (!prefixlen)
1412 		rtmsg.rtmsg_flags |= RTF_DEFAULT;
1413 	rtmsg.rtmsg_ifindex = ifindex;
1414 
1415 	ip6_route_add(&rtmsg, NULL, NULL, NULL);
1416 
1417 	return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1418 }
1419 #endif
1420 
1421 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1422 {
1423 	struct rt6_info *rt;
1424 	struct fib6_node *fn;
1425 
1426 	fn = &ip6_routing_table;
1427 
1428 	write_lock_bh(&rt6_lock);
1429 	for (rt = fn->leaf; rt; rt=rt->u.next) {
1430 		if (dev == rt->rt6i_dev &&
1431 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1432 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1433 			break;
1434 	}
1435 	if (rt)
1436 		dst_hold(&rt->u.dst);
1437 	write_unlock_bh(&rt6_lock);
1438 	return rt;
1439 }
1440 
1441 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1442 				     struct net_device *dev,
1443 				     unsigned int pref)
1444 {
1445 	struct in6_rtmsg rtmsg;
1446 
1447 	memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1448 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1449 	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1450 	rtmsg.rtmsg_metric = 1024;
1451 	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1452 			    RTF_PREF(pref);
1453 
1454 	rtmsg.rtmsg_ifindex = dev->ifindex;
1455 
1456 	ip6_route_add(&rtmsg, NULL, NULL, NULL);
1457 	return rt6_get_dflt_router(gwaddr, dev);
1458 }
1459 
1460 void rt6_purge_dflt_routers(void)
1461 {
1462 	struct rt6_info *rt;
1463 
1464 restart:
1465 	read_lock_bh(&rt6_lock);
1466 	for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1467 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1468 			dst_hold(&rt->u.dst);
1469 
1470 			read_unlock_bh(&rt6_lock);
1471 
1472 			ip6_del_rt(rt, NULL, NULL, NULL);
1473 
1474 			goto restart;
1475 		}
1476 	}
1477 	read_unlock_bh(&rt6_lock);
1478 }
1479 
1480 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1481 {
1482 	struct in6_rtmsg rtmsg;
1483 	int err;
1484 
1485 	switch(cmd) {
1486 	case SIOCADDRT:		/* Add a route */
1487 	case SIOCDELRT:		/* Delete a route */
1488 		if (!capable(CAP_NET_ADMIN))
1489 			return -EPERM;
1490 		err = copy_from_user(&rtmsg, arg,
1491 				     sizeof(struct in6_rtmsg));
1492 		if (err)
1493 			return -EFAULT;
1494 
1495 		rtnl_lock();
1496 		switch (cmd) {
1497 		case SIOCADDRT:
1498 			err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1499 			break;
1500 		case SIOCDELRT:
1501 			err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1502 			break;
1503 		default:
1504 			err = -EINVAL;
1505 		}
1506 		rtnl_unlock();
1507 
1508 		return err;
1509 	};
1510 
1511 	return -EINVAL;
1512 }
1513 
1514 /*
1515  *	Drop the packet on the floor
1516  */
1517 
1518 static int ip6_pkt_discard(struct sk_buff *skb)
1519 {
1520 	IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1521 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1522 	kfree_skb(skb);
1523 	return 0;
1524 }
1525 
1526 static int ip6_pkt_discard_out(struct sk_buff *skb)
1527 {
1528 	skb->dev = skb->dst->dev;
1529 	return ip6_pkt_discard(skb);
1530 }
1531 
1532 /*
1533  *	Allocate a dst for local (unicast / anycast) address.
1534  */
1535 
1536 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1537 				    const struct in6_addr *addr,
1538 				    int anycast)
1539 {
1540 	struct rt6_info *rt = ip6_dst_alloc();
1541 
1542 	if (rt == NULL)
1543 		return ERR_PTR(-ENOMEM);
1544 
1545 	dev_hold(&loopback_dev);
1546 	in6_dev_hold(idev);
1547 
1548 	rt->u.dst.flags = DST_HOST;
1549 	rt->u.dst.input = ip6_input;
1550 	rt->u.dst.output = ip6_output;
1551 	rt->rt6i_dev = &loopback_dev;
1552 	rt->rt6i_idev = idev;
1553 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1554 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1555 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1556 	rt->u.dst.obsolete = -1;
1557 
1558 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1559 	if (anycast)
1560 		rt->rt6i_flags |= RTF_ANYCAST;
1561 	else
1562 		rt->rt6i_flags |= RTF_LOCAL;
1563 	rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1564 	if (rt->rt6i_nexthop == NULL) {
1565 		dst_free((struct dst_entry *) rt);
1566 		return ERR_PTR(-ENOMEM);
1567 	}
1568 
1569 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1570 	rt->rt6i_dst.plen = 128;
1571 
1572 	atomic_set(&rt->u.dst.__refcnt, 1);
1573 
1574 	return rt;
1575 }
1576 
1577 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1578 {
1579 	if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1580 	    rt != &ip6_null_entry) {
1581 		RT6_TRACE("deleted by ifdown %p\n", rt);
1582 		return -1;
1583 	}
1584 	return 0;
1585 }
1586 
1587 void rt6_ifdown(struct net_device *dev)
1588 {
1589 	write_lock_bh(&rt6_lock);
1590 	fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1591 	write_unlock_bh(&rt6_lock);
1592 }
1593 
/* Argument bundle handed from rt6_mtu_change() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* its new MTU */
};
1599 
1600 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1601 {
1602 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1603 	struct inet6_dev *idev;
1604 
1605 	/* In IPv6 pmtu discovery is not optional,
1606 	   so that RTAX_MTU lock cannot disable it.
1607 	   We still use this lock to block changes
1608 	   caused by addrconf/ndisc.
1609 	*/
1610 
1611 	idev = __in6_dev_get(arg->dev);
1612 	if (idev == NULL)
1613 		return 0;
1614 
1615 	/* For administrative MTU increase, there is no way to discover
1616 	   IPv6 PMTU increase, so PMTU increase should be updated here.
1617 	   Since RFC 1981 doesn't include administrative MTU increase
1618 	   update PMTU increase is a MUST. (i.e. jumbo frame)
1619 	 */
1620 	/*
1621 	   If new MTU is less than route PMTU, this new MTU will be the
1622 	   lowest MTU in the path, update the route PMTU to reflect PMTU
1623 	   decreases; if new MTU is greater than route PMTU, and the
1624 	   old MTU is the lowest MTU in the path, update the route PMTU
1625 	   to reflect the increase. In this case if the other nodes' MTU
1626 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
1627 	   PMTU discouvery.
1628 	 */
1629 	if (rt->rt6i_dev == arg->dev &&
1630 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1631             (dst_mtu(&rt->u.dst) > arg->mtu ||
1632              (dst_mtu(&rt->u.dst) < arg->mtu &&
1633 	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1634 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1635 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1636 	return 0;
1637 }
1638 
1639 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1640 {
1641 	struct rt6_mtu_change_arg arg;
1642 
1643 	arg.dev = dev;
1644 	arg.mtu = mtu;
1645 	read_lock_bh(&rt6_lock);
1646 	fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1647 	read_unlock_bh(&rt6_lock);
1648 }
1649 
1650 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1651 			      struct in6_rtmsg *rtmsg)
1652 {
1653 	memset(rtmsg, 0, sizeof(*rtmsg));
1654 
1655 	rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1656 	rtmsg->rtmsg_src_len = r->rtm_src_len;
1657 	rtmsg->rtmsg_flags = RTF_UP;
1658 	if (r->rtm_type == RTN_UNREACHABLE)
1659 		rtmsg->rtmsg_flags |= RTF_REJECT;
1660 
1661 	if (rta[RTA_GATEWAY-1]) {
1662 		if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1663 			return -EINVAL;
1664 		memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1665 		rtmsg->rtmsg_flags |= RTF_GATEWAY;
1666 	}
1667 	if (rta[RTA_DST-1]) {
1668 		if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1669 			return -EINVAL;
1670 		memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1671 	}
1672 	if (rta[RTA_SRC-1]) {
1673 		if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1674 			return -EINVAL;
1675 		memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1676 	}
1677 	if (rta[RTA_OIF-1]) {
1678 		if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1679 			return -EINVAL;
1680 		memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1681 	}
1682 	if (rta[RTA_PRIORITY-1]) {
1683 		if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1684 			return -EINVAL;
1685 		memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1686 	}
1687 	return 0;
1688 }
1689 
1690 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1691 {
1692 	struct rtmsg *r = NLMSG_DATA(nlh);
1693 	struct in6_rtmsg rtmsg;
1694 
1695 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1696 		return -EINVAL;
1697 	return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1698 }
1699 
1700 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1701 {
1702 	struct rtmsg *r = NLMSG_DATA(nlh);
1703 	struct in6_rtmsg rtmsg;
1704 
1705 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1706 		return -EINVAL;
1707 	return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1708 }
1709 
/* Per-call context that inet6_dump_fib() hands to the tree walker. */
struct rt6_rtnl_dump_arg {
	struct sk_buff *skb;		/* buffer the dump is written into */
	struct netlink_callback *cb;	/* netlink dump callback state */
};
1715 
1716 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1717 			 struct in6_addr *dst, struct in6_addr *src,
1718 			 int iif, int type, u32 pid, u32 seq,
1719 			 int prefix, unsigned int flags)
1720 {
1721 	struct rtmsg *rtm;
1722 	struct nlmsghdr  *nlh;
1723 	unsigned char	 *b = skb->tail;
1724 	struct rta_cacheinfo ci;
1725 
1726 	if (prefix) {	/* user wants prefix routes only */
1727 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1728 			/* success since this is not a prefix route */
1729 			return 1;
1730 		}
1731 	}
1732 
1733 	nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1734 	rtm = NLMSG_DATA(nlh);
1735 	rtm->rtm_family = AF_INET6;
1736 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
1737 	rtm->rtm_src_len = rt->rt6i_src.plen;
1738 	rtm->rtm_tos = 0;
1739 	rtm->rtm_table = RT_TABLE_MAIN;
1740 	if (rt->rt6i_flags&RTF_REJECT)
1741 		rtm->rtm_type = RTN_UNREACHABLE;
1742 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1743 		rtm->rtm_type = RTN_LOCAL;
1744 	else
1745 		rtm->rtm_type = RTN_UNICAST;
1746 	rtm->rtm_flags = 0;
1747 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1748 	rtm->rtm_protocol = rt->rt6i_protocol;
1749 	if (rt->rt6i_flags&RTF_DYNAMIC)
1750 		rtm->rtm_protocol = RTPROT_REDIRECT;
1751 	else if (rt->rt6i_flags & RTF_ADDRCONF)
1752 		rtm->rtm_protocol = RTPROT_KERNEL;
1753 	else if (rt->rt6i_flags&RTF_DEFAULT)
1754 		rtm->rtm_protocol = RTPROT_RA;
1755 
1756 	if (rt->rt6i_flags&RTF_CACHE)
1757 		rtm->rtm_flags |= RTM_F_CLONED;
1758 
1759 	if (dst) {
1760 		RTA_PUT(skb, RTA_DST, 16, dst);
1761 	        rtm->rtm_dst_len = 128;
1762 	} else if (rtm->rtm_dst_len)
1763 		RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1764 #ifdef CONFIG_IPV6_SUBTREES
1765 	if (src) {
1766 		RTA_PUT(skb, RTA_SRC, 16, src);
1767 	        rtm->rtm_src_len = 128;
1768 	} else if (rtm->rtm_src_len)
1769 		RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1770 #endif
1771 	if (iif)
1772 		RTA_PUT(skb, RTA_IIF, 4, &iif);
1773 	else if (dst) {
1774 		struct in6_addr saddr_buf;
1775 		if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1776 			RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1777 	}
1778 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1779 		goto rtattr_failure;
1780 	if (rt->u.dst.neighbour)
1781 		RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1782 	if (rt->u.dst.dev)
1783 		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1784 	RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1785 	ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1786 	if (rt->rt6i_expires)
1787 		ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1788 	else
1789 		ci.rta_expires = 0;
1790 	ci.rta_used = rt->u.dst.__use;
1791 	ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1792 	ci.rta_error = rt->u.dst.error;
1793 	ci.rta_id = 0;
1794 	ci.rta_ts = 0;
1795 	ci.rta_tsage = 0;
1796 	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1797 	nlh->nlmsg_len = skb->tail - b;
1798 	return skb->len;
1799 
1800 nlmsg_failure:
1801 rtattr_failure:
1802 	skb_trim(skb, b - skb->data);
1803 	return -1;
1804 }
1805 
1806 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1807 {
1808 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1809 	int prefix;
1810 
1811 	if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1812 		struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1813 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1814 	} else
1815 		prefix = 0;
1816 
1817 	return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1818 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1819 		     prefix, NLM_F_MULTI);
1820 }
1821 
1822 static int fib6_dump_node(struct fib6_walker_t *w)
1823 {
1824 	int res;
1825 	struct rt6_info *rt;
1826 
1827 	for (rt = w->leaf; rt; rt = rt->u.next) {
1828 		res = rt6_dump_route(rt, w->args);
1829 		if (res < 0) {
1830 			/* Frame is full, suspend walking */
1831 			w->leaf = rt;
1832 			return 1;
1833 		}
1834 		BUG_TRAP(res!=0);
1835 	}
1836 	w->leaf = NULL;
1837 	return 0;
1838 }
1839 
1840 static void fib6_dump_end(struct netlink_callback *cb)
1841 {
1842 	struct fib6_walker_t *w = (void*)cb->args[0];
1843 
1844 	if (w) {
1845 		cb->args[0] = 0;
1846 		fib6_walker_unlink(w);
1847 		kfree(w);
1848 	}
1849 	cb->done = (void*)cb->args[1];
1850 	cb->args[1] = 0;
1851 }
1852 
1853 static int fib6_dump_done(struct netlink_callback *cb)
1854 {
1855 	fib6_dump_end(cb);
1856 	return cb->done ? cb->done(cb) : 0;
1857 }
1858 
1859 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1860 {
1861 	struct rt6_rtnl_dump_arg arg;
1862 	struct fib6_walker_t *w;
1863 	int res;
1864 
1865 	arg.skb = skb;
1866 	arg.cb = cb;
1867 
1868 	w = (void*)cb->args[0];
1869 	if (w == NULL) {
1870 		/* New dump:
1871 		 *
1872 		 * 1. hook callback destructor.
1873 		 */
1874 		cb->args[1] = (long)cb->done;
1875 		cb->done = fib6_dump_done;
1876 
1877 		/*
1878 		 * 2. allocate and initialize walker.
1879 		 */
1880 		w = kzalloc(sizeof(*w), GFP_ATOMIC);
1881 		if (w == NULL)
1882 			return -ENOMEM;
1883 		RT6_TRACE("dump<%p", w);
1884 		w->root = &ip6_routing_table;
1885 		w->func = fib6_dump_node;
1886 		w->args = &arg;
1887 		cb->args[0] = (long)w;
1888 		read_lock_bh(&rt6_lock);
1889 		res = fib6_walk(w);
1890 		read_unlock_bh(&rt6_lock);
1891 	} else {
1892 		w->args = &arg;
1893 		read_lock_bh(&rt6_lock);
1894 		res = fib6_walk_continue(w);
1895 		read_unlock_bh(&rt6_lock);
1896 	}
1897 #if RT6_DEBUG >= 3
1898 	if (res <= 0 && skb->len == 0)
1899 		RT6_TRACE("%p>dump end\n", w);
1900 #endif
1901 	res = res < 0 ? res : skb->len;
1902 	/* res < 0 is an error. (really, impossible)
1903 	   res == 0 means that dump is complete, but skb still can contain data.
1904 	   res > 0 dump is not complete, but frame is full.
1905 	 */
1906 	/* Destroy walker, if dump of this table is complete. */
1907 	if (res <= 0)
1908 		fib6_dump_end(cb);
1909 	return res;
1910 }
1911 
1912 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1913 {
1914 	struct rtattr **rta = arg;
1915 	int iif = 0;
1916 	int err = -ENOBUFS;
1917 	struct sk_buff *skb;
1918 	struct flowi fl;
1919 	struct rt6_info *rt;
1920 
1921 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1922 	if (skb == NULL)
1923 		goto out;
1924 
1925 	/* Reserve room for dummy headers, this skb can pass
1926 	   through good chunk of routing engine.
1927 	 */
1928 	skb->mac.raw = skb->data;
1929 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1930 
1931 	memset(&fl, 0, sizeof(fl));
1932 	if (rta[RTA_SRC-1])
1933 		ipv6_addr_copy(&fl.fl6_src,
1934 			       (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1935 	if (rta[RTA_DST-1])
1936 		ipv6_addr_copy(&fl.fl6_dst,
1937 			       (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1938 
1939 	if (rta[RTA_IIF-1])
1940 		memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1941 
1942 	if (iif) {
1943 		struct net_device *dev;
1944 		dev = __dev_get_by_index(iif);
1945 		if (!dev) {
1946 			err = -ENODEV;
1947 			goto out_free;
1948 		}
1949 	}
1950 
1951 	fl.oif = 0;
1952 	if (rta[RTA_OIF-1])
1953 		memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1954 
1955 	rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1956 
1957 	skb->dst = &rt->u.dst;
1958 
1959 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1960 	err = rt6_fill_node(skb, rt,
1961 			    &fl.fl6_dst, &fl.fl6_src,
1962 			    iif,
1963 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1964 			    nlh->nlmsg_seq, 0, 0);
1965 	if (err < 0) {
1966 		err = -EMSGSIZE;
1967 		goto out_free;
1968 	}
1969 
1970 	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1971 	if (err > 0)
1972 		err = 0;
1973 out:
1974 	return err;
1975 out_free:
1976 	kfree_skb(skb);
1977 	goto out;
1978 }
1979 
1980 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
1981 			struct netlink_skb_parms *req)
1982 {
1983 	struct sk_buff *skb;
1984 	int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
1985 	u32 pid = current->pid;
1986 	u32 seq = 0;
1987 
1988 	if (req)
1989 		pid = req->pid;
1990 	if (nlh)
1991 		seq = nlh->nlmsg_seq;
1992 
1993 	skb = alloc_skb(size, gfp_any());
1994 	if (!skb) {
1995 		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
1996 		return;
1997 	}
1998 	if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
1999 		kfree_skb(skb);
2000 		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
2001 		return;
2002 	}
2003 	NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2004 	netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2005 }
2006 
2007 /*
2008  *	/proc
2009  */
2010 
2011 #ifdef CONFIG_PROC_FS
2012 
/*
 * Nominal length of one line of /proc/net/ipv6_route; the read offset is
 * divided by this value to work out how many routes to skip.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State shared between rt6_proc_info() and its per-route callback. */
struct rt6_proc_arg {
	char *buffer;	/* output buffer */
	int offset;	/* read offset requested by the caller */
	int length;	/* size limit requested by the caller */
	int skip;	/* number of routes skipped so far */
	int len;	/* number of bytes written to buffer so far */
};
2023 
2024 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2025 {
2026 	struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2027 	int i;
2028 
2029 	if (arg->skip < arg->offset / RT6_INFO_LEN) {
2030 		arg->skip++;
2031 		return 0;
2032 	}
2033 
2034 	if (arg->len >= arg->length)
2035 		return 0;
2036 
2037 	for (i=0; i<16; i++) {
2038 		sprintf(arg->buffer + arg->len, "%02x",
2039 			rt->rt6i_dst.addr.s6_addr[i]);
2040 		arg->len += 2;
2041 	}
2042 	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2043 			    rt->rt6i_dst.plen);
2044 
2045 #ifdef CONFIG_IPV6_SUBTREES
2046 	for (i=0; i<16; i++) {
2047 		sprintf(arg->buffer + arg->len, "%02x",
2048 			rt->rt6i_src.addr.s6_addr[i]);
2049 		arg->len += 2;
2050 	}
2051 	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2052 			    rt->rt6i_src.plen);
2053 #else
2054 	sprintf(arg->buffer + arg->len,
2055 		"00000000000000000000000000000000 00 ");
2056 	arg->len += 36;
2057 #endif
2058 
2059 	if (rt->rt6i_nexthop) {
2060 		for (i=0; i<16; i++) {
2061 			sprintf(arg->buffer + arg->len, "%02x",
2062 				rt->rt6i_nexthop->primary_key[i]);
2063 			arg->len += 2;
2064 		}
2065 	} else {
2066 		sprintf(arg->buffer + arg->len,
2067 			"00000000000000000000000000000000");
2068 		arg->len += 32;
2069 	}
2070 	arg->len += sprintf(arg->buffer + arg->len,
2071 			    " %08x %08x %08x %08x %8s\n",
2072 			    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2073 			    rt->u.dst.__use, rt->rt6i_flags,
2074 			    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2075 	return 0;
2076 }
2077 
2078 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2079 {
2080 	struct rt6_proc_arg arg;
2081 	arg.buffer = buffer;
2082 	arg.offset = offset;
2083 	arg.length = length;
2084 	arg.skip = 0;
2085 	arg.len = 0;
2086 
2087 	read_lock_bh(&rt6_lock);
2088 	fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2089 	read_unlock_bh(&rt6_lock);
2090 
2091 	*start = buffer;
2092 	if (offset)
2093 		*start += offset % RT6_INFO_LEN;
2094 
2095 	arg.len -= offset % RT6_INFO_LEN;
2096 
2097 	if (arg.len > length)
2098 		arg.len = length;
2099 	if (arg.len < 0)
2100 		arg.len = 0;
2101 
2102 	return arg.len;
2103 }
2104 
2105 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2106 {
2107 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2108 		      rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2109 		      rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2110 		      rt6_stats.fib_rt_cache,
2111 		      atomic_read(&ip6_dst_ops.entries),
2112 		      rt6_stats.fib_discarded_routes);
2113 
2114 	return 0;
2115 }
2116 
2117 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2118 {
2119 	return single_open(file, rt6_stats_seq_show, NULL);
2120 }
2121 
2122 static struct file_operations rt6_stats_seq_fops = {
2123 	.owner	 = THIS_MODULE,
2124 	.open	 = rt6_stats_seq_open,
2125 	.read	 = seq_read,
2126 	.llseek	 = seq_lseek,
2127 	.release = single_release,
2128 };
2129 #endif	/* CONFIG_PROC_FS */
2130 
2131 #ifdef CONFIG_SYSCTL
2132 
2133 static int flush_delay;
2134 
2135 static
2136 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2137 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2138 {
2139 	if (write) {
2140 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2141 		fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2142 		return 0;
2143 	} else
2144 		return -EINVAL;
2145 }
2146 
2147 ctl_table ipv6_route_table[] = {
2148         {
2149 		.ctl_name	=	NET_IPV6_ROUTE_FLUSH,
2150 		.procname	=	"flush",
2151          	.data		=	&flush_delay,
2152 		.maxlen		=	sizeof(int),
2153 		.mode		=	0200,
2154          	.proc_handler	=	&ipv6_sysctl_rtcache_flush
2155 	},
2156 	{
2157 		.ctl_name	=	NET_IPV6_ROUTE_GC_THRESH,
2158 		.procname	=	"gc_thresh",
2159          	.data		=	&ip6_dst_ops.gc_thresh,
2160 		.maxlen		=	sizeof(int),
2161 		.mode		=	0644,
2162          	.proc_handler	=	&proc_dointvec,
2163 	},
2164 	{
2165 		.ctl_name	=	NET_IPV6_ROUTE_MAX_SIZE,
2166 		.procname	=	"max_size",
2167          	.data		=	&ip6_rt_max_size,
2168 		.maxlen		=	sizeof(int),
2169 		.mode		=	0644,
2170          	.proc_handler	=	&proc_dointvec,
2171 	},
2172 	{
2173 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2174 		.procname	=	"gc_min_interval",
2175          	.data		=	&ip6_rt_gc_min_interval,
2176 		.maxlen		=	sizeof(int),
2177 		.mode		=	0644,
2178          	.proc_handler	=	&proc_dointvec_jiffies,
2179 		.strategy	=	&sysctl_jiffies,
2180 	},
2181 	{
2182 		.ctl_name	=	NET_IPV6_ROUTE_GC_TIMEOUT,
2183 		.procname	=	"gc_timeout",
2184          	.data		=	&ip6_rt_gc_timeout,
2185 		.maxlen		=	sizeof(int),
2186 		.mode		=	0644,
2187          	.proc_handler	=	&proc_dointvec_jiffies,
2188 		.strategy	=	&sysctl_jiffies,
2189 	},
2190 	{
2191 		.ctl_name	=	NET_IPV6_ROUTE_GC_INTERVAL,
2192 		.procname	=	"gc_interval",
2193          	.data		=	&ip6_rt_gc_interval,
2194 		.maxlen		=	sizeof(int),
2195 		.mode		=	0644,
2196          	.proc_handler	=	&proc_dointvec_jiffies,
2197 		.strategy	=	&sysctl_jiffies,
2198 	},
2199 	{
2200 		.ctl_name	=	NET_IPV6_ROUTE_GC_ELASTICITY,
2201 		.procname	=	"gc_elasticity",
2202          	.data		=	&ip6_rt_gc_elasticity,
2203 		.maxlen		=	sizeof(int),
2204 		.mode		=	0644,
2205          	.proc_handler	=	&proc_dointvec_jiffies,
2206 		.strategy	=	&sysctl_jiffies,
2207 	},
2208 	{
2209 		.ctl_name	=	NET_IPV6_ROUTE_MTU_EXPIRES,
2210 		.procname	=	"mtu_expires",
2211          	.data		=	&ip6_rt_mtu_expires,
2212 		.maxlen		=	sizeof(int),
2213 		.mode		=	0644,
2214          	.proc_handler	=	&proc_dointvec_jiffies,
2215 		.strategy	=	&sysctl_jiffies,
2216 	},
2217 	{
2218 		.ctl_name	=	NET_IPV6_ROUTE_MIN_ADVMSS,
2219 		.procname	=	"min_adv_mss",
2220          	.data		=	&ip6_rt_min_advmss,
2221 		.maxlen		=	sizeof(int),
2222 		.mode		=	0644,
2223          	.proc_handler	=	&proc_dointvec_jiffies,
2224 		.strategy	=	&sysctl_jiffies,
2225 	},
2226 	{
2227 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2228 		.procname	=	"gc_min_interval_ms",
2229          	.data		=	&ip6_rt_gc_min_interval,
2230 		.maxlen		=	sizeof(int),
2231 		.mode		=	0644,
2232          	.proc_handler	=	&proc_dointvec_ms_jiffies,
2233 		.strategy	=	&sysctl_ms_jiffies,
2234 	},
2235 	{ .ctl_name = 0 }
2236 };
2237 
2238 #endif
2239 
2240 void __init ip6_route_init(void)
2241 {
2242 	struct proc_dir_entry *p;
2243 
2244 	ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2245 						     sizeof(struct rt6_info),
2246 						     0, SLAB_HWCACHE_ALIGN,
2247 						     NULL, NULL);
2248 	if (!ip6_dst_ops.kmem_cachep)
2249 		panic("cannot create ip6_dst_cache");
2250 
2251 	fib6_init();
2252 #ifdef 	CONFIG_PROC_FS
2253 	p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2254 	if (p)
2255 		p->owner = THIS_MODULE;
2256 
2257 	proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2258 #endif
2259 #ifdef CONFIG_XFRM
2260 	xfrm6_init();
2261 #endif
2262 }
2263 
2264 void ip6_route_cleanup(void)
2265 {
2266 #ifdef CONFIG_PROC_FS
2267 	proc_net_remove("ipv6_route");
2268 	proc_net_remove("rt6_stats");
2269 #endif
2270 #ifdef CONFIG_XFRM
2271 	xfrm6_fini();
2272 #endif
2273 	rt6_ifdown(NULL);
2274 	fib6_gc_cleanup();
2275 	kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2276 }
2277