xref: /linux/net/ipv6/route.c (revision 7b12b9137930eb821b68e1bfa11e9de692208620)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15 
16 /*	Changes:
17  *
18  *	YOSHIFUJI Hideaki @USAGI
19  *		reworked default router selection.
20  *		- respect outgoing interface
21  *		- select from (probably) reachable routers (i.e.
22  *		routers in REACHABLE, STALE, DELAY or PROBE states).
23  *		- always select the same router if it is (probably)
24  *		reachable.  otherwise, round-robin the list.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/config.h>
29 #include <linux/errno.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/init.h>
39 #include <linux/netlink.h>
40 #include <linux/if_arp.h>
41 
42 #ifdef 	CONFIG_PROC_FS
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #endif
46 
47 #include <net/snmp.h>
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
53 #include <net/tcp.h>
54 #include <linux/rtnetlink.h>
55 #include <net/dst.h>
56 #include <net/xfrm.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
/* Set to 3 to get tracing. */
#define RT6_DEBUG 2

/*
 * Debug helpers: with RT6_DEBUG >= 3 both macros expand to printk()
 * calls; otherwise they expand to nothing / an empty statement.
 */
#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif

/*
 * When non-zero, ip6_route_input()/ip6_route_output() clone routes that
 * already have a nexthop via rt6_alloc_clone(); when zero they use the
 * looked-up route directly.
 */
#define CLONE_OFFLINK_ROUTE 0

/* Flag bits carried in the "strict" argument of rt6_select()/rt6_score_route(). */
#define RT6_SELECT_F_IFACE	0x1	/* reject routes that fail rt6_check_dev() */
#define RT6_SELECT_F_REACHABLE	0x2	/* reject routes that fail rt6_check_neigh() */

/* Upper bound on dst entries checked by ip6_dst_gc(). */
static int ip6_rt_max_size = 4096;
/* Minimum spacing (jiffies) between GC runs while under ip6_rt_max_size. */
static int ip6_rt_gc_min_interval = HZ / 2;
/* ip6_dst_gc() resets its expiry to half of this once below gc_thresh. */
static int ip6_rt_gc_timeout = 60*HZ;
/* Non-static; not referenced in this part of the file. */
int ip6_rt_gc_interval = 30*HZ;
/* Shift used by ip6_dst_gc() to decay its expiry on every call. */
static int ip6_rt_gc_elasticity = 9;
/* Not referenced in this part of the file. */
static int ip6_rt_mtu_expires = 10*60*HZ;
/* Floor applied by ipv6_advmss(). */
static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
/*
 * Forward declarations: route copy helper, the dst_ops callbacks wired
 * into ip6_dst_ops below, and the discard/failure handlers.
 */
static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(void);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

/* Route Information option helpers, used by rt6_route_rcv(). */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
					   struct in6_addr *gwaddr, int ifindex,
					   unsigned pref);
static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
					   struct in6_addr *gwaddr, int ifindex);
#endif
108 
/*
 * dst_ops table for IPv6 routes.  Every rt6_info allocated through
 * ip6_dst_alloc() is bound to these callbacks.
 */
static struct dst_ops ip6_dst_ops = {
	.family			=	AF_INET6,
	.protocol		=	__constant_htons(ETH_P_IPV6),
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,	/* compared against .entries in ip6_dst_gc() */
	.check			=	ip6_dst_check,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.entry_size		=	sizeof(struct rt6_info),
};
122 
/*
 * Sentinel "no route" entry.  rt6_device_match() and rt6_select() return
 * a pointer to it when nothing matches.  It is a reject route bound to
 * the loopback device: both input and output discard the packet and the
 * dst error is -ENETUNREACH.
 */
struct rt6_info ip6_null_entry = {
	.u = {
		.dst = {
			.__refcnt	= ATOMIC_INIT(1),
			.__use		= 1,
			.dev		= &loopback_dev,
			.obsolete	= -1,
			.error		= -ENETUNREACH,
			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
			.input		= ip6_pkt_discard,
			.output		= ip6_pkt_discard_out,
			.ops		= &ip6_dst_ops,
			/* path points back at the entry itself */
			.path		= (struct dst_entry*)&ip6_null_entry,
		}
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,	/* largest possible metric value */
	.rt6i_ref	= ATOMIC_INIT(1),
};
142 
/*
 * Root node of the IPv6 FIB used by every lookup in this file.  Its leaf
 * starts out as ip6_null_entry.
 */
struct fib6_node ip6_routing_table = {
	.leaf		= &ip6_null_entry,
	.fn_flags	= RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
};

/* Protects all the ip6 fib */

DEFINE_RWLOCK(rt6_lock);
151 
152 
153 /* allocate dst with ip6_dst_ops */
154 static __inline__ struct rt6_info *ip6_dst_alloc(void)
155 {
156 	return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
157 }
158 
159 static void ip6_dst_destroy(struct dst_entry *dst)
160 {
161 	struct rt6_info *rt = (struct rt6_info *)dst;
162 	struct inet6_dev *idev = rt->rt6i_idev;
163 
164 	if (idev != NULL) {
165 		rt->rt6i_idev = NULL;
166 		in6_dev_put(idev);
167 	}
168 }
169 
170 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
171 			   int how)
172 {
173 	struct rt6_info *rt = (struct rt6_info *)dst;
174 	struct inet6_dev *idev = rt->rt6i_idev;
175 
176 	if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
177 		struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
178 		if (loopback_idev != NULL) {
179 			rt->rt6i_idev = loopback_idev;
180 			in6_dev_put(idev);
181 		}
182 	}
183 }
184 
185 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
186 {
187 	return (rt->rt6i_flags & RTF_EXPIRES &&
188 		time_after(jiffies, rt->rt6i_expires));
189 }
190 
191 /*
192  *	Route lookup. Any rt6_lock is implied.
193  */
194 
195 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
196 						    int oif,
197 						    int strict)
198 {
199 	struct rt6_info *local = NULL;
200 	struct rt6_info *sprt;
201 
202 	if (oif) {
203 		for (sprt = rt; sprt; sprt = sprt->u.next) {
204 			struct net_device *dev = sprt->rt6i_dev;
205 			if (dev->ifindex == oif)
206 				return sprt;
207 			if (dev->flags & IFF_LOOPBACK) {
208 				if (sprt->rt6i_idev == NULL ||
209 				    sprt->rt6i_idev->dev->ifindex != oif) {
210 					if (strict && oif)
211 						continue;
212 					if (local && (!oif ||
213 						      local->rt6i_idev->dev->ifindex == oif))
214 						continue;
215 				}
216 				local = sprt;
217 			}
218 		}
219 
220 		if (local)
221 			return local;
222 
223 		if (strict)
224 			return &ip6_null_entry;
225 	}
226 	return rt;
227 }
228 
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a Neighbour Solicitation towards
 * the nexthop of @rt when its neighbour entry is not in a NUD_VALID
 * state and the per-device rtr_probe_interval has elapsed since the
 * entry was last updated.  @rt may be NULL, in which case nothing
 * happens.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int send_probe = 0;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	/*
	 * neigh->updated is modified below, so the write side of the lock
	 * is required.  With only the read side held, several CPUs could
	 * pass the time check together and each send a probe, defeating
	 * the rate limit.
	 */
	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		neigh->updated = jiffies;
		send_probe = 1;
	}
	write_unlock_bh(&neigh->lock);

	if (!send_probe)
		return;

	/* Send the solicitation outside the lock. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router preference support compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
	return;
}
#endif
264 
265 /*
266  * Default Router Selection (RFC 2461 6.3.6)
267  */
268 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
269 {
270 	struct net_device *dev = rt->rt6i_dev;
271 	if (!oif || dev->ifindex == oif)
272 		return 2;
273 	if ((dev->flags & IFF_LOOPBACK) &&
274 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
275 		return 1;
276 	return 0;
277 }
278 
279 static int inline rt6_check_neigh(struct rt6_info *rt)
280 {
281 	struct neighbour *neigh = rt->rt6i_nexthop;
282 	int m = 0;
283 	if (neigh) {
284 		read_lock_bh(&neigh->lock);
285 		if (neigh->nud_state & NUD_VALID)
286 			m = 1;
287 		read_unlock_bh(&neigh->lock);
288 	}
289 	return m;
290 }
291 
292 static int rt6_score_route(struct rt6_info *rt, int oif,
293 			   int strict)
294 {
295 	int m = rt6_check_dev(rt, oif);
296 	if (!m && (strict & RT6_SELECT_F_IFACE))
297 		return -1;
298 #ifdef CONFIG_IPV6_ROUTER_PREF
299 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
300 #endif
301 	if (rt6_check_neigh(rt))
302 		m |= 16;
303 	else if (strict & RT6_SELECT_F_REACHABLE)
304 		return -1;
305 	return m;
306 }
307 
/*
 * Select the best route among the entries at the front of the list
 * *head that share the first entry's metric.
 *
 * @head:   address of the list head (a fib6 node's leaf pointer at the
 *          call sites in this file); may be rewritten by the
 *          round-robin step below.
 * @oif:    requested interface index, 0 for "any".
 * @strict: mask of RT6_SELECT_F_* flags passed on to rt6_score_route().
 *
 * Returns the highest-scoring non-expired route, or &ip6_null_entry if
 * none qualified.
 */
static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
				   int strict)
{
	struct rt6_info *match = NULL, *last = NULL;
	struct rt6_info *rt, *rt0 = *head;
	u32 metric;
	int mpri = -1;

	/*
	 * NOTE(review): the trace below guards against head == NULL, but
	 * *head has already been dereferenced in the initializer above.
	 */
	RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
		  __FUNCTION__, head, head ? *head : NULL, oif);

	/*
	 * Visit entries while the metric equals the first entry's metric.
	 * The (!last || rt != rt0) term stops the walk if it arrives back
	 * at rt0 after at least one non-expired entry has been seen.
	 */
	for (rt = rt0, metric = rt0->rt6i_metric;
	     rt && rt->rt6i_metric == metric && (!last || rt != rt0);
	     rt = rt->u.next) {
		int m;

		if (rt6_check_expired(rt))
			continue;

		/* last tracks the final non-expired entry of the metric group */
		last = rt;

		m = rt6_score_route(rt, oif, strict);
		if (m < 0)
			continue;

		if (m > mpri) {
			/* the previous best (possibly NULL) is demoted: probe it */
			rt6_probe(match);
			match = rt;
			mpri = m;
		} else {
			rt6_probe(rt);
		}
	}

	if (!match &&
	    (strict & RT6_SELECT_F_REACHABLE) &&
	    last && last != rt0) {
		/* no entries matched; do round-robin */
		/*
		 * Rotate: rt0 is unlinked from the front and re-linked right
		 * after `last`, so the next lookup starts at the old second
		 * entry.  The list is modified under this function-local
		 * spinlock only.
		 * NOTE(review): callers in this file hold rt6_lock for
		 * reading here, not writing — confirm this is intended.
		 */
		static spinlock_t lock = SPIN_LOCK_UNLOCKED;
		spin_lock(&lock);
		*head = rt0->u.next;
		rt0->u.next = last->u.next;
		last->u.next = rt0;
		spin_unlock(&lock);
	}

	RT6_TRACE("%s() => %p, score=%d\n",
		  __FUNCTION__, match, mpri);

	return (match ? match : &ip6_null_entry);
}
359 
360 #ifdef CONFIG_IPV6_ROUTE_INFO
361 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
362 		  struct in6_addr *gwaddr)
363 {
364 	struct route_info *rinfo = (struct route_info *) opt;
365 	struct in6_addr prefix_buf, *prefix;
366 	unsigned int pref;
367 	u32 lifetime;
368 	struct rt6_info *rt;
369 
370 	if (len < sizeof(struct route_info)) {
371 		return -EINVAL;
372 	}
373 
374 	/* Sanity check for prefix_len and length */
375 	if (rinfo->length > 3) {
376 		return -EINVAL;
377 	} else if (rinfo->prefix_len > 128) {
378 		return -EINVAL;
379 	} else if (rinfo->prefix_len > 64) {
380 		if (rinfo->length < 2) {
381 			return -EINVAL;
382 		}
383 	} else if (rinfo->prefix_len > 0) {
384 		if (rinfo->length < 1) {
385 			return -EINVAL;
386 		}
387 	}
388 
389 	pref = rinfo->route_pref;
390 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
391 		pref = ICMPV6_ROUTER_PREF_MEDIUM;
392 
393 	lifetime = htonl(rinfo->lifetime);
394 	if (lifetime == 0xffffffff) {
395 		/* infinity */
396 	} else if (lifetime > 0x7fffffff/HZ) {
397 		/* Avoid arithmetic overflow */
398 		lifetime = 0x7fffffff/HZ - 1;
399 	}
400 
401 	if (rinfo->length == 3)
402 		prefix = (struct in6_addr *)rinfo->prefix;
403 	else {
404 		/* this function is safe */
405 		ipv6_addr_prefix(&prefix_buf,
406 				 (struct in6_addr *)rinfo->prefix,
407 				 rinfo->prefix_len);
408 		prefix = &prefix_buf;
409 	}
410 
411 	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
412 
413 	if (rt && !lifetime) {
414 		ip6_del_rt(rt, NULL, NULL, NULL);
415 		rt = NULL;
416 	}
417 
418 	if (!rt && lifetime)
419 		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
420 					pref);
421 	else if (rt)
422 		rt->rt6i_flags = RTF_ROUTEINFO |
423 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
424 
425 	if (rt) {
426 		if (lifetime == 0xffffffff) {
427 			rt->rt6i_flags &= ~RTF_EXPIRES;
428 		} else {
429 			rt->rt6i_expires = jiffies + HZ * lifetime;
430 			rt->rt6i_flags |= RTF_EXPIRES;
431 		}
432 		dst_release(&rt->u.dst);
433 	}
434 	return 0;
435 }
436 #endif
437 
438 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
439 			    int oif, int strict)
440 {
441 	struct fib6_node *fn;
442 	struct rt6_info *rt;
443 
444 	read_lock_bh(&rt6_lock);
445 	fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
446 	rt = rt6_device_match(fn->leaf, oif, strict);
447 	dst_hold(&rt->u.dst);
448 	rt->u.dst.__use++;
449 	read_unlock_bh(&rt6_lock);
450 
451 	rt->u.dst.lastuse = jiffies;
452 	if (rt->u.dst.error == 0)
453 		return rt;
454 	dst_release(&rt->u.dst);
455 	return NULL;
456 }
457 
458 /* ip6_ins_rt is called with FREE rt6_lock.
459    It takes new route entry, the addition fails by any reason the
460    route is freed. In any case, if caller does not hold it, it may
461    be destroyed.
462  */
463 
464 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
465 		void *_rtattr, struct netlink_skb_parms *req)
466 {
467 	int err;
468 
469 	write_lock_bh(&rt6_lock);
470 	err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
471 	write_unlock_bh(&rt6_lock);
472 
473 	return err;
474 }
475 
476 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
477 				      struct in6_addr *saddr)
478 {
479 	struct rt6_info *rt;
480 
481 	/*
482 	 *	Clone the route.
483 	 */
484 
485 	rt = ip6_rt_copy(ort);
486 
487 	if (rt) {
488 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
489 			if (rt->rt6i_dst.plen != 128 &&
490 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
491 				rt->rt6i_flags |= RTF_ANYCAST;
492 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
493 		}
494 
495 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
496 		rt->rt6i_dst.plen = 128;
497 		rt->rt6i_flags |= RTF_CACHE;
498 		rt->u.dst.flags |= DST_HOST;
499 
500 #ifdef CONFIG_IPV6_SUBTREES
501 		if (rt->rt6i_src.plen && saddr) {
502 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
503 			rt->rt6i_src.plen = 128;
504 		}
505 #endif
506 
507 		rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
508 
509 	}
510 
511 	return rt;
512 }
513 
514 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
515 {
516 	struct rt6_info *rt = ip6_rt_copy(ort);
517 	if (rt) {
518 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
519 		rt->rt6i_dst.plen = 128;
520 		rt->rt6i_flags |= RTF_CACHE;
521 		if (rt->rt6i_flags & RTF_REJECT)
522 			rt->u.dst.error = ort->u.dst.error;
523 		rt->u.dst.flags |= DST_HOST;
524 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
525 	}
526 	return rt;
527 }
528 
/*
 * BACKTRACK() - climb the FIB tree after a failed selection.
 *
 * Expects local variables `rt` and `fn` and labels `out` and `restart`
 * in the enclosing function.  If `rt` is the null entry, walk up the
 * parents of `fn`: reaching a root node jumps to `out`; reaching a node
 * that carries route info jumps to `restart` to retry selection there.
 * If the walk runs out of parents, or `rt` is a real route, execution
 * simply continues after the macro.
 *
 * Wrapped in do { } while (0) so that `BACKTRACK();` is a single
 * statement and cannot capture a following `else`.
 */
#define BACKTRACK() \
do { \
	if (rt == &ip6_null_entry) { \
		while ((fn = fn->parent) != NULL) { \
			if (fn->fn_flags & RTN_ROOT) { \
				goto out; \
			} \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
539 
540 
/*
 * Route an incoming packet: look up a route for the packet's
 * destination/source addresses and attach it to skb->dst with a
 * reference held.  skb->dst is always set on return; when no usable
 * route exists it points at ip6_null_entry.
 */
void ip6_route_input(struct sk_buff *skb)
{
	struct fib6_node *fn;
	struct rt6_info *rt, *nrt;
	int strict;
	int attempts = 3;	/* bound on insert-race retries */
	int err;
	/* First pass only accepts routes with a valid nexthop neighbour. */
	int reachable = RT6_SELECT_F_REACHABLE;

	/* Multicast and link-local destinations must match the incoming interface. */
	strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;

relookup:
	read_lock_bh(&rt6_lock);

restart_2:
	fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
			 &skb->nh.ipv6h->saddr);

restart:
	rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
	/* May jump to `out` or back to `restart` with fn moved to a parent. */
	BACKTRACK();
	if (rt == &ip6_null_entry ||
	    rt->rt6i_flags & RTF_CACHE)
		goto out;

	/* Pin the route so it survives dropping the lock. */
	dst_hold(&rt->u.dst);
	read_unlock_bh(&rt6_lock);

	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
		nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
	else {
#if CLONE_OFFLINK_ROUTE
		nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
#else
		/* Use the looked-up route as is; the reference taken above is kept. */
		goto out2;
#endif
	}

	dst_release(&rt->u.dst);
	/* Allocation failure falls back to the null entry. */
	rt = nrt ? : &ip6_null_entry;

	dst_hold(&rt->u.dst);
	if (nrt) {
		/*
		 * NOTE(review): skb here is the packet being routed;
		 * confirm that handing &NETLINK_CB(skb) to the insert path
		 * is intended for such an skb.
		 */
		err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
		if (!err)
			goto out2;
	}

	if (--attempts <= 0)
		goto out2;

	/*
	 * Race condition! In the gap, when rt6_lock was
	 * released someone could insert this route.  Relookup.
	 */
	dst_release(&rt->u.dst);
	goto relookup;

out:
	/*
	 * Reached with rt6_lock still read-held.  On the first arrival the
	 * reachability requirement is dropped and the lookup is redone
	 * once; the second arrival accepts whatever rt was selected.
	 */
	if (reachable) {
		reachable = 0;
		goto restart_2;
	}
	dst_hold(&rt->u.dst);
	read_unlock_bh(&rt6_lock);
out2:
	/* rt carries one reference at this point, handed over to skb->dst. */
	rt->u.dst.lastuse = jiffies;
	rt->u.dst.__use++;
	skb->dst = (struct dst_entry *) rt;
	return;
}
612 
/*
 * Route an outgoing flow: look up a route for fl->fl6_dst / fl->fl6_src
 * honouring fl->oif.  @sk is not used in this function.
 *
 * Always returns a dst with a reference held; when no usable route
 * exists the returned dst is that of ip6_null_entry.
 */
struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
{
	struct fib6_node *fn;
	struct rt6_info *rt, *nrt;
	int strict;
	int attempts = 3;	/* bound on insert-race retries */
	int err;
	/* First pass only accepts routes with a valid nexthop neighbour. */
	int reachable = RT6_SELECT_F_REACHABLE;

	/* Multicast and link-local destinations must match the requested interface. */
	strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;

relookup:
	read_lock_bh(&rt6_lock);

restart_2:
	fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);

restart:
	rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
	/* May jump to `out` or back to `restart` with fn moved to a parent. */
	BACKTRACK();
	if (rt == &ip6_null_entry ||
	    rt->rt6i_flags & RTF_CACHE)
		goto out;

	/* Pin the route so it survives dropping the lock. */
	dst_hold(&rt->u.dst);
	read_unlock_bh(&rt6_lock);

	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
	else {
#if CLONE_OFFLINK_ROUTE
		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
#else
		/* Use the looked-up route as is; the reference taken above is kept. */
		goto out2;
#endif
	}

	dst_release(&rt->u.dst);
	/* Allocation failure falls back to the null entry. */
	rt = nrt ? : &ip6_null_entry;

	dst_hold(&rt->u.dst);
	if (nrt) {
		err = ip6_ins_rt(nrt, NULL, NULL, NULL);
		if (!err)
			goto out2;
	}

	if (--attempts <= 0)
		goto out2;

	/*
	 * Race condition! In the gap, when rt6_lock was
	 * released someone could insert this route.  Relookup.
	 */
	dst_release(&rt->u.dst);
	goto relookup;

out:
	/*
	 * Reached with rt6_lock still read-held.  On the first arrival the
	 * reachability requirement is dropped and the lookup is redone
	 * once; the second arrival accepts whatever rt was selected.
	 */
	if (reachable) {
		reachable = 0;
		goto restart_2;
	}
	dst_hold(&rt->u.dst);
	read_unlock_bh(&rt6_lock);
out2:
	/* rt carries one reference at this point, returned to the caller. */
	rt->u.dst.lastuse = jiffies;
	rt->u.dst.__use++;
	return &rt->u.dst;
}
682 
683 
684 /*
685  *	Destination cache support functions
686  */
687 
688 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
689 {
690 	struct rt6_info *rt;
691 
692 	rt = (struct rt6_info *) dst;
693 
694 	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
695 		return dst;
696 
697 	return NULL;
698 }
699 
700 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
701 {
702 	struct rt6_info *rt = (struct rt6_info *) dst;
703 
704 	if (rt) {
705 		if (rt->rt6i_flags & RTF_CACHE)
706 			ip6_del_rt(rt, NULL, NULL, NULL);
707 		else
708 			dst_release(dst);
709 	}
710 	return NULL;
711 }
712 
713 static void ip6_link_failure(struct sk_buff *skb)
714 {
715 	struct rt6_info *rt;
716 
717 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
718 
719 	rt = (struct rt6_info *) skb->dst;
720 	if (rt) {
721 		if (rt->rt6i_flags&RTF_CACHE) {
722 			dst_set_expires(&rt->u.dst, 0);
723 			rt->rt6i_flags |= RTF_EXPIRES;
724 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
725 			rt->rt6i_node->fn_sernum = -1;
726 	}
727 }
728 
729 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
730 {
731 	struct rt6_info *rt6 = (struct rt6_info*)dst;
732 
733 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
734 		rt6->rt6i_flags |= RTF_MODIFIED;
735 		if (mtu < IPV6_MIN_MTU) {
736 			mtu = IPV6_MIN_MTU;
737 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
738 		}
739 		dst->metrics[RTAX_MTU-1] = mtu;
740 	}
741 }
742 
/*
 * List of dst entries created by ndisc_dst_alloc(), chained through
 * dst->next and reaped by ndisc_dst_gc().
 * Protected by rt6_lock.
 */
static struct dst_entry *ndisc_dst_gc_list;
/* Forward declaration; the definition follows later in this file. */
static int ipv6_get_mtu(struct net_device *dev);
746 
747 static inline unsigned int ipv6_advmss(unsigned int mtu)
748 {
749 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
750 
751 	if (mtu < ip6_rt_min_advmss)
752 		mtu = ip6_rt_min_advmss;
753 
754 	/*
755 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
756 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
757 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
758 	 * rely only on pmtu discovery"
759 	 */
760 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
761 		mtu = IPV6_MAXPLEN;
762 	return mtu;
763 }
764 
765 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
766 				  struct neighbour *neigh,
767 				  struct in6_addr *addr,
768 				  int (*output)(struct sk_buff *))
769 {
770 	struct rt6_info *rt;
771 	struct inet6_dev *idev = in6_dev_get(dev);
772 
773 	if (unlikely(idev == NULL))
774 		return NULL;
775 
776 	rt = ip6_dst_alloc();
777 	if (unlikely(rt == NULL)) {
778 		in6_dev_put(idev);
779 		goto out;
780 	}
781 
782 	dev_hold(dev);
783 	if (neigh)
784 		neigh_hold(neigh);
785 	else
786 		neigh = ndisc_get_neigh(dev, addr);
787 
788 	rt->rt6i_dev	  = dev;
789 	rt->rt6i_idev     = idev;
790 	rt->rt6i_nexthop  = neigh;
791 	atomic_set(&rt->u.dst.__refcnt, 1);
792 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
793 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
794 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
795 	rt->u.dst.output  = output;
796 
797 #if 0	/* there's no chance to use these for ndisc */
798 	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
799 				? DST_HOST
800 				: 0;
801 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
802 	rt->rt6i_dst.plen = 128;
803 #endif
804 
805 	write_lock_bh(&rt6_lock);
806 	rt->u.dst.next = ndisc_dst_gc_list;
807 	ndisc_dst_gc_list = &rt->u.dst;
808 	write_unlock_bh(&rt6_lock);
809 
810 	fib6_force_start_gc();
811 
812 out:
813 	return (struct dst_entry *)rt;
814 }
815 
816 int ndisc_dst_gc(int *more)
817 {
818 	struct dst_entry *dst, *next, **pprev;
819 	int freed;
820 
821 	next = NULL;
822 	pprev = &ndisc_dst_gc_list;
823 	freed = 0;
824 	while ((dst = *pprev) != NULL) {
825 		if (!atomic_read(&dst->__refcnt)) {
826 			*pprev = dst->next;
827 			dst_free(dst);
828 			freed++;
829 		} else {
830 			pprev = &dst->next;
831 			(*more)++;
832 		}
833 	}
834 
835 	return freed;
836 }
837 
838 static int ip6_dst_gc(void)
839 {
840 	static unsigned expire = 30*HZ;
841 	static unsigned long last_gc;
842 	unsigned long now = jiffies;
843 
844 	if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
845 	    atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
846 		goto out;
847 
848 	expire++;
849 	fib6_run_gc(expire);
850 	last_gc = now;
851 	if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
852 		expire = ip6_rt_gc_timeout>>1;
853 
854 out:
855 	expire -= expire>>ip6_rt_gc_elasticity;
856 	return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
857 }
858 
859 /* Clean host part of a prefix. Not necessary in radix tree,
860    but results in cleaner routing tables.
861 
862    Remove it only when all the things will work!
863  */
864 
865 static int ipv6_get_mtu(struct net_device *dev)
866 {
867 	int mtu = IPV6_MIN_MTU;
868 	struct inet6_dev *idev;
869 
870 	idev = in6_dev_get(dev);
871 	if (idev) {
872 		mtu = idev->cnf.mtu6;
873 		in6_dev_put(idev);
874 	}
875 	return mtu;
876 }
877 
878 int ipv6_get_hoplimit(struct net_device *dev)
879 {
880 	int hoplimit = ipv6_devconf.hop_limit;
881 	struct inet6_dev *idev;
882 
883 	idev = in6_dev_get(dev);
884 	if (idev) {
885 		hoplimit = idev->cnf.hop_limit;
886 		in6_dev_put(idev);
887 	}
888 	return hoplimit;
889 }
890 
891 /*
892  *
893  */
894 
/*
 * Build a route from @rtmsg and insert it into the FIB.
 *
 * @rtmsg:   route description (prefix, gateway, flags, metric, ifindex).
 *           rtmsg_metric is rewritten to IP6_RT_PRIO_USER when zero.
 * @nlh:     originating netlink header, or NULL; supplies the route
 *           protocol when present.
 * @_rtattr: netlink attribute array (struct rtattr **), or NULL;
 *           RTA_METRICS is read from it.
 * @req:     netlink request parameters, passed through to ip6_ins_rt().
 *
 * Returns the result of ip6_ins_rt() on the normal path, or a negative
 * errno (-EINVAL, -ENODEV, -ENOMEM, -EHOSTUNREACH, or a neighbour
 * lookup error) when the route could not be built.  On the error path
 * every reference taken here is dropped and the route is freed.
 */
int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
		void *_rtattr, struct netlink_skb_parms *req)
{
	int err;
	struct rtmsg *r;
	struct rtattr **rta;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	int addr_type;

	rta = (struct rtattr **) _rtattr;

	if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
		return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
	/* Source-specific routes need subtree support. */
	if (rtmsg->rtmsg_src_len)
		return -EINVAL;
#endif
	if (rtmsg->rtmsg_ifindex) {
		/* Takes a reference on both dev and idev; dropped at `out` on error. */
		err = -ENODEV;
		dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (rtmsg->rtmsg_metric == 0)
		rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;

	rt = ip6_dst_alloc();

	if (rt == NULL) {
		err = -ENOMEM;
		goto out;
	}

	rt->u.dst.obsolete = -1;
	rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
	if (nlh && (r = NLMSG_DATA(nlh))) {
		rt->rt6i_protocol = r->rtm_protocol;
	} else {
		rt->rt6i_protocol = RTPROT_BOOT;
	}

	addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);

	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->u.dst.input = ip6_mc_input;
	else
		rt->u.dst.input = ip6_forward;

	rt->u.dst.output = ip6_output;

	/* Store the destination with the host bits beyond dst_len cleared. */
	ipv6_addr_prefix(&rt->rt6i_dst.addr,
			 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
	rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
	if (rt->rt6i_dst.plen == 128)
	       rt->u.dst.flags = DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr,
			 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
	rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
#endif

	rt->rt6i_metric = rtmsg->rtmsg_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != &loopback_dev) {
			if (dev) {
				/* Swap the references from the requested device to loopback. */
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = &loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->u.dst.output = ip6_pkt_discard_out;
		rt->u.dst.input = ip6_pkt_discard;
		rt->u.dst.error = -ENETUNREACH;
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
		struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &rtmsg->rtmsg_gateway;
		ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
		gwa_type = ipv6_addr_type(gw_addr);

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt;

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
			err = -EINVAL;
			if (!(gwa_type&IPV6_ADDR_UNICAST))
				goto out;

			/* The gateway itself must be reachable through an existing route. */
			grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);

			err = -EHOSTUNREACH;
			if (grt == NULL)
				goto out;
			if (dev) {
				if (dev != grt->rt6i_dev) {
					dst_release(&grt->u.dst);
					goto out;
				}
			} else {
				/* No device given: inherit it from the route to the gateway. */
				dev = grt->rt6i_dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
			/* err stays -EHOSTUNREACH if the route to the gateway is itself via a gateway. */
			if (!(grt->rt6i_flags&RTF_GATEWAY))
				err = 0;
			dst_release(&grt->u.dst);

			if (err)
				goto out;
		}
		err = -EINVAL;
		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
			goto out;
	}

	err = -ENODEV;
	if (dev == NULL)
		goto out;

	if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
		if (IS_ERR(rt->rt6i_nexthop)) {
			err = PTR_ERR(rt->rt6i_nexthop);
			/* Do not leave an error pointer in the route that is about to be freed. */
			rt->rt6i_nexthop = NULL;
			goto out;
		}
	}

	rt->rt6i_flags = rtmsg->rtmsg_flags;

install_route:
	/* Copy caller-supplied metrics; a type above RTAX_MAX aborts the add. */
	if (rta && rta[RTA_METRICS-1]) {
		int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
		struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);

		while (RTA_OK(attr, attrlen)) {
			unsigned flavor = attr->rta_type;
			if (flavor) {
				if (flavor > RTAX_MAX) {
					err = -EINVAL;
					goto out;
				}
				rt->u.dst.metrics[flavor-1] =
					*(u32 *)RTA_DATA(attr);
			}
			attr = RTA_NEXT(attr, attrlen);
		}
	}

	/* Fill in defaults for metrics the caller left at zero. */
	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
	if (!rt->u.dst.metrics[RTAX_MTU-1])
		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
	if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
	/* The dev/idev references taken above are stored in the route, not dropped here. */
	rt->u.dst.dev = dev;
	rt->rt6i_idev = idev;
	return ip6_ins_rt(rt, nlh, _rtattr, req);

out:
	/* Error path: undo whatever was acquired so far. */
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_free((struct dst_entry *) rt);
	return err;
}
1094 
1095 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1096 {
1097 	int err;
1098 
1099 	write_lock_bh(&rt6_lock);
1100 
1101 	err = fib6_del(rt, nlh, _rtattr, req);
1102 	dst_release(&rt->u.dst);
1103 
1104 	write_unlock_bh(&rt6_lock);
1105 
1106 	return err;
1107 }
1108 
1109 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1110 {
1111 	struct fib6_node *fn;
1112 	struct rt6_info *rt;
1113 	int err = -ESRCH;
1114 
1115 	read_lock_bh(&rt6_lock);
1116 
1117 	fn = fib6_locate(&ip6_routing_table,
1118 			 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1119 			 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1120 
1121 	if (fn) {
1122 		for (rt = fn->leaf; rt; rt = rt->u.next) {
1123 			if (rtmsg->rtmsg_ifindex &&
1124 			    (rt->rt6i_dev == NULL ||
1125 			     rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1126 				continue;
1127 			if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1128 			    !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1129 				continue;
1130 			if (rtmsg->rtmsg_metric &&
1131 			    rtmsg->rtmsg_metric != rt->rt6i_metric)
1132 				continue;
1133 			dst_hold(&rt->u.dst);
1134 			read_unlock_bh(&rt6_lock);
1135 
1136 			return ip6_del_rt(rt, nlh, _rtattr, req);
1137 		}
1138 	}
1139 	read_unlock_bh(&rt6_lock);
1140 
1141 	return err;
1142 }
1143 
1144 /*
1145  *	Handle redirects
1146  */
1147 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1148 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1149 {
1150 	struct rt6_info *rt, *nrt = NULL;
1151 	int strict;
1152 	struct fib6_node *fn;
1153 
1154 	/*
1155 	 * Get the "current" route for this destination and
1156 	 * check if the redirect has come from approriate router.
1157 	 *
1158 	 * RFC 2461 specifies that redirects should only be
1159 	 * accepted if they come from the nexthop to the target.
1160 	 * Due to the way the routes are chosen, this notion
1161 	 * is a bit fuzzy and one might need to check all possible
1162 	 * routes.
1163 	 */
1164 	strict = ipv6_addr_type(dest) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL);
1165 
1166 	read_lock_bh(&rt6_lock);
1167 	fn = fib6_lookup(&ip6_routing_table, dest, NULL);
1168 restart:
1169 	for (rt = fn->leaf; rt; rt = rt->u.next) {
1170 		/*
1171 		 * Current route is on-link; redirect is always invalid.
1172 		 *
1173 		 * Seems, previous statement is not true. It could
1174 		 * be node, which looks for us as on-link (f.e. proxy ndisc)
1175 		 * But then router serving it might decide, that we should
1176 		 * know truth 8)8) --ANK (980726).
1177 		 */
1178 		if (rt6_check_expired(rt))
1179 			continue;
1180 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1181 			continue;
1182 		if (neigh->dev != rt->rt6i_dev)
1183 			continue;
1184 		if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1185 			continue;
1186 		break;
1187 	}
1188 	if (rt)
1189 		dst_hold(&rt->u.dst);
1190 	else if (strict) {
1191 		while ((fn = fn->parent) != NULL) {
1192 			if (fn->fn_flags & RTN_ROOT)
1193 				break;
1194 			if (fn->fn_flags & RTN_RTINFO)
1195 				goto restart;
1196 		}
1197 	}
1198 	read_unlock_bh(&rt6_lock);
1199 
1200 	if (!rt) {
1201 		if (net_ratelimit())
1202 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1203 			       "for redirect target\n");
1204 		return;
1205 	}
1206 
1207 	/*
1208 	 *	We have finally decided to accept it.
1209 	 */
1210 
1211 	neigh_update(neigh, lladdr, NUD_STALE,
1212 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1213 		     NEIGH_UPDATE_F_OVERRIDE|
1214 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1215 				     NEIGH_UPDATE_F_ISROUTER))
1216 		     );
1217 
1218 	/*
1219 	 * Redirect received -> path was valid.
1220 	 * Look, redirects are sent only in response to data packets,
1221 	 * so that this nexthop apparently is reachable. --ANK
1222 	 */
1223 	dst_confirm(&rt->u.dst);
1224 
1225 	/* Duplicate redirect: silently ignore. */
1226 	if (neigh == rt->u.dst.neighbour)
1227 		goto out;
1228 
1229 	nrt = ip6_rt_copy(rt);
1230 	if (nrt == NULL)
1231 		goto out;
1232 
1233 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1234 	if (on_link)
1235 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1236 
1237 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1238 	nrt->rt6i_dst.plen = 128;
1239 	nrt->u.dst.flags |= DST_HOST;
1240 
1241 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1242 	nrt->rt6i_nexthop = neigh_clone(neigh);
1243 	/* Reset pmtu, it may be better */
1244 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1245 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1246 
1247 	if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1248 		goto out;
1249 
1250 	if (rt->rt6i_flags&RTF_CACHE) {
1251 		ip6_del_rt(rt, NULL, NULL, NULL);
1252 		return;
1253 	}
1254 
1255 out:
1256         dst_release(&rt->u.dst);
1257 	return;
1258 }
1259 
1260 /*
1261  *	Handle ICMP "packet too big" messages
1262  *	i.e. Path MTU discovery
1263  */
1264 
1265 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1266 			struct net_device *dev, u32 pmtu)
1267 {
1268 	struct rt6_info *rt, *nrt;
1269 	int allfrag = 0;
1270 
1271 	rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1272 	if (rt == NULL)
1273 		return;
1274 
1275 	if (pmtu >= dst_mtu(&rt->u.dst))
1276 		goto out;
1277 
1278 	if (pmtu < IPV6_MIN_MTU) {
1279 		/*
1280 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1281 		 * MTU (1280) and a fragment header should always be included
1282 		 * after a node receiving Too Big message reporting PMTU is
1283 		 * less than the IPv6 Minimum Link MTU.
1284 		 */
1285 		pmtu = IPV6_MIN_MTU;
1286 		allfrag = 1;
1287 	}
1288 
1289 	/* New mtu received -> path was valid.
1290 	   They are sent only in response to data packets,
1291 	   so that this nexthop apparently is reachable. --ANK
1292 	 */
1293 	dst_confirm(&rt->u.dst);
1294 
1295 	/* Host route. If it is static, it would be better
1296 	   not to override it, but add new one, so that
1297 	   when cache entry will expire old pmtu
1298 	   would return automatically.
1299 	 */
1300 	if (rt->rt6i_flags & RTF_CACHE) {
1301 		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1302 		if (allfrag)
1303 			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1304 		dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1305 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1306 		goto out;
1307 	}
1308 
1309 	/* Network route.
1310 	   Two cases are possible:
1311 	   1. It is connected route. Action: COW
1312 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1313 	 */
1314 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1315 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1316 	else
1317 		nrt = rt6_alloc_clone(rt, daddr);
1318 
1319 	if (nrt) {
1320 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1321 		if (allfrag)
1322 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1323 
1324 		/* According to RFC 1981, detecting PMTU increase shouldn't be
1325 		 * happened within 5 mins, the recommended timer is 10 mins.
1326 		 * Here this route expiration time is set to ip6_rt_mtu_expires
1327 		 * which is 10 mins. After 10 mins the decreased pmtu is expired
1328 		 * and detecting PMTU increase will be automatically happened.
1329 		 */
1330 		dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1331 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1332 
1333 		ip6_ins_rt(nrt, NULL, NULL, NULL);
1334 	}
1335 out:
1336 	dst_release(&rt->u.dst);
1337 }
1338 
1339 /*
1340  *	Misc support functions
1341  */
1342 
1343 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1344 {
1345 	struct rt6_info *rt = ip6_dst_alloc();
1346 
1347 	if (rt) {
1348 		rt->u.dst.input = ort->u.dst.input;
1349 		rt->u.dst.output = ort->u.dst.output;
1350 
1351 		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1352 		rt->u.dst.dev = ort->u.dst.dev;
1353 		if (rt->u.dst.dev)
1354 			dev_hold(rt->u.dst.dev);
1355 		rt->rt6i_idev = ort->rt6i_idev;
1356 		if (rt->rt6i_idev)
1357 			in6_dev_hold(rt->rt6i_idev);
1358 		rt->u.dst.lastuse = jiffies;
1359 		rt->rt6i_expires = 0;
1360 
1361 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1362 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1363 		rt->rt6i_metric = 0;
1364 
1365 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1366 #ifdef CONFIG_IPV6_SUBTREES
1367 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1368 #endif
1369 	}
1370 	return rt;
1371 }
1372 
1373 #ifdef CONFIG_IPV6_ROUTE_INFO
1374 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1375 					   struct in6_addr *gwaddr, int ifindex)
1376 {
1377 	struct fib6_node *fn;
1378 	struct rt6_info *rt = NULL;
1379 
1380 	write_lock_bh(&rt6_lock);
1381 	fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1382 	if (!fn)
1383 		goto out;
1384 
1385 	for (rt = fn->leaf; rt; rt = rt->u.next) {
1386 		if (rt->rt6i_dev->ifindex != ifindex)
1387 			continue;
1388 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1389 			continue;
1390 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1391 			continue;
1392 		dst_hold(&rt->u.dst);
1393 		break;
1394 	}
1395 out:
1396 	write_unlock_bh(&rt6_lock);
1397 	return rt;
1398 }
1399 
1400 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1401 					   struct in6_addr *gwaddr, int ifindex,
1402 					   unsigned pref)
1403 {
1404 	struct in6_rtmsg rtmsg;
1405 
1406 	memset(&rtmsg, 0, sizeof(rtmsg));
1407 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1408 	ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1409 	rtmsg.rtmsg_dst_len = prefixlen;
1410 	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1411 	rtmsg.rtmsg_metric = 1024;
1412 	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1413 	/* We should treat it as a default route if prefix length is 0. */
1414 	if (!prefixlen)
1415 		rtmsg.rtmsg_flags |= RTF_DEFAULT;
1416 	rtmsg.rtmsg_ifindex = ifindex;
1417 
1418 	ip6_route_add(&rtmsg, NULL, NULL, NULL);
1419 
1420 	return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1421 }
1422 #endif
1423 
1424 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1425 {
1426 	struct rt6_info *rt;
1427 	struct fib6_node *fn;
1428 
1429 	fn = &ip6_routing_table;
1430 
1431 	write_lock_bh(&rt6_lock);
1432 	for (rt = fn->leaf; rt; rt=rt->u.next) {
1433 		if (dev == rt->rt6i_dev &&
1434 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1435 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1436 			break;
1437 	}
1438 	if (rt)
1439 		dst_hold(&rt->u.dst);
1440 	write_unlock_bh(&rt6_lock);
1441 	return rt;
1442 }
1443 
1444 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1445 				     struct net_device *dev,
1446 				     unsigned int pref)
1447 {
1448 	struct in6_rtmsg rtmsg;
1449 
1450 	memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1451 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1452 	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1453 	rtmsg.rtmsg_metric = 1024;
1454 	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1455 			    RTF_PREF(pref);
1456 
1457 	rtmsg.rtmsg_ifindex = dev->ifindex;
1458 
1459 	ip6_route_add(&rtmsg, NULL, NULL, NULL);
1460 	return rt6_get_dflt_router(gwaddr, dev);
1461 }
1462 
1463 void rt6_purge_dflt_routers(void)
1464 {
1465 	struct rt6_info *rt;
1466 
1467 restart:
1468 	read_lock_bh(&rt6_lock);
1469 	for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1470 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1471 			dst_hold(&rt->u.dst);
1472 
1473 			read_unlock_bh(&rt6_lock);
1474 
1475 			ip6_del_rt(rt, NULL, NULL, NULL);
1476 
1477 			goto restart;
1478 		}
1479 	}
1480 	read_unlock_bh(&rt6_lock);
1481 }
1482 
1483 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1484 {
1485 	struct in6_rtmsg rtmsg;
1486 	int err;
1487 
1488 	switch(cmd) {
1489 	case SIOCADDRT:		/* Add a route */
1490 	case SIOCDELRT:		/* Delete a route */
1491 		if (!capable(CAP_NET_ADMIN))
1492 			return -EPERM;
1493 		err = copy_from_user(&rtmsg, arg,
1494 				     sizeof(struct in6_rtmsg));
1495 		if (err)
1496 			return -EFAULT;
1497 
1498 		rtnl_lock();
1499 		switch (cmd) {
1500 		case SIOCADDRT:
1501 			err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1502 			break;
1503 		case SIOCDELRT:
1504 			err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1505 			break;
1506 		default:
1507 			err = -EINVAL;
1508 		}
1509 		rtnl_unlock();
1510 
1511 		return err;
1512 	};
1513 
1514 	return -EINVAL;
1515 }
1516 
1517 /*
1518  *	Drop the packet on the floor
1519  */
1520 
1521 static int ip6_pkt_discard(struct sk_buff *skb)
1522 {
1523 	IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1524 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1525 	kfree_skb(skb);
1526 	return 0;
1527 }
1528 
1529 static int ip6_pkt_discard_out(struct sk_buff *skb)
1530 {
1531 	skb->dev = skb->dst->dev;
1532 	return ip6_pkt_discard(skb);
1533 }
1534 
1535 /*
1536  *	Allocate a dst for local (unicast / anycast) address.
1537  */
1538 
1539 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1540 				    const struct in6_addr *addr,
1541 				    int anycast)
1542 {
1543 	struct rt6_info *rt = ip6_dst_alloc();
1544 
1545 	if (rt == NULL)
1546 		return ERR_PTR(-ENOMEM);
1547 
1548 	dev_hold(&loopback_dev);
1549 	in6_dev_hold(idev);
1550 
1551 	rt->u.dst.flags = DST_HOST;
1552 	rt->u.dst.input = ip6_input;
1553 	rt->u.dst.output = ip6_output;
1554 	rt->rt6i_dev = &loopback_dev;
1555 	rt->rt6i_idev = idev;
1556 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1557 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1558 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1559 	rt->u.dst.obsolete = -1;
1560 
1561 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1562 	if (anycast)
1563 		rt->rt6i_flags |= RTF_ANYCAST;
1564 	else
1565 		rt->rt6i_flags |= RTF_LOCAL;
1566 	rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1567 	if (rt->rt6i_nexthop == NULL) {
1568 		dst_free((struct dst_entry *) rt);
1569 		return ERR_PTR(-ENOMEM);
1570 	}
1571 
1572 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1573 	rt->rt6i_dst.plen = 128;
1574 
1575 	atomic_set(&rt->u.dst.__refcnt, 1);
1576 
1577 	return rt;
1578 }
1579 
1580 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1581 {
1582 	if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1583 	    rt != &ip6_null_entry) {
1584 		RT6_TRACE("deleted by ifdown %p\n", rt);
1585 		return -1;
1586 	}
1587 	return 0;
1588 }
1589 
1590 void rt6_ifdown(struct net_device *dev)
1591 {
1592 	write_lock_bh(&rt6_lock);
1593 	fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1594 	write_unlock_bh(&rt6_lock);
1595 }
1596 
/* Argument bundle handed from rt6_mtu_change() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;		/* device whose MTU changed */
	unsigned mtu;			/* the new MTU value */
};
1602 
1603 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1604 {
1605 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1606 	struct inet6_dev *idev;
1607 
1608 	/* In IPv6 pmtu discovery is not optional,
1609 	   so that RTAX_MTU lock cannot disable it.
1610 	   We still use this lock to block changes
1611 	   caused by addrconf/ndisc.
1612 	*/
1613 
1614 	idev = __in6_dev_get(arg->dev);
1615 	if (idev == NULL)
1616 		return 0;
1617 
1618 	/* For administrative MTU increase, there is no way to discover
1619 	   IPv6 PMTU increase, so PMTU increase should be updated here.
1620 	   Since RFC 1981 doesn't include administrative MTU increase
1621 	   update PMTU increase is a MUST. (i.e. jumbo frame)
1622 	 */
1623 	/*
1624 	   If new MTU is less than route PMTU, this new MTU will be the
1625 	   lowest MTU in the path, update the route PMTU to reflect PMTU
1626 	   decreases; if new MTU is greater than route PMTU, and the
1627 	   old MTU is the lowest MTU in the path, update the route PMTU
1628 	   to reflect the increase. In this case if the other nodes' MTU
1629 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
1630 	   PMTU discouvery.
1631 	 */
1632 	if (rt->rt6i_dev == arg->dev &&
1633 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1634             (dst_mtu(&rt->u.dst) > arg->mtu ||
1635              (dst_mtu(&rt->u.dst) < arg->mtu &&
1636 	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1637 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1638 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1639 	return 0;
1640 }
1641 
1642 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1643 {
1644 	struct rt6_mtu_change_arg arg;
1645 
1646 	arg.dev = dev;
1647 	arg.mtu = mtu;
1648 	read_lock_bh(&rt6_lock);
1649 	fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1650 	read_unlock_bh(&rt6_lock);
1651 }
1652 
1653 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1654 			      struct in6_rtmsg *rtmsg)
1655 {
1656 	memset(rtmsg, 0, sizeof(*rtmsg));
1657 
1658 	rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1659 	rtmsg->rtmsg_src_len = r->rtm_src_len;
1660 	rtmsg->rtmsg_flags = RTF_UP;
1661 	if (r->rtm_type == RTN_UNREACHABLE)
1662 		rtmsg->rtmsg_flags |= RTF_REJECT;
1663 
1664 	if (rta[RTA_GATEWAY-1]) {
1665 		if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1666 			return -EINVAL;
1667 		memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1668 		rtmsg->rtmsg_flags |= RTF_GATEWAY;
1669 	}
1670 	if (rta[RTA_DST-1]) {
1671 		if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1672 			return -EINVAL;
1673 		memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1674 	}
1675 	if (rta[RTA_SRC-1]) {
1676 		if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1677 			return -EINVAL;
1678 		memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1679 	}
1680 	if (rta[RTA_OIF-1]) {
1681 		if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1682 			return -EINVAL;
1683 		memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1684 	}
1685 	if (rta[RTA_PRIORITY-1]) {
1686 		if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1687 			return -EINVAL;
1688 		memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1689 	}
1690 	return 0;
1691 }
1692 
1693 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1694 {
1695 	struct rtmsg *r = NLMSG_DATA(nlh);
1696 	struct in6_rtmsg rtmsg;
1697 
1698 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1699 		return -EINVAL;
1700 	return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1701 }
1702 
1703 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1704 {
1705 	struct rtmsg *r = NLMSG_DATA(nlh);
1706 	struct in6_rtmsg rtmsg;
1707 
1708 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1709 		return -EINVAL;
1710 	return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1711 }
1712 
/* Per-call context that inet6_dump_fib() attaches to the tree walker. */
struct rt6_rtnl_dump_arg
{
	struct sk_buff *skb;		/* buffer the dump records are written to */
	struct netlink_callback *cb;	/* netlink dump callback state */
};
1718 
1719 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1720 			 struct in6_addr *dst, struct in6_addr *src,
1721 			 int iif, int type, u32 pid, u32 seq,
1722 			 int prefix, unsigned int flags)
1723 {
1724 	struct rtmsg *rtm;
1725 	struct nlmsghdr  *nlh;
1726 	unsigned char	 *b = skb->tail;
1727 	struct rta_cacheinfo ci;
1728 
1729 	if (prefix) {	/* user wants prefix routes only */
1730 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1731 			/* success since this is not a prefix route */
1732 			return 1;
1733 		}
1734 	}
1735 
1736 	nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1737 	rtm = NLMSG_DATA(nlh);
1738 	rtm->rtm_family = AF_INET6;
1739 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
1740 	rtm->rtm_src_len = rt->rt6i_src.plen;
1741 	rtm->rtm_tos = 0;
1742 	rtm->rtm_table = RT_TABLE_MAIN;
1743 	if (rt->rt6i_flags&RTF_REJECT)
1744 		rtm->rtm_type = RTN_UNREACHABLE;
1745 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1746 		rtm->rtm_type = RTN_LOCAL;
1747 	else
1748 		rtm->rtm_type = RTN_UNICAST;
1749 	rtm->rtm_flags = 0;
1750 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1751 	rtm->rtm_protocol = rt->rt6i_protocol;
1752 	if (rt->rt6i_flags&RTF_DYNAMIC)
1753 		rtm->rtm_protocol = RTPROT_REDIRECT;
1754 	else if (rt->rt6i_flags & RTF_ADDRCONF)
1755 		rtm->rtm_protocol = RTPROT_KERNEL;
1756 	else if (rt->rt6i_flags&RTF_DEFAULT)
1757 		rtm->rtm_protocol = RTPROT_RA;
1758 
1759 	if (rt->rt6i_flags&RTF_CACHE)
1760 		rtm->rtm_flags |= RTM_F_CLONED;
1761 
1762 	if (dst) {
1763 		RTA_PUT(skb, RTA_DST, 16, dst);
1764 	        rtm->rtm_dst_len = 128;
1765 	} else if (rtm->rtm_dst_len)
1766 		RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1767 #ifdef CONFIG_IPV6_SUBTREES
1768 	if (src) {
1769 		RTA_PUT(skb, RTA_SRC, 16, src);
1770 	        rtm->rtm_src_len = 128;
1771 	} else if (rtm->rtm_src_len)
1772 		RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1773 #endif
1774 	if (iif)
1775 		RTA_PUT(skb, RTA_IIF, 4, &iif);
1776 	else if (dst) {
1777 		struct in6_addr saddr_buf;
1778 		if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1779 			RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1780 	}
1781 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1782 		goto rtattr_failure;
1783 	if (rt->u.dst.neighbour)
1784 		RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1785 	if (rt->u.dst.dev)
1786 		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1787 	RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1788 	ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1789 	if (rt->rt6i_expires)
1790 		ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1791 	else
1792 		ci.rta_expires = 0;
1793 	ci.rta_used = rt->u.dst.__use;
1794 	ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1795 	ci.rta_error = rt->u.dst.error;
1796 	ci.rta_id = 0;
1797 	ci.rta_ts = 0;
1798 	ci.rta_tsage = 0;
1799 	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1800 	nlh->nlmsg_len = skb->tail - b;
1801 	return skb->len;
1802 
1803 nlmsg_failure:
1804 rtattr_failure:
1805 	skb_trim(skb, b - skb->data);
1806 	return -1;
1807 }
1808 
1809 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1810 {
1811 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1812 	int prefix;
1813 
1814 	if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1815 		struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1816 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1817 	} else
1818 		prefix = 0;
1819 
1820 	return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1821 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1822 		     prefix, NLM_F_MULTI);
1823 }
1824 
1825 static int fib6_dump_node(struct fib6_walker_t *w)
1826 {
1827 	int res;
1828 	struct rt6_info *rt;
1829 
1830 	for (rt = w->leaf; rt; rt = rt->u.next) {
1831 		res = rt6_dump_route(rt, w->args);
1832 		if (res < 0) {
1833 			/* Frame is full, suspend walking */
1834 			w->leaf = rt;
1835 			return 1;
1836 		}
1837 		BUG_TRAP(res!=0);
1838 	}
1839 	w->leaf = NULL;
1840 	return 0;
1841 }
1842 
1843 static void fib6_dump_end(struct netlink_callback *cb)
1844 {
1845 	struct fib6_walker_t *w = (void*)cb->args[0];
1846 
1847 	if (w) {
1848 		cb->args[0] = 0;
1849 		fib6_walker_unlink(w);
1850 		kfree(w);
1851 	}
1852 	cb->done = (void*)cb->args[1];
1853 	cb->args[1] = 0;
1854 }
1855 
1856 static int fib6_dump_done(struct netlink_callback *cb)
1857 {
1858 	fib6_dump_end(cb);
1859 	return cb->done ? cb->done(cb) : 0;
1860 }
1861 
1862 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1863 {
1864 	struct rt6_rtnl_dump_arg arg;
1865 	struct fib6_walker_t *w;
1866 	int res;
1867 
1868 	arg.skb = skb;
1869 	arg.cb = cb;
1870 
1871 	w = (void*)cb->args[0];
1872 	if (w == NULL) {
1873 		/* New dump:
1874 		 *
1875 		 * 1. hook callback destructor.
1876 		 */
1877 		cb->args[1] = (long)cb->done;
1878 		cb->done = fib6_dump_done;
1879 
1880 		/*
1881 		 * 2. allocate and initialize walker.
1882 		 */
1883 		w = kzalloc(sizeof(*w), GFP_ATOMIC);
1884 		if (w == NULL)
1885 			return -ENOMEM;
1886 		RT6_TRACE("dump<%p", w);
1887 		w->root = &ip6_routing_table;
1888 		w->func = fib6_dump_node;
1889 		w->args = &arg;
1890 		cb->args[0] = (long)w;
1891 		read_lock_bh(&rt6_lock);
1892 		res = fib6_walk(w);
1893 		read_unlock_bh(&rt6_lock);
1894 	} else {
1895 		w->args = &arg;
1896 		read_lock_bh(&rt6_lock);
1897 		res = fib6_walk_continue(w);
1898 		read_unlock_bh(&rt6_lock);
1899 	}
1900 #if RT6_DEBUG >= 3
1901 	if (res <= 0 && skb->len == 0)
1902 		RT6_TRACE("%p>dump end\n", w);
1903 #endif
1904 	res = res < 0 ? res : skb->len;
1905 	/* res < 0 is an error. (really, impossible)
1906 	   res == 0 means that dump is complete, but skb still can contain data.
1907 	   res > 0 dump is not complete, but frame is full.
1908 	 */
1909 	/* Destroy walker, if dump of this table is complete. */
1910 	if (res <= 0)
1911 		fib6_dump_end(cb);
1912 	return res;
1913 }
1914 
1915 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1916 {
1917 	struct rtattr **rta = arg;
1918 	int iif = 0;
1919 	int err = -ENOBUFS;
1920 	struct sk_buff *skb;
1921 	struct flowi fl;
1922 	struct rt6_info *rt;
1923 
1924 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1925 	if (skb == NULL)
1926 		goto out;
1927 
1928 	/* Reserve room for dummy headers, this skb can pass
1929 	   through good chunk of routing engine.
1930 	 */
1931 	skb->mac.raw = skb->data;
1932 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1933 
1934 	memset(&fl, 0, sizeof(fl));
1935 	if (rta[RTA_SRC-1])
1936 		ipv6_addr_copy(&fl.fl6_src,
1937 			       (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1938 	if (rta[RTA_DST-1])
1939 		ipv6_addr_copy(&fl.fl6_dst,
1940 			       (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1941 
1942 	if (rta[RTA_IIF-1])
1943 		memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1944 
1945 	if (iif) {
1946 		struct net_device *dev;
1947 		dev = __dev_get_by_index(iif);
1948 		if (!dev) {
1949 			err = -ENODEV;
1950 			goto out_free;
1951 		}
1952 	}
1953 
1954 	fl.oif = 0;
1955 	if (rta[RTA_OIF-1])
1956 		memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1957 
1958 	rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1959 
1960 	skb->dst = &rt->u.dst;
1961 
1962 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1963 	err = rt6_fill_node(skb, rt,
1964 			    &fl.fl6_dst, &fl.fl6_src,
1965 			    iif,
1966 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1967 			    nlh->nlmsg_seq, 0, 0);
1968 	if (err < 0) {
1969 		err = -EMSGSIZE;
1970 		goto out_free;
1971 	}
1972 
1973 	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1974 	if (err > 0)
1975 		err = 0;
1976 out:
1977 	return err;
1978 out_free:
1979 	kfree_skb(skb);
1980 	goto out;
1981 }
1982 
1983 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
1984 			struct netlink_skb_parms *req)
1985 {
1986 	struct sk_buff *skb;
1987 	int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
1988 	u32 pid = current->pid;
1989 	u32 seq = 0;
1990 
1991 	if (req)
1992 		pid = req->pid;
1993 	if (nlh)
1994 		seq = nlh->nlmsg_seq;
1995 
1996 	skb = alloc_skb(size, gfp_any());
1997 	if (!skb) {
1998 		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
1999 		return;
2000 	}
2001 	if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
2002 		kfree_skb(skb);
2003 		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
2004 		return;
2005 	}
2006 	NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2007 	netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2008 }
2009 
2010 /*
2011  *	/proc
2012  */
2013 
2014 #ifdef CONFIG_PROC_FS
2015 
/*
 * Size in bytes of one route record; rt6_info_route() and
 * rt6_proc_info() use it to turn a read offset into a number of whole
 * records to skip plus a remainder.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State shared between rt6_proc_info() and the per-route callback. */
struct rt6_proc_arg
{
	char *buffer;	/* output buffer */
	int offset;	/* read offset requested by the caller */
	int length;	/* number of bytes requested by the caller */
	int skip;	/* records skipped so far to reach the offset */
	int len;	/* bytes written to buffer so far */
};
2026 
2027 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2028 {
2029 	struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2030 	int i;
2031 
2032 	if (arg->skip < arg->offset / RT6_INFO_LEN) {
2033 		arg->skip++;
2034 		return 0;
2035 	}
2036 
2037 	if (arg->len >= arg->length)
2038 		return 0;
2039 
2040 	for (i=0; i<16; i++) {
2041 		sprintf(arg->buffer + arg->len, "%02x",
2042 			rt->rt6i_dst.addr.s6_addr[i]);
2043 		arg->len += 2;
2044 	}
2045 	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2046 			    rt->rt6i_dst.plen);
2047 
2048 #ifdef CONFIG_IPV6_SUBTREES
2049 	for (i=0; i<16; i++) {
2050 		sprintf(arg->buffer + arg->len, "%02x",
2051 			rt->rt6i_src.addr.s6_addr[i]);
2052 		arg->len += 2;
2053 	}
2054 	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2055 			    rt->rt6i_src.plen);
2056 #else
2057 	sprintf(arg->buffer + arg->len,
2058 		"00000000000000000000000000000000 00 ");
2059 	arg->len += 36;
2060 #endif
2061 
2062 	if (rt->rt6i_nexthop) {
2063 		for (i=0; i<16; i++) {
2064 			sprintf(arg->buffer + arg->len, "%02x",
2065 				rt->rt6i_nexthop->primary_key[i]);
2066 			arg->len += 2;
2067 		}
2068 	} else {
2069 		sprintf(arg->buffer + arg->len,
2070 			"00000000000000000000000000000000");
2071 		arg->len += 32;
2072 	}
2073 	arg->len += sprintf(arg->buffer + arg->len,
2074 			    " %08x %08x %08x %08x %8s\n",
2075 			    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2076 			    rt->u.dst.__use, rt->rt6i_flags,
2077 			    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2078 	return 0;
2079 }
2080 
2081 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2082 {
2083 	struct rt6_proc_arg arg;
2084 	arg.buffer = buffer;
2085 	arg.offset = offset;
2086 	arg.length = length;
2087 	arg.skip = 0;
2088 	arg.len = 0;
2089 
2090 	read_lock_bh(&rt6_lock);
2091 	fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2092 	read_unlock_bh(&rt6_lock);
2093 
2094 	*start = buffer;
2095 	if (offset)
2096 		*start += offset % RT6_INFO_LEN;
2097 
2098 	arg.len -= offset % RT6_INFO_LEN;
2099 
2100 	if (arg.len > length)
2101 		arg.len = length;
2102 	if (arg.len < 0)
2103 		arg.len = 0;
2104 
2105 	return arg.len;
2106 }
2107 
/*
 * seq_file show routine for the rt6_stats proc entry: prints seven hex
 * counters on one line - fib nodes, route nodes, allocated routes,
 * route entries, cached routes, current dst entry count and discarded
 * routes.  Always returns 0.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		      rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
		      rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
		      rt6_stats.fib_rt_cache,
		      atomic_read(&ip6_dst_ops.entries),
		      rt6_stats.fib_discarded_routes);

	return 0;
}
2119 
/* Open routine for the rt6_stats proc entry: single-record seq_file. */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt6_stats_seq_show, NULL);
}
2124 
/* File operations registered for "rt6_stats" in ip6_route_init(). */
static struct file_operations rt6_stats_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
2132 #endif	/* CONFIG_PROC_FS */
2133 
2134 #ifdef CONFIG_SYSCTL
2135 
2136 static int flush_delay;
2137 
2138 static
2139 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2140 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2141 {
2142 	if (write) {
2143 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2144 		fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2145 		return 0;
2146 	} else
2147 		return -EINVAL;
2148 }
2149 
2150 ctl_table ipv6_route_table[] = {
2151         {
2152 		.ctl_name	=	NET_IPV6_ROUTE_FLUSH,
2153 		.procname	=	"flush",
2154          	.data		=	&flush_delay,
2155 		.maxlen		=	sizeof(int),
2156 		.mode		=	0200,
2157          	.proc_handler	=	&ipv6_sysctl_rtcache_flush
2158 	},
2159 	{
2160 		.ctl_name	=	NET_IPV6_ROUTE_GC_THRESH,
2161 		.procname	=	"gc_thresh",
2162          	.data		=	&ip6_dst_ops.gc_thresh,
2163 		.maxlen		=	sizeof(int),
2164 		.mode		=	0644,
2165          	.proc_handler	=	&proc_dointvec,
2166 	},
2167 	{
2168 		.ctl_name	=	NET_IPV6_ROUTE_MAX_SIZE,
2169 		.procname	=	"max_size",
2170          	.data		=	&ip6_rt_max_size,
2171 		.maxlen		=	sizeof(int),
2172 		.mode		=	0644,
2173          	.proc_handler	=	&proc_dointvec,
2174 	},
2175 	{
2176 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2177 		.procname	=	"gc_min_interval",
2178          	.data		=	&ip6_rt_gc_min_interval,
2179 		.maxlen		=	sizeof(int),
2180 		.mode		=	0644,
2181          	.proc_handler	=	&proc_dointvec_jiffies,
2182 		.strategy	=	&sysctl_jiffies,
2183 	},
2184 	{
2185 		.ctl_name	=	NET_IPV6_ROUTE_GC_TIMEOUT,
2186 		.procname	=	"gc_timeout",
2187          	.data		=	&ip6_rt_gc_timeout,
2188 		.maxlen		=	sizeof(int),
2189 		.mode		=	0644,
2190          	.proc_handler	=	&proc_dointvec_jiffies,
2191 		.strategy	=	&sysctl_jiffies,
2192 	},
2193 	{
2194 		.ctl_name	=	NET_IPV6_ROUTE_GC_INTERVAL,
2195 		.procname	=	"gc_interval",
2196          	.data		=	&ip6_rt_gc_interval,
2197 		.maxlen		=	sizeof(int),
2198 		.mode		=	0644,
2199          	.proc_handler	=	&proc_dointvec_jiffies,
2200 		.strategy	=	&sysctl_jiffies,
2201 	},
2202 	{
2203 		.ctl_name	=	NET_IPV6_ROUTE_GC_ELASTICITY,
2204 		.procname	=	"gc_elasticity",
2205          	.data		=	&ip6_rt_gc_elasticity,
2206 		.maxlen		=	sizeof(int),
2207 		.mode		=	0644,
2208          	.proc_handler	=	&proc_dointvec_jiffies,
2209 		.strategy	=	&sysctl_jiffies,
2210 	},
2211 	{
2212 		.ctl_name	=	NET_IPV6_ROUTE_MTU_EXPIRES,
2213 		.procname	=	"mtu_expires",
2214          	.data		=	&ip6_rt_mtu_expires,
2215 		.maxlen		=	sizeof(int),
2216 		.mode		=	0644,
2217          	.proc_handler	=	&proc_dointvec_jiffies,
2218 		.strategy	=	&sysctl_jiffies,
2219 	},
2220 	{
2221 		.ctl_name	=	NET_IPV6_ROUTE_MIN_ADVMSS,
2222 		.procname	=	"min_adv_mss",
2223          	.data		=	&ip6_rt_min_advmss,
2224 		.maxlen		=	sizeof(int),
2225 		.mode		=	0644,
2226          	.proc_handler	=	&proc_dointvec_jiffies,
2227 		.strategy	=	&sysctl_jiffies,
2228 	},
2229 	{
2230 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2231 		.procname	=	"gc_min_interval_ms",
2232          	.data		=	&ip6_rt_gc_min_interval,
2233 		.maxlen		=	sizeof(int),
2234 		.mode		=	0644,
2235          	.proc_handler	=	&proc_dointvec_ms_jiffies,
2236 		.strategy	=	&sysctl_ms_jiffies,
2237 	},
2238 	{ .ctl_name = 0 }
2239 };
2240 
2241 #endif
2242 
2243 void __init ip6_route_init(void)
2244 {
2245 	struct proc_dir_entry *p;
2246 
2247 	ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2248 						     sizeof(struct rt6_info),
2249 						     0, SLAB_HWCACHE_ALIGN,
2250 						     NULL, NULL);
2251 	if (!ip6_dst_ops.kmem_cachep)
2252 		panic("cannot create ip6_dst_cache");
2253 
2254 	fib6_init();
2255 #ifdef 	CONFIG_PROC_FS
2256 	p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2257 	if (p)
2258 		p->owner = THIS_MODULE;
2259 
2260 	proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2261 #endif
2262 #ifdef CONFIG_XFRM
2263 	xfrm6_init();
2264 #endif
2265 }
2266 
/*
 * Tear down what ip6_route_init() set up: remove the proc entries, shut
 * down xfrm6 when configured, drop all routes (rt6_ifdown(NULL) matches
 * every device), stop the FIB garbage collector and destroy the dst
 * slab cache.
 */
void ip6_route_cleanup(void)
{
#ifdef CONFIG_PROC_FS
	proc_net_remove("ipv6_route");
	proc_net_remove("rt6_stats");
#endif
#ifdef CONFIG_XFRM
	xfrm6_fini();
#endif
	/* NULL device: delete the routes of all interfaces. */
	rt6_ifdown(NULL);
	fib6_gc_cleanup();
	kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
}
2280