xref: /linux/net/ipv6/route.c (revision f24e9f586b377749dff37554696cf3a105540c94)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15 
16 /*	Changes:
17  *
18  *	YOSHIFUJI Hideaki @USAGI
19  *		reworked default router selection.
20  *		- respect outgoing interface
21  *		- select from (probably) reachable routers (i.e.
22  *		routers in REACHABLE, STALE, DELAY or PROBE states).
23  *		- always select the same router if it is (probably)
24  *		reachable.  otherwise, round-robin the list.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/init.h>
38 #include <linux/netlink.h>
39 #include <linux/if_arp.h>
40 
41 #ifdef 	CONFIG_PROC_FS
42 #include <linux/proc_fs.h>
43 #include <linux/seq_file.h>
44 #endif
45 
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
/*
 * Debug verbosity for this file.  RDBG() and RT6_TRACE() only produce
 * output when RT6_DEBUG is 3 or higher; below that they expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * When non-zero, the lookup paths clone routes that already have a nexthop
 * (via rt6_alloc_clone()) instead of returning them directly.
 */
#define CLONE_OFFLINK_ROUTE 0

/* Bits accepted in the 'strict' argument of rt6_select()/rt6_score_route(). */
#define RT6_SELECT_F_IFACE	0x1	/* reject routes that do not match oif */
#define RT6_SELECT_F_REACHABLE	0x2	/* reject routes whose nexthop check fails */
79 
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
85 static int ip6_rt_mtu_expires = 10*60*HZ;
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87 
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void		ip6_dst_destroy(struct dst_entry *);
92 static void		ip6_dst_ifdown(struct dst_entry *,
93 				       struct net_device *dev, int how);
94 static int		 ip6_dst_gc(void);
95 
96 static int		ip6_pkt_discard(struct sk_buff *skb);
97 static int		ip6_pkt_discard_out(struct sk_buff *skb);
98 static void		ip6_link_failure(struct sk_buff *skb);
99 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100 
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103 					   struct in6_addr *gwaddr, int ifindex,
104 					   unsigned pref);
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106 					   struct in6_addr *gwaddr, int ifindex);
107 #endif
108 
109 static struct dst_ops ip6_dst_ops = {
110 	.family			=	AF_INET6,
111 	.protocol		=	__constant_htons(ETH_P_IPV6),
112 	.gc			=	ip6_dst_gc,
113 	.gc_thresh		=	1024,
114 	.check			=	ip6_dst_check,
115 	.destroy		=	ip6_dst_destroy,
116 	.ifdown			=	ip6_dst_ifdown,
117 	.negative_advice	=	ip6_negative_advice,
118 	.link_failure		=	ip6_link_failure,
119 	.update_pmtu		=	ip6_rt_update_pmtu,
120 	.entry_size		=	sizeof(struct rt6_info),
121 };
122 
123 struct rt6_info ip6_null_entry = {
124 	.u = {
125 		.dst = {
126 			.__refcnt	= ATOMIC_INIT(1),
127 			.__use		= 1,
128 			.dev		= &loopback_dev,
129 			.obsolete	= -1,
130 			.error		= -ENETUNREACH,
131 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
132 			.input		= ip6_pkt_discard,
133 			.output		= ip6_pkt_discard_out,
134 			.ops		= &ip6_dst_ops,
135 			.path		= (struct dst_entry*)&ip6_null_entry,
136 		}
137 	},
138 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
139 	.rt6i_metric	= ~(u32) 0,
140 	.rt6i_ref	= ATOMIC_INIT(1),
141 };
142 
143 struct fib6_node ip6_routing_table = {
144 	.leaf		= &ip6_null_entry,
145 	.fn_flags	= RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
146 };
147 
148 /* Protects all the ip6 fib */
149 
150 DEFINE_RWLOCK(rt6_lock);
151 
152 
153 /* allocate dst with ip6_dst_ops */
154 static __inline__ struct rt6_info *ip6_dst_alloc(void)
155 {
156 	return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
157 }
158 
159 static void ip6_dst_destroy(struct dst_entry *dst)
160 {
161 	struct rt6_info *rt = (struct rt6_info *)dst;
162 	struct inet6_dev *idev = rt->rt6i_idev;
163 
164 	if (idev != NULL) {
165 		rt->rt6i_idev = NULL;
166 		in6_dev_put(idev);
167 	}
168 }
169 
170 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
171 			   int how)
172 {
173 	struct rt6_info *rt = (struct rt6_info *)dst;
174 	struct inet6_dev *idev = rt->rt6i_idev;
175 
176 	if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
177 		struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
178 		if (loopback_idev != NULL) {
179 			rt->rt6i_idev = loopback_idev;
180 			in6_dev_put(idev);
181 		}
182 	}
183 }
184 
185 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
186 {
187 	return (rt->rt6i_flags & RTF_EXPIRES &&
188 		time_after(jiffies, rt->rt6i_expires));
189 }
190 
191 /*
192  *	Route lookup. Any rt6_lock is implied.
193  */
194 
195 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
196 						    int oif,
197 						    int strict)
198 {
199 	struct rt6_info *local = NULL;
200 	struct rt6_info *sprt;
201 
202 	if (oif) {
203 		for (sprt = rt; sprt; sprt = sprt->u.next) {
204 			struct net_device *dev = sprt->rt6i_dev;
205 			if (dev->ifindex == oif)
206 				return sprt;
207 			if (dev->flags & IFF_LOOPBACK) {
208 				if (sprt->rt6i_idev == NULL ||
209 				    sprt->rt6i_idev->dev->ifindex != oif) {
210 					if (strict && oif)
211 						continue;
212 					if (local && (!oif ||
213 						      local->rt6i_idev->dev->ifindex == oif))
214 						continue;
215 				}
216 				local = sprt;
217 			}
218 		}
219 
220 		if (local)
221 			return local;
222 
223 		if (strict)
224 			return &ip6_null_entry;
225 	}
226 	return rt;
227 }
228 
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the nexthop neighbour of @rt is not in
 * a NUD_VALID state, send a Neighbour Solicitation to its solicited-node
 * multicast address.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute; here a probe is sent only when the neighbour's 'updated' stamp
 * is older than the interface's rtr_probe_interval, and the stamp is
 * refreshed before sending.
 *
 * @rt may be NULL, in which case nothing happens.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int send_probe;

	if (rt == NULL)
		return;

	neigh = rt->rt6i_nexthop;
	if (neigh == NULL || (neigh->nud_state & NUD_VALID))
		return;

	/* Re-check the state and the rate limit under the neighbour lock. */
	read_lock_bh(&neigh->lock);
	send_probe = !(neigh->nud_state & NUD_VALID) &&
		     time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (send_probe)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	if (!send_probe)
		return;

	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
264 
265 /*
266  * Default Router Selection (RFC 2461 6.3.6)
267  */
268 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
269 {
270 	struct net_device *dev = rt->rt6i_dev;
271 	if (!oif || dev->ifindex == oif)
272 		return 2;
273 	if ((dev->flags & IFF_LOOPBACK) &&
274 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
275 		return 1;
276 	return 0;
277 }
278 
279 static int inline rt6_check_neigh(struct rt6_info *rt)
280 {
281 	struct neighbour *neigh = rt->rt6i_nexthop;
282 	int m = 0;
283 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
284 	    !(rt->rt6i_flags & RTF_GATEWAY))
285 		m = 1;
286 	else if (neigh) {
287 		read_lock_bh(&neigh->lock);
288 		if (neigh->nud_state & NUD_VALID)
289 			m = 2;
290 		read_unlock_bh(&neigh->lock);
291 	}
292 	return m;
293 }
294 
295 static int rt6_score_route(struct rt6_info *rt, int oif,
296 			   int strict)
297 {
298 	int m, n;
299 
300 	m = rt6_check_dev(rt, oif);
301 	if (!m && (strict & RT6_SELECT_F_IFACE))
302 		return -1;
303 #ifdef CONFIG_IPV6_ROUTER_PREF
304 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
305 #endif
306 	n = rt6_check_neigh(rt);
307 	if (n > 1)
308 		m |= 16;
309 	else if (!n && strict & RT6_SELECT_F_REACHABLE)
310 		return -1;
311 	return m;
312 }
313 
314 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
315 				   int strict)
316 {
317 	struct rt6_info *match = NULL, *last = NULL;
318 	struct rt6_info *rt, *rt0 = *head;
319 	u32 metric;
320 	int mpri = -1;
321 
322 	RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
323 		  __FUNCTION__, head, head ? *head : NULL, oif);
324 
325 	for (rt = rt0, metric = rt0->rt6i_metric;
326 	     rt && rt->rt6i_metric == metric && (!last || rt != rt0);
327 	     rt = rt->u.next) {
328 		int m;
329 
330 		if (rt6_check_expired(rt))
331 			continue;
332 
333 		last = rt;
334 
335 		m = rt6_score_route(rt, oif, strict);
336 		if (m < 0)
337 			continue;
338 
339 		if (m > mpri) {
340 			rt6_probe(match);
341 			match = rt;
342 			mpri = m;
343 		} else {
344 			rt6_probe(rt);
345 		}
346 	}
347 
348 	if (!match &&
349 	    (strict & RT6_SELECT_F_REACHABLE) &&
350 	    last && last != rt0) {
351 		/* no entries matched; do round-robin */
352 		static DEFINE_SPINLOCK(lock);
353 		spin_lock(&lock);
354 		*head = rt0->u.next;
355 		rt0->u.next = last->u.next;
356 		last->u.next = rt0;
357 		spin_unlock(&lock);
358 	}
359 
360 	RT6_TRACE("%s() => %p, score=%d\n",
361 		  __FUNCTION__, match, mpri);
362 
363 	return (match ? match : &ip6_null_entry);
364 }
365 
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process one Route Information option received in a Router Advertisement.
 *
 * @dev:    interface the advertisement arrived on
 * @opt:    start of the option (interpreted as struct route_info)
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * Adds, refreshes or (for a zero lifetime) deletes the matching route.
 * Returns 0 on success or when the option was acted upon, -EINVAL when the
 * option is malformed or must be ignored.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	/*
	 * RFC 4191 2.3: an option carrying the reserved preference value
	 * MUST be ignored, so do not coerce it to "medium".
	 */
	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	/* The lifetime arrives in network byte order. */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt, NULL, NULL, NULL);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
443 
444 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
445 			    int oif, int strict)
446 {
447 	struct fib6_node *fn;
448 	struct rt6_info *rt;
449 
450 	read_lock_bh(&rt6_lock);
451 	fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
452 	rt = rt6_device_match(fn->leaf, oif, strict);
453 	dst_hold(&rt->u.dst);
454 	rt->u.dst.__use++;
455 	read_unlock_bh(&rt6_lock);
456 
457 	rt->u.dst.lastuse = jiffies;
458 	if (rt->u.dst.error == 0)
459 		return rt;
460 	dst_release(&rt->u.dst);
461 	return NULL;
462 }
463 
464 /* ip6_ins_rt is called with FREE rt6_lock.
465    It takes new route entry, the addition fails by any reason the
466    route is freed. In any case, if caller does not hold it, it may
467    be destroyed.
468  */
469 
470 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
471 		void *_rtattr, struct netlink_skb_parms *req)
472 {
473 	int err;
474 
475 	write_lock_bh(&rt6_lock);
476 	err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
477 	write_unlock_bh(&rt6_lock);
478 
479 	return err;
480 }
481 
482 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
483 				      struct in6_addr *saddr)
484 {
485 	struct rt6_info *rt;
486 
487 	/*
488 	 *	Clone the route.
489 	 */
490 
491 	rt = ip6_rt_copy(ort);
492 
493 	if (rt) {
494 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
495 			if (rt->rt6i_dst.plen != 128 &&
496 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
497 				rt->rt6i_flags |= RTF_ANYCAST;
498 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
499 		}
500 
501 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
502 		rt->rt6i_dst.plen = 128;
503 		rt->rt6i_flags |= RTF_CACHE;
504 		rt->u.dst.flags |= DST_HOST;
505 
506 #ifdef CONFIG_IPV6_SUBTREES
507 		if (rt->rt6i_src.plen && saddr) {
508 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
509 			rt->rt6i_src.plen = 128;
510 		}
511 #endif
512 
513 		rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
514 
515 	}
516 
517 	return rt;
518 }
519 
520 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
521 {
522 	struct rt6_info *rt = ip6_rt_copy(ort);
523 	if (rt) {
524 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
525 		rt->rt6i_dst.plen = 128;
526 		rt->rt6i_flags |= RTF_CACHE;
527 		if (rt->rt6i_flags & RTF_REJECT)
528 			rt->u.dst.error = ort->u.dst.error;
529 		rt->u.dst.flags |= DST_HOST;
530 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
531 	}
532 	return rt;
533 }
534 
/*
 * Used by the lookup paths after rt6_select(): if the selected route is
 * ip6_null_entry, climb towards the root of the tree.  Reaching a root node
 * jumps to the caller's 'out' label; reaching a node that carries route
 * information jumps to the caller's 'restart' label to select again there.
 *
 * Expects 'rt' and 'fn' variables and 'out'/'restart' labels in the caller.
 * Wrapped in do { } while (0) so it behaves as a single statement.
 */
#define BACKTRACK() \
do { \
	if (rt == &ip6_null_entry) { \
		while ((fn = fn->parent) != NULL) { \
			if (fn->fn_flags & RTN_ROOT) { \
				goto out; \
			} \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
545 
546 
547 void ip6_route_input(struct sk_buff *skb)
548 {
549 	struct fib6_node *fn;
550 	struct rt6_info *rt, *nrt;
551 	int strict;
552 	int attempts = 3;
553 	int err;
554 	int reachable = RT6_SELECT_F_REACHABLE;
555 
556 	strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
557 
558 relookup:
559 	read_lock_bh(&rt6_lock);
560 
561 restart_2:
562 	fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
563 			 &skb->nh.ipv6h->saddr);
564 
565 restart:
566 	rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
567 	BACKTRACK();
568 	if (rt == &ip6_null_entry ||
569 	    rt->rt6i_flags & RTF_CACHE)
570 		goto out;
571 
572 	dst_hold(&rt->u.dst);
573 	read_unlock_bh(&rt6_lock);
574 
575 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
576 		nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
577 	else {
578 #if CLONE_OFFLINK_ROUTE
579 		nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
580 #else
581 		goto out2;
582 #endif
583 	}
584 
585 	dst_release(&rt->u.dst);
586 	rt = nrt ? : &ip6_null_entry;
587 
588 	dst_hold(&rt->u.dst);
589 	if (nrt) {
590 		err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
591 		if (!err)
592 			goto out2;
593 	}
594 
595 	if (--attempts <= 0)
596 		goto out2;
597 
598 	/*
599 	 * Race condition! In the gap, when rt6_lock was
600 	 * released someone could insert this route.  Relookup.
601 	 */
602 	dst_release(&rt->u.dst);
603 	goto relookup;
604 
605 out:
606 	if (reachable) {
607 		reachable = 0;
608 		goto restart_2;
609 	}
610 	dst_hold(&rt->u.dst);
611 	read_unlock_bh(&rt6_lock);
612 out2:
613 	rt->u.dst.lastuse = jiffies;
614 	rt->u.dst.__use++;
615 	skb->dst = (struct dst_entry *) rt;
616 	return;
617 }
618 
619 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
620 {
621 	struct fib6_node *fn;
622 	struct rt6_info *rt, *nrt;
623 	int strict;
624 	int attempts = 3;
625 	int err;
626 	int reachable = RT6_SELECT_F_REACHABLE;
627 
628 	strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
629 
630 relookup:
631 	read_lock_bh(&rt6_lock);
632 
633 restart_2:
634 	fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
635 
636 restart:
637 	rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
638 	BACKTRACK();
639 	if (rt == &ip6_null_entry ||
640 	    rt->rt6i_flags & RTF_CACHE)
641 		goto out;
642 
643 	dst_hold(&rt->u.dst);
644 	read_unlock_bh(&rt6_lock);
645 
646 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
647 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
648 	else {
649 #if CLONE_OFFLINK_ROUTE
650 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
651 #else
652 		goto out2;
653 #endif
654 	}
655 
656 	dst_release(&rt->u.dst);
657 	rt = nrt ? : &ip6_null_entry;
658 
659 	dst_hold(&rt->u.dst);
660 	if (nrt) {
661 		err = ip6_ins_rt(nrt, NULL, NULL, NULL);
662 		if (!err)
663 			goto out2;
664 	}
665 
666 	if (--attempts <= 0)
667 		goto out2;
668 
669 	/*
670 	 * Race condition! In the gap, when rt6_lock was
671 	 * released someone could insert this route.  Relookup.
672 	 */
673 	dst_release(&rt->u.dst);
674 	goto relookup;
675 
676 out:
677 	if (reachable) {
678 		reachable = 0;
679 		goto restart_2;
680 	}
681 	dst_hold(&rt->u.dst);
682 	read_unlock_bh(&rt6_lock);
683 out2:
684 	rt->u.dst.lastuse = jiffies;
685 	rt->u.dst.__use++;
686 	return &rt->u.dst;
687 }
688 
689 
690 /*
691  *	Destination cache support functions
692  */
693 
694 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
695 {
696 	struct rt6_info *rt;
697 
698 	rt = (struct rt6_info *) dst;
699 
700 	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
701 		return dst;
702 
703 	return NULL;
704 }
705 
706 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
707 {
708 	struct rt6_info *rt = (struct rt6_info *) dst;
709 
710 	if (rt) {
711 		if (rt->rt6i_flags & RTF_CACHE)
712 			ip6_del_rt(rt, NULL, NULL, NULL);
713 		else
714 			dst_release(dst);
715 	}
716 	return NULL;
717 }
718 
719 static void ip6_link_failure(struct sk_buff *skb)
720 {
721 	struct rt6_info *rt;
722 
723 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
724 
725 	rt = (struct rt6_info *) skb->dst;
726 	if (rt) {
727 		if (rt->rt6i_flags&RTF_CACHE) {
728 			dst_set_expires(&rt->u.dst, 0);
729 			rt->rt6i_flags |= RTF_EXPIRES;
730 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
731 			rt->rt6i_node->fn_sernum = -1;
732 	}
733 }
734 
735 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
736 {
737 	struct rt6_info *rt6 = (struct rt6_info*)dst;
738 
739 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
740 		rt6->rt6i_flags |= RTF_MODIFIED;
741 		if (mtu < IPV6_MIN_MTU) {
742 			mtu = IPV6_MIN_MTU;
743 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
744 		}
745 		dst->metrics[RTAX_MTU-1] = mtu;
746 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
747 	}
748 }
749 
750 /* Protected by rt6_lock.  */
751 static struct dst_entry *ndisc_dst_gc_list;
752 static int ipv6_get_mtu(struct net_device *dev);
753 
754 static inline unsigned int ipv6_advmss(unsigned int mtu)
755 {
756 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
757 
758 	if (mtu < ip6_rt_min_advmss)
759 		mtu = ip6_rt_min_advmss;
760 
761 	/*
762 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
763 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
764 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
765 	 * rely only on pmtu discovery"
766 	 */
767 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
768 		mtu = IPV6_MAXPLEN;
769 	return mtu;
770 }
771 
772 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
773 				  struct neighbour *neigh,
774 				  struct in6_addr *addr,
775 				  int (*output)(struct sk_buff *))
776 {
777 	struct rt6_info *rt;
778 	struct inet6_dev *idev = in6_dev_get(dev);
779 
780 	if (unlikely(idev == NULL))
781 		return NULL;
782 
783 	rt = ip6_dst_alloc();
784 	if (unlikely(rt == NULL)) {
785 		in6_dev_put(idev);
786 		goto out;
787 	}
788 
789 	dev_hold(dev);
790 	if (neigh)
791 		neigh_hold(neigh);
792 	else
793 		neigh = ndisc_get_neigh(dev, addr);
794 
795 	rt->rt6i_dev	  = dev;
796 	rt->rt6i_idev     = idev;
797 	rt->rt6i_nexthop  = neigh;
798 	atomic_set(&rt->u.dst.__refcnt, 1);
799 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
800 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
801 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
802 	rt->u.dst.output  = output;
803 
804 #if 0	/* there's no chance to use these for ndisc */
805 	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
806 				? DST_HOST
807 				: 0;
808 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
809 	rt->rt6i_dst.plen = 128;
810 #endif
811 
812 	write_lock_bh(&rt6_lock);
813 	rt->u.dst.next = ndisc_dst_gc_list;
814 	ndisc_dst_gc_list = &rt->u.dst;
815 	write_unlock_bh(&rt6_lock);
816 
817 	fib6_force_start_gc();
818 
819 out:
820 	return (struct dst_entry *)rt;
821 }
822 
823 int ndisc_dst_gc(int *more)
824 {
825 	struct dst_entry *dst, *next, **pprev;
826 	int freed;
827 
828 	next = NULL;
829 	pprev = &ndisc_dst_gc_list;
830 	freed = 0;
831 	while ((dst = *pprev) != NULL) {
832 		if (!atomic_read(&dst->__refcnt)) {
833 			*pprev = dst->next;
834 			dst_free(dst);
835 			freed++;
836 		} else {
837 			pprev = &dst->next;
838 			(*more)++;
839 		}
840 	}
841 
842 	return freed;
843 }
844 
845 static int ip6_dst_gc(void)
846 {
847 	static unsigned expire = 30*HZ;
848 	static unsigned long last_gc;
849 	unsigned long now = jiffies;
850 
851 	if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
852 	    atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
853 		goto out;
854 
855 	expire++;
856 	fib6_run_gc(expire);
857 	last_gc = now;
858 	if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
859 		expire = ip6_rt_gc_timeout>>1;
860 
861 out:
862 	expire -= expire>>ip6_rt_gc_elasticity;
863 	return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
864 }
865 
866 /* Clean host part of a prefix. Not necessary in radix tree,
867    but results in cleaner routing tables.
868 
869    Remove it only when all the things will work!
870  */
871 
872 static int ipv6_get_mtu(struct net_device *dev)
873 {
874 	int mtu = IPV6_MIN_MTU;
875 	struct inet6_dev *idev;
876 
877 	idev = in6_dev_get(dev);
878 	if (idev) {
879 		mtu = idev->cnf.mtu6;
880 		in6_dev_put(idev);
881 	}
882 	return mtu;
883 }
884 
885 int ipv6_get_hoplimit(struct net_device *dev)
886 {
887 	int hoplimit = ipv6_devconf.hop_limit;
888 	struct inet6_dev *idev;
889 
890 	idev = in6_dev_get(dev);
891 	if (idev) {
892 		hoplimit = idev->cnf.hop_limit;
893 		in6_dev_put(idev);
894 	}
895 	return hoplimit;
896 }
897 
898 /*
899  *
900  */
901 
902 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
903 		void *_rtattr, struct netlink_skb_parms *req)
904 {
905 	int err;
906 	struct rtmsg *r;
907 	struct rtattr **rta;
908 	struct rt6_info *rt = NULL;
909 	struct net_device *dev = NULL;
910 	struct inet6_dev *idev = NULL;
911 	int addr_type;
912 
913 	rta = (struct rtattr **) _rtattr;
914 
915 	if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
916 		return -EINVAL;
917 #ifndef CONFIG_IPV6_SUBTREES
918 	if (rtmsg->rtmsg_src_len)
919 		return -EINVAL;
920 #endif
921 	if (rtmsg->rtmsg_ifindex) {
922 		err = -ENODEV;
923 		dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
924 		if (!dev)
925 			goto out;
926 		idev = in6_dev_get(dev);
927 		if (!idev)
928 			goto out;
929 	}
930 
931 	if (rtmsg->rtmsg_metric == 0)
932 		rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
933 
934 	rt = ip6_dst_alloc();
935 
936 	if (rt == NULL) {
937 		err = -ENOMEM;
938 		goto out;
939 	}
940 
941 	rt->u.dst.obsolete = -1;
942 	rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
943 	if (nlh && (r = NLMSG_DATA(nlh))) {
944 		rt->rt6i_protocol = r->rtm_protocol;
945 	} else {
946 		rt->rt6i_protocol = RTPROT_BOOT;
947 	}
948 
949 	addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
950 
951 	if (addr_type & IPV6_ADDR_MULTICAST)
952 		rt->u.dst.input = ip6_mc_input;
953 	else
954 		rt->u.dst.input = ip6_forward;
955 
956 	rt->u.dst.output = ip6_output;
957 
958 	ipv6_addr_prefix(&rt->rt6i_dst.addr,
959 			 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
960 	rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
961 	if (rt->rt6i_dst.plen == 128)
962 	       rt->u.dst.flags = DST_HOST;
963 
964 #ifdef CONFIG_IPV6_SUBTREES
965 	ipv6_addr_prefix(&rt->rt6i_src.addr,
966 			 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
967 	rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
968 #endif
969 
970 	rt->rt6i_metric = rtmsg->rtmsg_metric;
971 
972 	/* We cannot add true routes via loopback here,
973 	   they would result in kernel looping; promote them to reject routes
974 	 */
975 	if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
976 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
977 		/* hold loopback dev/idev if we haven't done so. */
978 		if (dev != &loopback_dev) {
979 			if (dev) {
980 				dev_put(dev);
981 				in6_dev_put(idev);
982 			}
983 			dev = &loopback_dev;
984 			dev_hold(dev);
985 			idev = in6_dev_get(dev);
986 			if (!idev) {
987 				err = -ENODEV;
988 				goto out;
989 			}
990 		}
991 		rt->u.dst.output = ip6_pkt_discard_out;
992 		rt->u.dst.input = ip6_pkt_discard;
993 		rt->u.dst.error = -ENETUNREACH;
994 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
995 		goto install_route;
996 	}
997 
998 	if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
999 		struct in6_addr *gw_addr;
1000 		int gwa_type;
1001 
1002 		gw_addr = &rtmsg->rtmsg_gateway;
1003 		ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
1004 		gwa_type = ipv6_addr_type(gw_addr);
1005 
1006 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1007 			struct rt6_info *grt;
1008 
1009 			/* IPv6 strictly inhibits using not link-local
1010 			   addresses as nexthop address.
1011 			   Otherwise, router will not able to send redirects.
1012 			   It is very good, but in some (rare!) circumstances
1013 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1014 			   some exceptions. --ANK
1015 			 */
1016 			err = -EINVAL;
1017 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1018 				goto out;
1019 
1020 			grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1021 
1022 			err = -EHOSTUNREACH;
1023 			if (grt == NULL)
1024 				goto out;
1025 			if (dev) {
1026 				if (dev != grt->rt6i_dev) {
1027 					dst_release(&grt->u.dst);
1028 					goto out;
1029 				}
1030 			} else {
1031 				dev = grt->rt6i_dev;
1032 				idev = grt->rt6i_idev;
1033 				dev_hold(dev);
1034 				in6_dev_hold(grt->rt6i_idev);
1035 			}
1036 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1037 				err = 0;
1038 			dst_release(&grt->u.dst);
1039 
1040 			if (err)
1041 				goto out;
1042 		}
1043 		err = -EINVAL;
1044 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1045 			goto out;
1046 	}
1047 
1048 	err = -ENODEV;
1049 	if (dev == NULL)
1050 		goto out;
1051 
1052 	if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1053 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1054 		if (IS_ERR(rt->rt6i_nexthop)) {
1055 			err = PTR_ERR(rt->rt6i_nexthop);
1056 			rt->rt6i_nexthop = NULL;
1057 			goto out;
1058 		}
1059 	}
1060 
1061 	rt->rt6i_flags = rtmsg->rtmsg_flags;
1062 
1063 install_route:
1064 	if (rta && rta[RTA_METRICS-1]) {
1065 		int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1066 		struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1067 
1068 		while (RTA_OK(attr, attrlen)) {
1069 			unsigned flavor = attr->rta_type;
1070 			if (flavor) {
1071 				if (flavor > RTAX_MAX) {
1072 					err = -EINVAL;
1073 					goto out;
1074 				}
1075 				rt->u.dst.metrics[flavor-1] =
1076 					*(u32 *)RTA_DATA(attr);
1077 			}
1078 			attr = RTA_NEXT(attr, attrlen);
1079 		}
1080 	}
1081 
1082 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1083 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1084 	if (!rt->u.dst.metrics[RTAX_MTU-1])
1085 		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1086 	if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1087 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1088 	rt->u.dst.dev = dev;
1089 	rt->rt6i_idev = idev;
1090 	return ip6_ins_rt(rt, nlh, _rtattr, req);
1091 
1092 out:
1093 	if (dev)
1094 		dev_put(dev);
1095 	if (idev)
1096 		in6_dev_put(idev);
1097 	if (rt)
1098 		dst_free((struct dst_entry *) rt);
1099 	return err;
1100 }
1101 
1102 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1103 {
1104 	int err;
1105 
1106 	write_lock_bh(&rt6_lock);
1107 
1108 	err = fib6_del(rt, nlh, _rtattr, req);
1109 	dst_release(&rt->u.dst);
1110 
1111 	write_unlock_bh(&rt6_lock);
1112 
1113 	return err;
1114 }
1115 
1116 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1117 {
1118 	struct fib6_node *fn;
1119 	struct rt6_info *rt;
1120 	int err = -ESRCH;
1121 
1122 	read_lock_bh(&rt6_lock);
1123 
1124 	fn = fib6_locate(&ip6_routing_table,
1125 			 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1126 			 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1127 
1128 	if (fn) {
1129 		for (rt = fn->leaf; rt; rt = rt->u.next) {
1130 			if (rtmsg->rtmsg_ifindex &&
1131 			    (rt->rt6i_dev == NULL ||
1132 			     rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1133 				continue;
1134 			if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1135 			    !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1136 				continue;
1137 			if (rtmsg->rtmsg_metric &&
1138 			    rtmsg->rtmsg_metric != rt->rt6i_metric)
1139 				continue;
1140 			dst_hold(&rt->u.dst);
1141 			read_unlock_bh(&rt6_lock);
1142 
1143 			return ip6_del_rt(rt, nlh, _rtattr, req);
1144 		}
1145 	}
1146 	read_unlock_bh(&rt6_lock);
1147 
1148 	return err;
1149 }
1150 
/*
 *	Handle redirects
 *
 *	dest:	 destination address the redirect is about
 *	saddr:	 address the redirect came from; it must equal the gateway
 *		 of an existing, unexpired gateway route for dest on
 *		 neigh->dev, otherwise the redirect is dropped
 *	neigh:	 neighbour entry of the new next hop
 *	lladdr:	 link-layer address handed to neigh_update()
 *	on_link: when non-zero the new route is installed without
 *		 RTF_GATEWAY and the neighbour is not flagged as a router
 *
 *	On acceptance a /128 RTF_DYNAMIC|RTF_CACHE copy of the matched route
 *	pointing at neigh is inserted, NETEVENT_REDIRECT is raised, and the
 *	matched route is deleted if it was itself a cache entry.
 */
void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
		  struct neighbour *neigh, u8 *lladdr, int on_link)
{
	struct rt6_info *rt, *nrt = NULL;
	int strict;
	struct fib6_node *fn;
	struct netevent_redirect netevent;

	/*
	 * Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 2461 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */
	strict = ipv6_addr_type(dest) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL);

	read_lock_bh(&rt6_lock);
	fn = fib6_lookup(&ip6_routing_table, dest, NULL);
restart:
	for (rt = fn->leaf; rt; rt = rt->u.next) {
		/*
		 * Current route is on-link; redirect is always invalid.
		 *
		 * Seems, previous statement is not true. It could
		 * be node, which looks for us as on-link (f.e. proxy ndisc)
		 * But then router serving it might decide, that we should
		 * know truth 8)8) --ANK (980726).
		 */
		if (rt6_check_expired(rt))
			continue;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (neigh->dev != rt->rt6i_dev)
			continue;
		if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
			continue;
		break;
	}
	if (rt)
		dst_hold(&rt->u.dst);	/* keep rt alive once the lock is dropped */
	else if (strict) {
		/*
		 * Nothing matched in this node: for multicast/link-local
		 * destinations walk up towards the root and retry on the
		 * first ancestor that carries routes.
		 */
		while ((fn = fn->parent) != NULL) {
			if (fn->fn_flags & RTN_ROOT)
				break;
			if (fn->fn_flags & RTN_RTINFO)
				goto restart;
		}
	}
	read_unlock_bh(&rt6_lock);

	if (!rt) {
		if (net_ratelimit())
			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
			       "for redirect target\n");
		return;
	}

	/*
	 *	We have finally decided to accept it.
	 */

	neigh_update(neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER))
		     );

	/*
	 * Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->u.dst);

	/* Duplicate redirect: silently ignore. */
	if (neigh == rt->u.dst.neighbour)
		goto out;

	nrt = ip6_rt_copy(rt);
	if (nrt == NULL)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	/* The new entry is a host route for exactly this destination. */
	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
	nrt->rt6i_dst.plen = 128;
	nrt->u.dst.flags |= DST_HOST;

	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
	nrt->rt6i_nexthop = neigh_clone(neigh);
	/* Reset pmtu, it may be better */
	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));

	if (ip6_ins_rt(nrt, NULL, NULL, NULL))
		goto out;

	netevent.old = &rt->u.dst;
	netevent.new = &nrt->u.dst;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

	/*
	 * NOTE(review): this path returns without dst_release(), so
	 * ip6_del_rt() presumably drops the reference taken above -- confirm.
	 */
	if (rt->rt6i_flags&RTF_CACHE) {
		ip6_del_rt(rt, NULL, NULL, NULL);
		return;
	}

out:
        dst_release(&rt->u.dst);
	return;
}
1271 
1272 /*
1273  *	Handle ICMP "packet too big" messages
1274  *	i.e. Path MTU discovery
1275  */
1276 
1277 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1278 			struct net_device *dev, u32 pmtu)
1279 {
1280 	struct rt6_info *rt, *nrt;
1281 	int allfrag = 0;
1282 
1283 	rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1284 	if (rt == NULL)
1285 		return;
1286 
1287 	if (pmtu >= dst_mtu(&rt->u.dst))
1288 		goto out;
1289 
1290 	if (pmtu < IPV6_MIN_MTU) {
1291 		/*
1292 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1293 		 * MTU (1280) and a fragment header should always be included
1294 		 * after a node receiving Too Big message reporting PMTU is
1295 		 * less than the IPv6 Minimum Link MTU.
1296 		 */
1297 		pmtu = IPV6_MIN_MTU;
1298 		allfrag = 1;
1299 	}
1300 
1301 	/* New mtu received -> path was valid.
1302 	   They are sent only in response to data packets,
1303 	   so that this nexthop apparently is reachable. --ANK
1304 	 */
1305 	dst_confirm(&rt->u.dst);
1306 
1307 	/* Host route. If it is static, it would be better
1308 	   not to override it, but add new one, so that
1309 	   when cache entry will expire old pmtu
1310 	   would return automatically.
1311 	 */
1312 	if (rt->rt6i_flags & RTF_CACHE) {
1313 		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1314 		if (allfrag)
1315 			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1316 		dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1317 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1318 		goto out;
1319 	}
1320 
1321 	/* Network route.
1322 	   Two cases are possible:
1323 	   1. It is connected route. Action: COW
1324 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1325 	 */
1326 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1327 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1328 	else
1329 		nrt = rt6_alloc_clone(rt, daddr);
1330 
1331 	if (nrt) {
1332 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1333 		if (allfrag)
1334 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1335 
1336 		/* According to RFC 1981, detecting PMTU increase shouldn't be
1337 		 * happened within 5 mins, the recommended timer is 10 mins.
1338 		 * Here this route expiration time is set to ip6_rt_mtu_expires
1339 		 * which is 10 mins. After 10 mins the decreased pmtu is expired
1340 		 * and detecting PMTU increase will be automatically happened.
1341 		 */
1342 		dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1343 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1344 
1345 		ip6_ins_rt(nrt, NULL, NULL, NULL);
1346 	}
1347 out:
1348 	dst_release(&rt->u.dst);
1349 }
1350 
1351 /*
1352  *	Misc support functions
1353  */
1354 
1355 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1356 {
1357 	struct rt6_info *rt = ip6_dst_alloc();
1358 
1359 	if (rt) {
1360 		rt->u.dst.input = ort->u.dst.input;
1361 		rt->u.dst.output = ort->u.dst.output;
1362 
1363 		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1364 		rt->u.dst.dev = ort->u.dst.dev;
1365 		if (rt->u.dst.dev)
1366 			dev_hold(rt->u.dst.dev);
1367 		rt->rt6i_idev = ort->rt6i_idev;
1368 		if (rt->rt6i_idev)
1369 			in6_dev_hold(rt->rt6i_idev);
1370 		rt->u.dst.lastuse = jiffies;
1371 		rt->rt6i_expires = 0;
1372 
1373 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1374 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1375 		rt->rt6i_metric = 0;
1376 
1377 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1378 #ifdef CONFIG_IPV6_SUBTREES
1379 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1380 #endif
1381 	}
1382 	return rt;
1383 }
1384 
1385 #ifdef CONFIG_IPV6_ROUTE_INFO
1386 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1387 					   struct in6_addr *gwaddr, int ifindex)
1388 {
1389 	struct fib6_node *fn;
1390 	struct rt6_info *rt = NULL;
1391 
1392 	write_lock_bh(&rt6_lock);
1393 	fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1394 	if (!fn)
1395 		goto out;
1396 
1397 	for (rt = fn->leaf; rt; rt = rt->u.next) {
1398 		if (rt->rt6i_dev->ifindex != ifindex)
1399 			continue;
1400 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1401 			continue;
1402 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1403 			continue;
1404 		dst_hold(&rt->u.dst);
1405 		break;
1406 	}
1407 out:
1408 	write_unlock_bh(&rt6_lock);
1409 	return rt;
1410 }
1411 
1412 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1413 					   struct in6_addr *gwaddr, int ifindex,
1414 					   unsigned pref)
1415 {
1416 	struct in6_rtmsg rtmsg;
1417 
1418 	memset(&rtmsg, 0, sizeof(rtmsg));
1419 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1420 	ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1421 	rtmsg.rtmsg_dst_len = prefixlen;
1422 	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1423 	rtmsg.rtmsg_metric = 1024;
1424 	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1425 	/* We should treat it as a default route if prefix length is 0. */
1426 	if (!prefixlen)
1427 		rtmsg.rtmsg_flags |= RTF_DEFAULT;
1428 	rtmsg.rtmsg_ifindex = ifindex;
1429 
1430 	ip6_route_add(&rtmsg, NULL, NULL, NULL);
1431 
1432 	return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1433 }
1434 #endif
1435 
1436 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1437 {
1438 	struct rt6_info *rt;
1439 	struct fib6_node *fn;
1440 
1441 	fn = &ip6_routing_table;
1442 
1443 	write_lock_bh(&rt6_lock);
1444 	for (rt = fn->leaf; rt; rt=rt->u.next) {
1445 		if (dev == rt->rt6i_dev &&
1446 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1447 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1448 			break;
1449 	}
1450 	if (rt)
1451 		dst_hold(&rt->u.dst);
1452 	write_unlock_bh(&rt6_lock);
1453 	return rt;
1454 }
1455 
1456 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1457 				     struct net_device *dev,
1458 				     unsigned int pref)
1459 {
1460 	struct in6_rtmsg rtmsg;
1461 
1462 	memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1463 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1464 	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1465 	rtmsg.rtmsg_metric = 1024;
1466 	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1467 			    RTF_PREF(pref);
1468 
1469 	rtmsg.rtmsg_ifindex = dev->ifindex;
1470 
1471 	ip6_route_add(&rtmsg, NULL, NULL, NULL);
1472 	return rt6_get_dflt_router(gwaddr, dev);
1473 }
1474 
1475 void rt6_purge_dflt_routers(void)
1476 {
1477 	struct rt6_info *rt;
1478 
1479 restart:
1480 	read_lock_bh(&rt6_lock);
1481 	for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1482 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1483 			dst_hold(&rt->u.dst);
1484 
1485 			read_unlock_bh(&rt6_lock);
1486 
1487 			ip6_del_rt(rt, NULL, NULL, NULL);
1488 
1489 			goto restart;
1490 		}
1491 	}
1492 	read_unlock_bh(&rt6_lock);
1493 }
1494 
1495 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1496 {
1497 	struct in6_rtmsg rtmsg;
1498 	int err;
1499 
1500 	switch(cmd) {
1501 	case SIOCADDRT:		/* Add a route */
1502 	case SIOCDELRT:		/* Delete a route */
1503 		if (!capable(CAP_NET_ADMIN))
1504 			return -EPERM;
1505 		err = copy_from_user(&rtmsg, arg,
1506 				     sizeof(struct in6_rtmsg));
1507 		if (err)
1508 			return -EFAULT;
1509 
1510 		rtnl_lock();
1511 		switch (cmd) {
1512 		case SIOCADDRT:
1513 			err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1514 			break;
1515 		case SIOCDELRT:
1516 			err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1517 			break;
1518 		default:
1519 			err = -EINVAL;
1520 		}
1521 		rtnl_unlock();
1522 
1523 		return err;
1524 	};
1525 
1526 	return -EINVAL;
1527 }
1528 
1529 /*
1530  *	Drop the packet on the floor
1531  */
1532 
1533 static int ip6_pkt_discard(struct sk_buff *skb)
1534 {
1535 	int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1536 	if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1537 		IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1538 
1539 	IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1540 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1541 	kfree_skb(skb);
1542 	return 0;
1543 }
1544 
1545 static int ip6_pkt_discard_out(struct sk_buff *skb)
1546 {
1547 	skb->dev = skb->dst->dev;
1548 	return ip6_pkt_discard(skb);
1549 }
1550 
1551 /*
1552  *	Allocate a dst for local (unicast / anycast) address.
1553  */
1554 
1555 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1556 				    const struct in6_addr *addr,
1557 				    int anycast)
1558 {
1559 	struct rt6_info *rt = ip6_dst_alloc();
1560 
1561 	if (rt == NULL)
1562 		return ERR_PTR(-ENOMEM);
1563 
1564 	dev_hold(&loopback_dev);
1565 	in6_dev_hold(idev);
1566 
1567 	rt->u.dst.flags = DST_HOST;
1568 	rt->u.dst.input = ip6_input;
1569 	rt->u.dst.output = ip6_output;
1570 	rt->rt6i_dev = &loopback_dev;
1571 	rt->rt6i_idev = idev;
1572 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1573 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1574 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1575 	rt->u.dst.obsolete = -1;
1576 
1577 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1578 	if (anycast)
1579 		rt->rt6i_flags |= RTF_ANYCAST;
1580 	else
1581 		rt->rt6i_flags |= RTF_LOCAL;
1582 	rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1583 	if (rt->rt6i_nexthop == NULL) {
1584 		dst_free((struct dst_entry *) rt);
1585 		return ERR_PTR(-ENOMEM);
1586 	}
1587 
1588 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1589 	rt->rt6i_dst.plen = 128;
1590 
1591 	atomic_set(&rt->u.dst.__refcnt, 1);
1592 
1593 	return rt;
1594 }
1595 
1596 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1597 {
1598 	if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1599 	    rt != &ip6_null_entry) {
1600 		RT6_TRACE("deleted by ifdown %p\n", rt);
1601 		return -1;
1602 	}
1603 	return 0;
1604 }
1605 
1606 void rt6_ifdown(struct net_device *dev)
1607 {
1608 	write_lock_bh(&rt6_lock);
1609 	fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1610 	write_unlock_bh(&rt6_lock);
1611 }
1612 
/* Argument bundle handed from rt6_mtu_change() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU */
};
1618 
1619 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1620 {
1621 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1622 	struct inet6_dev *idev;
1623 
1624 	/* In IPv6 pmtu discovery is not optional,
1625 	   so that RTAX_MTU lock cannot disable it.
1626 	   We still use this lock to block changes
1627 	   caused by addrconf/ndisc.
1628 	*/
1629 
1630 	idev = __in6_dev_get(arg->dev);
1631 	if (idev == NULL)
1632 		return 0;
1633 
1634 	/* For administrative MTU increase, there is no way to discover
1635 	   IPv6 PMTU increase, so PMTU increase should be updated here.
1636 	   Since RFC 1981 doesn't include administrative MTU increase
1637 	   update PMTU increase is a MUST. (i.e. jumbo frame)
1638 	 */
1639 	/*
1640 	   If new MTU is less than route PMTU, this new MTU will be the
1641 	   lowest MTU in the path, update the route PMTU to reflect PMTU
1642 	   decreases; if new MTU is greater than route PMTU, and the
1643 	   old MTU is the lowest MTU in the path, update the route PMTU
1644 	   to reflect the increase. In this case if the other nodes' MTU
1645 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
1646 	   PMTU discouvery.
1647 	 */
1648 	if (rt->rt6i_dev == arg->dev &&
1649 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1650             (dst_mtu(&rt->u.dst) > arg->mtu ||
1651              (dst_mtu(&rt->u.dst) < arg->mtu &&
1652 	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1653 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1654 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1655 	return 0;
1656 }
1657 
1658 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1659 {
1660 	struct rt6_mtu_change_arg arg;
1661 
1662 	arg.dev = dev;
1663 	arg.mtu = mtu;
1664 	read_lock_bh(&rt6_lock);
1665 	fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1666 	read_unlock_bh(&rt6_lock);
1667 }
1668 
1669 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1670 			      struct in6_rtmsg *rtmsg)
1671 {
1672 	memset(rtmsg, 0, sizeof(*rtmsg));
1673 
1674 	rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1675 	rtmsg->rtmsg_src_len = r->rtm_src_len;
1676 	rtmsg->rtmsg_flags = RTF_UP;
1677 	if (r->rtm_type == RTN_UNREACHABLE)
1678 		rtmsg->rtmsg_flags |= RTF_REJECT;
1679 
1680 	if (rta[RTA_GATEWAY-1]) {
1681 		if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1682 			return -EINVAL;
1683 		memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1684 		rtmsg->rtmsg_flags |= RTF_GATEWAY;
1685 	}
1686 	if (rta[RTA_DST-1]) {
1687 		if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1688 			return -EINVAL;
1689 		memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1690 	}
1691 	if (rta[RTA_SRC-1]) {
1692 		if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1693 			return -EINVAL;
1694 		memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1695 	}
1696 	if (rta[RTA_OIF-1]) {
1697 		if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1698 			return -EINVAL;
1699 		memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1700 	}
1701 	if (rta[RTA_PRIORITY-1]) {
1702 		if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1703 			return -EINVAL;
1704 		memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1705 	}
1706 	return 0;
1707 }
1708 
1709 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1710 {
1711 	struct rtmsg *r = NLMSG_DATA(nlh);
1712 	struct in6_rtmsg rtmsg;
1713 
1714 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1715 		return -EINVAL;
1716 	return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1717 }
1718 
1719 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1720 {
1721 	struct rtmsg *r = NLMSG_DATA(nlh);
1722 	struct in6_rtmsg rtmsg;
1723 
1724 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1725 		return -EINVAL;
1726 	return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1727 }
1728 
/* Per-call context that inet6_dump_fib() passes to rt6_dump_route(). */
struct rt6_rtnl_dump_arg
{
	struct sk_buff *skb;		/* skb the dump records are written into */
	struct netlink_callback *cb;	/* dump callback; supplies the request header and its skb */
};
1734 
/*
 *	Append one route message describing rt to skb.
 *
 *	skb:	message buffer being filled
 *	rt:	route to describe
 *	dst:	if non-NULL, reported as a /128 destination in place of the
 *		route's own prefix
 *	src:	same for the source (only with CONFIG_IPV6_SUBTREES)
 *	iif:	if non-zero, emitted as RTA_IIF; otherwise, when dst is
 *		given, a source address is looked up and emitted as
 *		RTA_PREFSRC
 *	type, pid, seq, flags: netlink header fields for the message
 *	prefix:	if non-zero, only routes with RTF_PREFIX_RT are emitted
 *
 *	Returns skb->len on success, 1 when the route was skipped because
 *	of the prefix filter, and -1 (with skb trimmed back to its previous
 *	tail) when the message could not be built.
 */
static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 pid, u32 seq,
			 int prefix, unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;	/* start of this message, for length and rollback */
	struct rta_cacheinfo ci;

	if (prefix) {	/* user wants prefix routes only */
		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	/*
	 * NOTE(review): NLMSG_NEW and RTA_PUT presumably jump to the
	 * nlmsg_failure / rtattr_failure labels below when skb has no
	 * room left -- nothing else in this function references them.
	 */
	nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
	rtm = NLMSG_DATA(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	rtm->rtm_table = RT_TABLE_MAIN;
	if (rt->rt6i_flags&RTF_REJECT)
		rtm->rtm_type = RTN_UNREACHABLE;
	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	/* The stored protocol is overridden by the first matching flag below. */
	rtm->rtm_protocol = rt->rt6i_protocol;
	if (rt->rt6i_flags&RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (rt->rt6i_flags & RTF_ADDRCONF)
		rtm->rtm_protocol = RTPROT_KERNEL;
	else if (rt->rt6i_flags&RTF_DEFAULT)
		rtm->rtm_protocol = RTPROT_RA;

	if (rt->rt6i_flags&RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dst) {
		RTA_PUT(skb, RTA_DST, 16, dst);
	        rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		RTA_PUT(skb, RTA_SRC, 16, src);
	        rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len)
		RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
#endif
	if (iif)
		RTA_PUT(skb, RTA_IIF, 4, &iif);
	else if (dst) {
		struct in6_addr saddr_buf;
		if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
			RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
	}
	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto rtattr_failure;
	if (rt->u.dst.neighbour)
		RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
	if (rt->u.dst.dev)
		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
	RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
	/* Cache statistics; the times are relative to the current jiffies. */
	ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
	if (rt->rt6i_expires)
		ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
	else
		ci.rta_expires = 0;
	ci.rta_used = rt->u.dst.__use;
	ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
	ci.rta_error = rt->u.dst.error;
	ci.rta_id = 0;
	ci.rta_ts = 0;
	ci.rta_tsage = 0;
	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
	/* Final length covers the header plus every attribute added above. */
	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	/* Drop the partially built message again. */
	skb_trim(skb, b - skb->data);
	return -1;
}
1824 
1825 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1826 {
1827 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1828 	int prefix;
1829 
1830 	if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1831 		struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1832 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1833 	} else
1834 		prefix = 0;
1835 
1836 	return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1837 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1838 		     prefix, NLM_F_MULTI);
1839 }
1840 
1841 static int fib6_dump_node(struct fib6_walker_t *w)
1842 {
1843 	int res;
1844 	struct rt6_info *rt;
1845 
1846 	for (rt = w->leaf; rt; rt = rt->u.next) {
1847 		res = rt6_dump_route(rt, w->args);
1848 		if (res < 0) {
1849 			/* Frame is full, suspend walking */
1850 			w->leaf = rt;
1851 			return 1;
1852 		}
1853 		BUG_TRAP(res!=0);
1854 	}
1855 	w->leaf = NULL;
1856 	return 0;
1857 }
1858 
1859 static void fib6_dump_end(struct netlink_callback *cb)
1860 {
1861 	struct fib6_walker_t *w = (void*)cb->args[0];
1862 
1863 	if (w) {
1864 		cb->args[0] = 0;
1865 		fib6_walker_unlink(w);
1866 		kfree(w);
1867 	}
1868 	cb->done = (void*)cb->args[1];
1869 	cb->args[1] = 0;
1870 }
1871 
1872 static int fib6_dump_done(struct netlink_callback *cb)
1873 {
1874 	fib6_dump_end(cb);
1875 	return cb->done ? cb->done(cb) : 0;
1876 }
1877 
/*
 *	Netlink dump callback for the IPv6 routing table.
 *
 *	The first call (cb->args[0] == 0) allocates a tree walker, stores it
 *	in cb->args[0] and starts the walk; later calls resume it.  The
 *	caller's cb->done is parked in cb->args[1] while fib6_dump_done is
 *	installed in its place.
 *
 *	Returns a negative walk result unchanged, otherwise skb->len.
 *	Returns -ENOMEM if the walker cannot be allocated.
 */
int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rt6_rtnl_dump_arg arg;
	struct fib6_walker_t *w;
	int res;

	arg.skb = skb;
	arg.cb = cb;

	w = (void*)cb->args[0];
	if (w == NULL) {
		/* New dump:
		 *
		 * 1. hook callback destructor.
		 */
		cb->args[1] = (long)cb->done;
		cb->done = fib6_dump_done;

		/*
		 * 2. allocate and initialize walker.
		 */
		w = kzalloc(sizeof(*w), GFP_ATOMIC);
		if (w == NULL)
			return -ENOMEM;
		RT6_TRACE("dump<%p", w);
		w->root = &ip6_routing_table;
		w->func = fib6_dump_node;
		w->args = &arg;
		cb->args[0] = (long)w;
		read_lock_bh(&rt6_lock);
		res = fib6_walk(w);
		read_unlock_bh(&rt6_lock);
	} else {
		/* arg lives on this call's stack, so re-point the walker at it. */
		w->args = &arg;
		read_lock_bh(&rt6_lock);
		res = fib6_walk_continue(w);
		read_unlock_bh(&rt6_lock);
	}
#if RT6_DEBUG >= 3
	if (res <= 0 && skb->len == 0)
		RT6_TRACE("%p>dump end\n", w);
#endif
	res = res < 0 ? res : skb->len;
	/* res < 0 is an error. (really, impossible)
	   res == 0 means that dump is complete, but skb still can contain data.
	   res > 0 dump is not complete, but frame is full.
	 */
	/* Destroy walker, if dump of this table is complete. */
	if (res <= 0)
		fib6_dump_end(cb);
	return res;
}
1930 
1931 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1932 {
1933 	struct rtattr **rta = arg;
1934 	int iif = 0;
1935 	int err = -ENOBUFS;
1936 	struct sk_buff *skb;
1937 	struct flowi fl;
1938 	struct rt6_info *rt;
1939 
1940 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1941 	if (skb == NULL)
1942 		goto out;
1943 
1944 	/* Reserve room for dummy headers, this skb can pass
1945 	   through good chunk of routing engine.
1946 	 */
1947 	skb->mac.raw = skb->data;
1948 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1949 
1950 	memset(&fl, 0, sizeof(fl));
1951 	if (rta[RTA_SRC-1])
1952 		ipv6_addr_copy(&fl.fl6_src,
1953 			       (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1954 	if (rta[RTA_DST-1])
1955 		ipv6_addr_copy(&fl.fl6_dst,
1956 			       (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1957 
1958 	if (rta[RTA_IIF-1])
1959 		memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1960 
1961 	if (iif) {
1962 		struct net_device *dev;
1963 		dev = __dev_get_by_index(iif);
1964 		if (!dev) {
1965 			err = -ENODEV;
1966 			goto out_free;
1967 		}
1968 	}
1969 
1970 	fl.oif = 0;
1971 	if (rta[RTA_OIF-1])
1972 		memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1973 
1974 	rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1975 
1976 	skb->dst = &rt->u.dst;
1977 
1978 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1979 	err = rt6_fill_node(skb, rt,
1980 			    &fl.fl6_dst, &fl.fl6_src,
1981 			    iif,
1982 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1983 			    nlh->nlmsg_seq, 0, 0);
1984 	if (err < 0) {
1985 		err = -EMSGSIZE;
1986 		goto out_free;
1987 	}
1988 
1989 	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1990 	if (err > 0)
1991 		err = 0;
1992 out:
1993 	return err;
1994 out_free:
1995 	kfree_skb(skb);
1996 	goto out;
1997 }
1998 
1999 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
2000 			struct netlink_skb_parms *req)
2001 {
2002 	struct sk_buff *skb;
2003 	int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
2004 	u32 pid = current->pid;
2005 	u32 seq = 0;
2006 
2007 	if (req)
2008 		pid = req->pid;
2009 	if (nlh)
2010 		seq = nlh->nlmsg_seq;
2011 
2012 	skb = alloc_skb(size, gfp_any());
2013 	if (!skb) {
2014 		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
2015 		return;
2016 	}
2017 	if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
2018 		kfree_skb(skb);
2019 		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
2020 		return;
2021 	}
2022 	NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2023 	netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2024 }
2025 
2026 /*
2027  *	/proc
2028  */
2029 
2030 #ifdef CONFIG_PROC_FS
2031 
/*
 * Size in bytes assumed for one formatted route line; used to convert a
 * read offset into a number of lines to skip.
 * NOTE(review): the terms appear to mirror the fields written by
 * rt6_info_route() -- confirm before changing either.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg
{
	char *buffer;	/* output buffer supplied by the reader */
	int offset;	/* offset requested by the reader */
	int length;	/* number of bytes requested */
	int skip;	/* routes skipped so far to reach offset */
	int len;	/* bytes written to buffer so far */
};
2042 
2043 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2044 {
2045 	struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2046 	int i;
2047 
2048 	if (arg->skip < arg->offset / RT6_INFO_LEN) {
2049 		arg->skip++;
2050 		return 0;
2051 	}
2052 
2053 	if (arg->len >= arg->length)
2054 		return 0;
2055 
2056 	for (i=0; i<16; i++) {
2057 		sprintf(arg->buffer + arg->len, "%02x",
2058 			rt->rt6i_dst.addr.s6_addr[i]);
2059 		arg->len += 2;
2060 	}
2061 	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2062 			    rt->rt6i_dst.plen);
2063 
2064 #ifdef CONFIG_IPV6_SUBTREES
2065 	for (i=0; i<16; i++) {
2066 		sprintf(arg->buffer + arg->len, "%02x",
2067 			rt->rt6i_src.addr.s6_addr[i]);
2068 		arg->len += 2;
2069 	}
2070 	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2071 			    rt->rt6i_src.plen);
2072 #else
2073 	sprintf(arg->buffer + arg->len,
2074 		"00000000000000000000000000000000 00 ");
2075 	arg->len += 36;
2076 #endif
2077 
2078 	if (rt->rt6i_nexthop) {
2079 		for (i=0; i<16; i++) {
2080 			sprintf(arg->buffer + arg->len, "%02x",
2081 				rt->rt6i_nexthop->primary_key[i]);
2082 			arg->len += 2;
2083 		}
2084 	} else {
2085 		sprintf(arg->buffer + arg->len,
2086 			"00000000000000000000000000000000");
2087 		arg->len += 32;
2088 	}
2089 	arg->len += sprintf(arg->buffer + arg->len,
2090 			    " %08x %08x %08x %08x %8s\n",
2091 			    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2092 			    rt->u.dst.__use, rt->rt6i_flags,
2093 			    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2094 	return 0;
2095 }
2096 
2097 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2098 {
2099 	struct rt6_proc_arg arg;
2100 	arg.buffer = buffer;
2101 	arg.offset = offset;
2102 	arg.length = length;
2103 	arg.skip = 0;
2104 	arg.len = 0;
2105 
2106 	read_lock_bh(&rt6_lock);
2107 	fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2108 	read_unlock_bh(&rt6_lock);
2109 
2110 	*start = buffer;
2111 	if (offset)
2112 		*start += offset % RT6_INFO_LEN;
2113 
2114 	arg.len -= offset % RT6_INFO_LEN;
2115 
2116 	if (arg.len > length)
2117 		arg.len = length;
2118 	if (arg.len < 0)
2119 		arg.len = 0;
2120 
2121 	return arg.len;
2122 }
2123 
/*
 * seq_file show handler for the routing statistics: prints seven
 * hexadecimal counters on one line (fib nodes, route nodes, route
 * allocations, route entries, cached routes, current dst entries and
 * discarded routes).  Always returns 0.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		      rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
		      rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
		      rt6_stats.fib_rt_cache,
		      atomic_read(&ip6_dst_ops.entries),
		      rt6_stats.fib_discarded_routes);

	return 0;
}
2135 
/* open handler: bind the file to rt6_stats_seq_show() as a single-record seq_file. */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt6_stats_seq_show, NULL);
}
2140 
/* File operations for the "rt6_stats" proc entry registered in ip6_route_init(). */
static struct file_operations rt6_stats_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,	/* pairs with single_open() in the open handler */
};
2148 #endif	/* CONFIG_PROC_FS */
2149 
2150 #ifdef CONFIG_SYSCTL
2151 
2152 static int flush_delay;
2153 
2154 static
2155 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2156 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2157 {
2158 	if (write) {
2159 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2160 		fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2161 		return 0;
2162 	} else
2163 		return -EINVAL;
2164 }
2165 
2166 ctl_table ipv6_route_table[] = {
2167         {
2168 		.ctl_name	=	NET_IPV6_ROUTE_FLUSH,
2169 		.procname	=	"flush",
2170          	.data		=	&flush_delay,
2171 		.maxlen		=	sizeof(int),
2172 		.mode		=	0200,
2173          	.proc_handler	=	&ipv6_sysctl_rtcache_flush
2174 	},
2175 	{
2176 		.ctl_name	=	NET_IPV6_ROUTE_GC_THRESH,
2177 		.procname	=	"gc_thresh",
2178          	.data		=	&ip6_dst_ops.gc_thresh,
2179 		.maxlen		=	sizeof(int),
2180 		.mode		=	0644,
2181          	.proc_handler	=	&proc_dointvec,
2182 	},
2183 	{
2184 		.ctl_name	=	NET_IPV6_ROUTE_MAX_SIZE,
2185 		.procname	=	"max_size",
2186          	.data		=	&ip6_rt_max_size,
2187 		.maxlen		=	sizeof(int),
2188 		.mode		=	0644,
2189          	.proc_handler	=	&proc_dointvec,
2190 	},
2191 	{
2192 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2193 		.procname	=	"gc_min_interval",
2194          	.data		=	&ip6_rt_gc_min_interval,
2195 		.maxlen		=	sizeof(int),
2196 		.mode		=	0644,
2197          	.proc_handler	=	&proc_dointvec_jiffies,
2198 		.strategy	=	&sysctl_jiffies,
2199 	},
2200 	{
2201 		.ctl_name	=	NET_IPV6_ROUTE_GC_TIMEOUT,
2202 		.procname	=	"gc_timeout",
2203          	.data		=	&ip6_rt_gc_timeout,
2204 		.maxlen		=	sizeof(int),
2205 		.mode		=	0644,
2206          	.proc_handler	=	&proc_dointvec_jiffies,
2207 		.strategy	=	&sysctl_jiffies,
2208 	},
2209 	{
2210 		.ctl_name	=	NET_IPV6_ROUTE_GC_INTERVAL,
2211 		.procname	=	"gc_interval",
2212          	.data		=	&ip6_rt_gc_interval,
2213 		.maxlen		=	sizeof(int),
2214 		.mode		=	0644,
2215          	.proc_handler	=	&proc_dointvec_jiffies,
2216 		.strategy	=	&sysctl_jiffies,
2217 	},
2218 	{
2219 		.ctl_name	=	NET_IPV6_ROUTE_GC_ELASTICITY,
2220 		.procname	=	"gc_elasticity",
2221          	.data		=	&ip6_rt_gc_elasticity,
2222 		.maxlen		=	sizeof(int),
2223 		.mode		=	0644,
2224          	.proc_handler	=	&proc_dointvec_jiffies,
2225 		.strategy	=	&sysctl_jiffies,
2226 	},
2227 	{
2228 		.ctl_name	=	NET_IPV6_ROUTE_MTU_EXPIRES,
2229 		.procname	=	"mtu_expires",
2230          	.data		=	&ip6_rt_mtu_expires,
2231 		.maxlen		=	sizeof(int),
2232 		.mode		=	0644,
2233          	.proc_handler	=	&proc_dointvec_jiffies,
2234 		.strategy	=	&sysctl_jiffies,
2235 	},
2236 	{
2237 		.ctl_name	=	NET_IPV6_ROUTE_MIN_ADVMSS,
2238 		.procname	=	"min_adv_mss",
2239          	.data		=	&ip6_rt_min_advmss,
2240 		.maxlen		=	sizeof(int),
2241 		.mode		=	0644,
2242          	.proc_handler	=	&proc_dointvec_jiffies,
2243 		.strategy	=	&sysctl_jiffies,
2244 	},
2245 	{
2246 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2247 		.procname	=	"gc_min_interval_ms",
2248          	.data		=	&ip6_rt_gc_min_interval,
2249 		.maxlen		=	sizeof(int),
2250 		.mode		=	0644,
2251          	.proc_handler	=	&proc_dointvec_ms_jiffies,
2252 		.strategy	=	&sysctl_ms_jiffies,
2253 	},
2254 	{ .ctl_name = 0 }
2255 };
2256 
2257 #endif
2258 
/*
 * One-time initialisation of the IPv6 routing layer: create the slab
 * cache for struct rt6_info (panics if that fails), initialise the FIB,
 * register the "ipv6_route" and "rt6_stats" proc entries when procfs is
 * configured, and initialise the IPv6 side of xfrm when that is configured.
 */
void __init ip6_route_init(void)
{
	struct proc_dir_entry *p;

	ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
						     sizeof(struct rt6_info),
						     0, SLAB_HWCACHE_ALIGN,
						     NULL, NULL);
	if (!ip6_dst_ops.kmem_cachep)
		panic("cannot create ip6_dst_cache");

	fib6_init();
#ifdef 	CONFIG_PROC_FS
	/* A failed proc registration is tolerated: p is only used if non-NULL. */
	p = proc_net_create("ipv6_route", 0, rt6_proc_info);
	if (p)
		p->owner = THIS_MODULE;

	proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
#endif
#ifdef CONFIG_XFRM
	xfrm6_init();
#endif
}
2282 
/*
 * Tear down what ip6_route_init() set up: remove the proc entries, shut
 * down the IPv6 side of xfrm, run rt6_ifdown() with a NULL device over
 * the routing table, stop FIB garbage collection and destroy the slab
 * cache.
 */
void ip6_route_cleanup(void)
{
#ifdef CONFIG_PROC_FS
	proc_net_remove("ipv6_route");
	proc_net_remove("rt6_stats");
#endif
#ifdef CONFIG_XFRM
	xfrm6_fini();
#endif
	/* NULL matches every device in fib6_ifdown(). */
	rt6_ifdown(NULL);
	fib6_gc_cleanup();
	kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
}
2296