xref: /linux/net/ipv6/route.c (revision 6e8331ac6973435b1e7604c30f2ad394035b46e1)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15 
16 /*	Changes:
17  *
18  *	YOSHIFUJI Hideaki @USAGI
19  *		reworked default router selection.
20  *		- respect outgoing interface
21  *		- select from (probably) reachable routers (i.e.
22  *		routers in REACHABLE, STALE, DELAY or PROBE states).
23  *		- always select the same router if it is (probably)
24  *		reachable.  otherwise, round-robin the list.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/init.h>
38 #include <linux/netlink.h>
39 #include <linux/if_arp.h>
40 
41 #ifdef 	CONFIG_PROC_FS
42 #include <linux/proc_fs.h>
43 #include <linux/seq_file.h>
44 #endif
45 
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
/* Debug verbosity; raise to 3 (or more) to enable the tracing macros below. */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
/* Tracing disabled: both helpers expand to nothing. */
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/* Compile-time switch tested by ip6_route_input()/ip6_route_output(). */
#define CLONE_OFFLINK_ROUTE 0

/* Flag bits carried in the "strict" argument of rt6_select(). */
#define RT6_SELECT_F_IFACE	0x1
#define RT6_SELECT_F_REACHABLE	0x2
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
85 static int ip6_rt_mtu_expires = 10*60*HZ;
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87 
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void		ip6_dst_destroy(struct dst_entry *);
92 static void		ip6_dst_ifdown(struct dst_entry *,
93 				       struct net_device *dev, int how);
94 static int		 ip6_dst_gc(void);
95 
96 static int		ip6_pkt_discard(struct sk_buff *skb);
97 static int		ip6_pkt_discard_out(struct sk_buff *skb);
98 static void		ip6_link_failure(struct sk_buff *skb);
99 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100 
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103 					   struct in6_addr *gwaddr, int ifindex,
104 					   unsigned pref);
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106 					   struct in6_addr *gwaddr, int ifindex);
107 #endif
108 
109 static struct dst_ops ip6_dst_ops = {
110 	.family			=	AF_INET6,
111 	.protocol		=	__constant_htons(ETH_P_IPV6),
112 	.gc			=	ip6_dst_gc,
113 	.gc_thresh		=	1024,
114 	.check			=	ip6_dst_check,
115 	.destroy		=	ip6_dst_destroy,
116 	.ifdown			=	ip6_dst_ifdown,
117 	.negative_advice	=	ip6_negative_advice,
118 	.link_failure		=	ip6_link_failure,
119 	.update_pmtu		=	ip6_rt_update_pmtu,
120 	.entry_size		=	sizeof(struct rt6_info),
121 };
122 
123 struct rt6_info ip6_null_entry = {
124 	.u = {
125 		.dst = {
126 			.__refcnt	= ATOMIC_INIT(1),
127 			.__use		= 1,
128 			.dev		= &loopback_dev,
129 			.obsolete	= -1,
130 			.error		= -ENETUNREACH,
131 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
132 			.input		= ip6_pkt_discard,
133 			.output		= ip6_pkt_discard_out,
134 			.ops		= &ip6_dst_ops,
135 			.path		= (struct dst_entry*)&ip6_null_entry,
136 		}
137 	},
138 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
139 	.rt6i_metric	= ~(u32) 0,
140 	.rt6i_ref	= ATOMIC_INIT(1),
141 };
142 
143 struct fib6_node ip6_routing_table = {
144 	.leaf		= &ip6_null_entry,
145 	.fn_flags	= RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
146 };
147 
148 /* Protects all the ip6 fib */
149 
150 DEFINE_RWLOCK(rt6_lock);
151 
152 
153 /* allocate dst with ip6_dst_ops */
154 static __inline__ struct rt6_info *ip6_dst_alloc(void)
155 {
156 	return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
157 }
158 
159 static void ip6_dst_destroy(struct dst_entry *dst)
160 {
161 	struct rt6_info *rt = (struct rt6_info *)dst;
162 	struct inet6_dev *idev = rt->rt6i_idev;
163 
164 	if (idev != NULL) {
165 		rt->rt6i_idev = NULL;
166 		in6_dev_put(idev);
167 	}
168 }
169 
170 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
171 			   int how)
172 {
173 	struct rt6_info *rt = (struct rt6_info *)dst;
174 	struct inet6_dev *idev = rt->rt6i_idev;
175 
176 	if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
177 		struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
178 		if (loopback_idev != NULL) {
179 			rt->rt6i_idev = loopback_idev;
180 			in6_dev_put(idev);
181 		}
182 	}
183 }
184 
185 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
186 {
187 	return (rt->rt6i_flags & RTF_EXPIRES &&
188 		time_after(jiffies, rt->rt6i_expires));
189 }
190 
191 /*
192  *	Route lookup. Any rt6_lock is implied.
193  */
194 
195 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
196 						    int oif,
197 						    int strict)
198 {
199 	struct rt6_info *local = NULL;
200 	struct rt6_info *sprt;
201 
202 	if (oif) {
203 		for (sprt = rt; sprt; sprt = sprt->u.next) {
204 			struct net_device *dev = sprt->rt6i_dev;
205 			if (dev->ifindex == oif)
206 				return sprt;
207 			if (dev->flags & IFF_LOOPBACK) {
208 				if (sprt->rt6i_idev == NULL ||
209 				    sprt->rt6i_idev->dev->ifindex != oif) {
210 					if (strict && oif)
211 						continue;
212 					if (local && (!oif ||
213 						      local->rt6i_idev->dev->ifindex == oif))
214 						continue;
215 				}
216 				local = sprt;
217 			}
218 		}
219 
220 		if (local)
221 			return local;
222 
223 		if (strict)
224 			return &ip6_null_entry;
225 	}
226 	return rt;
227 }
228 
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the next hop of @rt is not in a NUD_VALID
 * state and the per-interface probe interval has elapsed since the neighbour
 * entry was last updated, send a Neighbour Solicitation to the solicited-node
 * multicast address of the next hop.
 *
 * @rt may be NULL, in which case nothing is done.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	int do_probe = 0;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	/*
	 * neigh->updated is modified below, so the write side of the lock is
	 * required.  This also makes the "interval elapsed?" test and the
	 * timestamp update atomic, so concurrent callers cannot both decide
	 * to send a probe.
	 */
	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		neigh->updated = jiffies;
		do_probe = 1;
	}
	write_unlock_bh(&neigh->lock);

	if (do_probe) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		/* The solicitation is sent after the lock has been dropped. */
		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	}
}
#else
/* Router preference support disabled: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
	return;
}
#endif
264 
265 /*
266  * Default Router Selection (RFC 2461 6.3.6)
267  */
268 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
269 {
270 	struct net_device *dev = rt->rt6i_dev;
271 	if (!oif || dev->ifindex == oif)
272 		return 2;
273 	if ((dev->flags & IFF_LOOPBACK) &&
274 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
275 		return 1;
276 	return 0;
277 }
278 
279 static int inline rt6_check_neigh(struct rt6_info *rt)
280 {
281 	struct neighbour *neigh = rt->rt6i_nexthop;
282 	int m = 0;
283 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
284 	    !(rt->rt6i_flags & RTF_GATEWAY))
285 		m = 1;
286 	else if (neigh) {
287 		read_lock_bh(&neigh->lock);
288 		if (neigh->nud_state & NUD_VALID)
289 			m = 2;
290 		read_unlock_bh(&neigh->lock);
291 	}
292 	return m;
293 }
294 
295 static int rt6_score_route(struct rt6_info *rt, int oif,
296 			   int strict)
297 {
298 	int m, n;
299 
300 	m = rt6_check_dev(rt, oif);
301 	if (!m && (strict & RT6_SELECT_F_IFACE))
302 		return -1;
303 #ifdef CONFIG_IPV6_ROUTER_PREF
304 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
305 #endif
306 	n = rt6_check_neigh(rt);
307 	if (n > 1)
308 		m |= 16;
309 	else if (!n && strict & RT6_SELECT_F_REACHABLE)
310 		return -1;
311 	return m;
312 }
313 
314 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
315 				   int strict)
316 {
317 	struct rt6_info *match = NULL, *last = NULL;
318 	struct rt6_info *rt, *rt0 = *head;
319 	u32 metric;
320 	int mpri = -1;
321 
322 	RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
323 		  __FUNCTION__, head, head ? *head : NULL, oif);
324 
325 	for (rt = rt0, metric = rt0->rt6i_metric;
326 	     rt && rt->rt6i_metric == metric && (!last || rt != rt0);
327 	     rt = rt->u.next) {
328 		int m;
329 
330 		if (rt6_check_expired(rt))
331 			continue;
332 
333 		last = rt;
334 
335 		m = rt6_score_route(rt, oif, strict);
336 		if (m < 0)
337 			continue;
338 
339 		if (m > mpri) {
340 			rt6_probe(match);
341 			match = rt;
342 			mpri = m;
343 		} else {
344 			rt6_probe(rt);
345 		}
346 	}
347 
348 	if (!match &&
349 	    (strict & RT6_SELECT_F_REACHABLE) &&
350 	    last && last != rt0) {
351 		/* no entries matched; do round-robin */
352 		static DEFINE_SPINLOCK(lock);
353 		spin_lock(&lock);
354 		*head = rt0->u.next;
355 		rt0->u.next = last->u.next;
356 		last->u.next = rt0;
357 		spin_unlock(&lock);
358 	}
359 
360 	RT6_TRACE("%s() => %p, score=%d\n",
361 		  __FUNCTION__, match, mpri);
362 
363 	return (match ? match : &ip6_null_entry);
364 }
365 
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Handle a Route Information option received from a router.
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option (interpreted as struct route_info)
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router, used as the gateway
 *
 * A lifetime of zero removes a matching route; otherwise the route is
 * created or has its preference and expiry refreshed.  A lifetime of
 * 0xffffffff means "never expires".
 *
 * Returns 0 on success or -EINVAL when the option is malformed or carries
 * the reserved preference value.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	/*
	 * RFC 4191, section 2.3: an option carrying the reserved preference
	 * value MUST be ignored (it must not be treated as "medium").
	 */
	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	/* The lifetime arrives in network byte order. */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* Zero lifetime: withdraw an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt, NULL, NULL, NULL);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
443 
444 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
445 			    int oif, int strict)
446 {
447 	struct fib6_node *fn;
448 	struct rt6_info *rt;
449 
450 	read_lock_bh(&rt6_lock);
451 	fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
452 	rt = rt6_device_match(fn->leaf, oif, strict);
453 	dst_hold(&rt->u.dst);
454 	rt->u.dst.__use++;
455 	read_unlock_bh(&rt6_lock);
456 
457 	rt->u.dst.lastuse = jiffies;
458 	if (rt->u.dst.error == 0)
459 		return rt;
460 	dst_release(&rt->u.dst);
461 	return NULL;
462 }
463 
464 /* ip6_ins_rt is called with FREE rt6_lock.
465    It takes new route entry, the addition fails by any reason the
466    route is freed. In any case, if caller does not hold it, it may
467    be destroyed.
468  */
469 
470 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
471 		void *_rtattr, struct netlink_skb_parms *req)
472 {
473 	int err;
474 
475 	write_lock_bh(&rt6_lock);
476 	err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
477 	write_unlock_bh(&rt6_lock);
478 
479 	return err;
480 }
481 
482 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
483 				      struct in6_addr *saddr)
484 {
485 	struct rt6_info *rt;
486 
487 	/*
488 	 *	Clone the route.
489 	 */
490 
491 	rt = ip6_rt_copy(ort);
492 
493 	if (rt) {
494 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
495 			if (rt->rt6i_dst.plen != 128 &&
496 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
497 				rt->rt6i_flags |= RTF_ANYCAST;
498 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
499 		}
500 
501 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
502 		rt->rt6i_dst.plen = 128;
503 		rt->rt6i_flags |= RTF_CACHE;
504 		rt->u.dst.flags |= DST_HOST;
505 
506 #ifdef CONFIG_IPV6_SUBTREES
507 		if (rt->rt6i_src.plen && saddr) {
508 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
509 			rt->rt6i_src.plen = 128;
510 		}
511 #endif
512 
513 		rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
514 
515 	}
516 
517 	return rt;
518 }
519 
520 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
521 {
522 	struct rt6_info *rt = ip6_rt_copy(ort);
523 	if (rt) {
524 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
525 		rt->rt6i_dst.plen = 128;
526 		rt->rt6i_flags |= RTF_CACHE;
527 		if (rt->rt6i_flags & RTF_REJECT)
528 			rt->u.dst.error = ort->u.dst.error;
529 		rt->u.dst.flags |= DST_HOST;
530 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
531 	}
532 	return rt;
533 }
534 
/*
 * If the selection produced the null entry, climb towards the root of the
 * FIB.  Reaching a root node jumps to "out"; the first ancestor carrying
 * route information jumps to "restart".
 *
 * Relies on the caller's local variables "rt" and "fn" and on the labels
 * "out" and "restart".  Wrapped in do { } while (0) so the macro behaves
 * as a single statement (safe inside un-braced if/else).
 */
#define BACKTRACK() \
do { \
	if (rt == &ip6_null_entry) { \
		while ((fn = fn->parent) != NULL) { \
			if (fn->fn_flags & RTN_ROOT) { \
				goto out; \
			} \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
545 
546 
547 void ip6_route_input(struct sk_buff *skb)
548 {
549 	struct fib6_node *fn;
550 	struct rt6_info *rt, *nrt;
551 	int strict;
552 	int attempts = 3;
553 	int err;
554 	int reachable = RT6_SELECT_F_REACHABLE;
555 
556 	strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
557 
558 relookup:
559 	read_lock_bh(&rt6_lock);
560 
561 restart_2:
562 	fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
563 			 &skb->nh.ipv6h->saddr);
564 
565 restart:
566 	rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
567 	BACKTRACK();
568 	if (rt == &ip6_null_entry ||
569 	    rt->rt6i_flags & RTF_CACHE)
570 		goto out;
571 
572 	dst_hold(&rt->u.dst);
573 	read_unlock_bh(&rt6_lock);
574 
575 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
576 		nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
577 	else {
578 #if CLONE_OFFLINK_ROUTE
579 		nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
580 #else
581 		goto out2;
582 #endif
583 	}
584 
585 	dst_release(&rt->u.dst);
586 	rt = nrt ? : &ip6_null_entry;
587 
588 	dst_hold(&rt->u.dst);
589 	if (nrt) {
590 		err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
591 		if (!err)
592 			goto out2;
593 	}
594 
595 	if (--attempts <= 0)
596 		goto out2;
597 
598 	/*
599 	 * Race condition! In the gap, when rt6_lock was
600 	 * released someone could insert this route.  Relookup.
601 	 */
602 	dst_release(&rt->u.dst);
603 	goto relookup;
604 
605 out:
606 	if (reachable) {
607 		reachable = 0;
608 		goto restart_2;
609 	}
610 	dst_hold(&rt->u.dst);
611 	read_unlock_bh(&rt6_lock);
612 out2:
613 	rt->u.dst.lastuse = jiffies;
614 	rt->u.dst.__use++;
615 	skb->dst = (struct dst_entry *) rt;
616 	return;
617 }
618 
619 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
620 {
621 	struct fib6_node *fn;
622 	struct rt6_info *rt, *nrt;
623 	int strict;
624 	int attempts = 3;
625 	int err;
626 	int reachable = RT6_SELECT_F_REACHABLE;
627 
628 	strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
629 
630 relookup:
631 	read_lock_bh(&rt6_lock);
632 
633 restart_2:
634 	fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
635 
636 restart:
637 	rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
638 	BACKTRACK();
639 	if (rt == &ip6_null_entry ||
640 	    rt->rt6i_flags & RTF_CACHE)
641 		goto out;
642 
643 	dst_hold(&rt->u.dst);
644 	read_unlock_bh(&rt6_lock);
645 
646 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
647 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
648 	else {
649 #if CLONE_OFFLINK_ROUTE
650 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
651 #else
652 		goto out2;
653 #endif
654 	}
655 
656 	dst_release(&rt->u.dst);
657 	rt = nrt ? : &ip6_null_entry;
658 
659 	dst_hold(&rt->u.dst);
660 	if (nrt) {
661 		err = ip6_ins_rt(nrt, NULL, NULL, NULL);
662 		if (!err)
663 			goto out2;
664 	}
665 
666 	if (--attempts <= 0)
667 		goto out2;
668 
669 	/*
670 	 * Race condition! In the gap, when rt6_lock was
671 	 * released someone could insert this route.  Relookup.
672 	 */
673 	dst_release(&rt->u.dst);
674 	goto relookup;
675 
676 out:
677 	if (reachable) {
678 		reachable = 0;
679 		goto restart_2;
680 	}
681 	dst_hold(&rt->u.dst);
682 	read_unlock_bh(&rt6_lock);
683 out2:
684 	rt->u.dst.lastuse = jiffies;
685 	rt->u.dst.__use++;
686 	return &rt->u.dst;
687 }
688 
689 
690 /*
691  *	Destination cache support functions
692  */
693 
694 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
695 {
696 	struct rt6_info *rt;
697 
698 	rt = (struct rt6_info *) dst;
699 
700 	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
701 		return dst;
702 
703 	return NULL;
704 }
705 
706 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
707 {
708 	struct rt6_info *rt = (struct rt6_info *) dst;
709 
710 	if (rt) {
711 		if (rt->rt6i_flags & RTF_CACHE)
712 			ip6_del_rt(rt, NULL, NULL, NULL);
713 		else
714 			dst_release(dst);
715 	}
716 	return NULL;
717 }
718 
719 static void ip6_link_failure(struct sk_buff *skb)
720 {
721 	struct rt6_info *rt;
722 
723 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
724 
725 	rt = (struct rt6_info *) skb->dst;
726 	if (rt) {
727 		if (rt->rt6i_flags&RTF_CACHE) {
728 			dst_set_expires(&rt->u.dst, 0);
729 			rt->rt6i_flags |= RTF_EXPIRES;
730 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
731 			rt->rt6i_node->fn_sernum = -1;
732 	}
733 }
734 
735 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
736 {
737 	struct rt6_info *rt6 = (struct rt6_info*)dst;
738 
739 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
740 		rt6->rt6i_flags |= RTF_MODIFIED;
741 		if (mtu < IPV6_MIN_MTU) {
742 			mtu = IPV6_MIN_MTU;
743 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
744 		}
745 		dst->metrics[RTAX_MTU-1] = mtu;
746 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
747 	}
748 }
749 
/* Chain of dst entries built by ndisc_dst_alloc(); protected by rt6_lock. */
static struct dst_entry *ndisc_dst_gc_list;

/* Forward declaration; the definition follows further down. */
static int ipv6_get_mtu(struct net_device *dev);
753 
754 static inline unsigned int ipv6_advmss(unsigned int mtu)
755 {
756 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
757 
758 	if (mtu < ip6_rt_min_advmss)
759 		mtu = ip6_rt_min_advmss;
760 
761 	/*
762 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
763 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
764 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
765 	 * rely only on pmtu discovery"
766 	 */
767 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
768 		mtu = IPV6_MAXPLEN;
769 	return mtu;
770 }
771 
772 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
773 				  struct neighbour *neigh,
774 				  struct in6_addr *addr,
775 				  int (*output)(struct sk_buff *))
776 {
777 	struct rt6_info *rt;
778 	struct inet6_dev *idev = in6_dev_get(dev);
779 
780 	if (unlikely(idev == NULL))
781 		return NULL;
782 
783 	rt = ip6_dst_alloc();
784 	if (unlikely(rt == NULL)) {
785 		in6_dev_put(idev);
786 		goto out;
787 	}
788 
789 	dev_hold(dev);
790 	if (neigh)
791 		neigh_hold(neigh);
792 	else
793 		neigh = ndisc_get_neigh(dev, addr);
794 
795 	rt->rt6i_dev	  = dev;
796 	rt->rt6i_idev     = idev;
797 	rt->rt6i_nexthop  = neigh;
798 	atomic_set(&rt->u.dst.__refcnt, 1);
799 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
800 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
801 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
802 	rt->u.dst.output  = output;
803 
804 #if 0	/* there's no chance to use these for ndisc */
805 	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
806 				? DST_HOST
807 				: 0;
808 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
809 	rt->rt6i_dst.plen = 128;
810 #endif
811 
812 	write_lock_bh(&rt6_lock);
813 	rt->u.dst.next = ndisc_dst_gc_list;
814 	ndisc_dst_gc_list = &rt->u.dst;
815 	write_unlock_bh(&rt6_lock);
816 
817 	fib6_force_start_gc();
818 
819 out:
820 	return (struct dst_entry *)rt;
821 }
822 
823 int ndisc_dst_gc(int *more)
824 {
825 	struct dst_entry *dst, *next, **pprev;
826 	int freed;
827 
828 	next = NULL;
829 	pprev = &ndisc_dst_gc_list;
830 	freed = 0;
831 	while ((dst = *pprev) != NULL) {
832 		if (!atomic_read(&dst->__refcnt)) {
833 			*pprev = dst->next;
834 			dst_free(dst);
835 			freed++;
836 		} else {
837 			pprev = &dst->next;
838 			(*more)++;
839 		}
840 	}
841 
842 	return freed;
843 }
844 
845 static int ip6_dst_gc(void)
846 {
847 	static unsigned expire = 30*HZ;
848 	static unsigned long last_gc;
849 	unsigned long now = jiffies;
850 
851 	if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
852 	    atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
853 		goto out;
854 
855 	expire++;
856 	fib6_run_gc(expire);
857 	last_gc = now;
858 	if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
859 		expire = ip6_rt_gc_timeout>>1;
860 
861 out:
862 	expire -= expire>>ip6_rt_gc_elasticity;
863 	return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
864 }
865 
866 /* Clean host part of a prefix. Not necessary in radix tree,
867    but results in cleaner routing tables.
868 
869    Remove it only when all the things will work!
870  */
871 
872 static int ipv6_get_mtu(struct net_device *dev)
873 {
874 	int mtu = IPV6_MIN_MTU;
875 	struct inet6_dev *idev;
876 
877 	idev = in6_dev_get(dev);
878 	if (idev) {
879 		mtu = idev->cnf.mtu6;
880 		in6_dev_put(idev);
881 	}
882 	return mtu;
883 }
884 
885 int ipv6_get_hoplimit(struct net_device *dev)
886 {
887 	int hoplimit = ipv6_devconf.hop_limit;
888 	struct inet6_dev *idev;
889 
890 	idev = in6_dev_get(dev);
891 	if (idev) {
892 		hoplimit = idev->cnf.hop_limit;
893 		in6_dev_put(idev);
894 	}
895 	return hoplimit;
896 }
897 
898 /*
899  *
900  */
901 
902 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
903 		void *_rtattr, struct netlink_skb_parms *req)
904 {
905 	int err;
906 	struct rtmsg *r;
907 	struct rtattr **rta;
908 	struct rt6_info *rt = NULL;
909 	struct net_device *dev = NULL;
910 	struct inet6_dev *idev = NULL;
911 	int addr_type;
912 
913 	rta = (struct rtattr **) _rtattr;
914 
915 	if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
916 		return -EINVAL;
917 #ifndef CONFIG_IPV6_SUBTREES
918 	if (rtmsg->rtmsg_src_len)
919 		return -EINVAL;
920 #endif
921 	if (rtmsg->rtmsg_ifindex) {
922 		err = -ENODEV;
923 		dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
924 		if (!dev)
925 			goto out;
926 		idev = in6_dev_get(dev);
927 		if (!idev)
928 			goto out;
929 	}
930 
931 	if (rtmsg->rtmsg_metric == 0)
932 		rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
933 
934 	rt = ip6_dst_alloc();
935 
936 	if (rt == NULL) {
937 		err = -ENOMEM;
938 		goto out;
939 	}
940 
941 	rt->u.dst.obsolete = -1;
942 	rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
943 	if (nlh && (r = NLMSG_DATA(nlh))) {
944 		rt->rt6i_protocol = r->rtm_protocol;
945 	} else {
946 		rt->rt6i_protocol = RTPROT_BOOT;
947 	}
948 
949 	addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
950 
951 	if (addr_type & IPV6_ADDR_MULTICAST)
952 		rt->u.dst.input = ip6_mc_input;
953 	else
954 		rt->u.dst.input = ip6_forward;
955 
956 	rt->u.dst.output = ip6_output;
957 
958 	ipv6_addr_prefix(&rt->rt6i_dst.addr,
959 			 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
960 	rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
961 	if (rt->rt6i_dst.plen == 128)
962 	       rt->u.dst.flags = DST_HOST;
963 
964 #ifdef CONFIG_IPV6_SUBTREES
965 	ipv6_addr_prefix(&rt->rt6i_src.addr,
966 			 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
967 	rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
968 #endif
969 
970 	rt->rt6i_metric = rtmsg->rtmsg_metric;
971 
972 	/* We cannot add true routes via loopback here,
973 	   they would result in kernel looping; promote them to reject routes
974 	 */
975 	if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
976 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
977 		/* hold loopback dev/idev if we haven't done so. */
978 		if (dev != &loopback_dev) {
979 			if (dev) {
980 				dev_put(dev);
981 				in6_dev_put(idev);
982 			}
983 			dev = &loopback_dev;
984 			dev_hold(dev);
985 			idev = in6_dev_get(dev);
986 			if (!idev) {
987 				err = -ENODEV;
988 				goto out;
989 			}
990 		}
991 		rt->u.dst.output = ip6_pkt_discard_out;
992 		rt->u.dst.input = ip6_pkt_discard;
993 		rt->u.dst.error = -ENETUNREACH;
994 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
995 		goto install_route;
996 	}
997 
998 	if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
999 		struct in6_addr *gw_addr;
1000 		int gwa_type;
1001 
1002 		gw_addr = &rtmsg->rtmsg_gateway;
1003 		ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
1004 		gwa_type = ipv6_addr_type(gw_addr);
1005 
1006 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1007 			struct rt6_info *grt;
1008 
1009 			/* IPv6 strictly inhibits using not link-local
1010 			   addresses as nexthop address.
1011 			   Otherwise, router will not able to send redirects.
1012 			   It is very good, but in some (rare!) circumstances
1013 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1014 			   some exceptions. --ANK
1015 			 */
1016 			err = -EINVAL;
1017 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1018 				goto out;
1019 
1020 			grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1021 
1022 			err = -EHOSTUNREACH;
1023 			if (grt == NULL)
1024 				goto out;
1025 			if (dev) {
1026 				if (dev != grt->rt6i_dev) {
1027 					dst_release(&grt->u.dst);
1028 					goto out;
1029 				}
1030 			} else {
1031 				dev = grt->rt6i_dev;
1032 				idev = grt->rt6i_idev;
1033 				dev_hold(dev);
1034 				in6_dev_hold(grt->rt6i_idev);
1035 			}
1036 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1037 				err = 0;
1038 			dst_release(&grt->u.dst);
1039 
1040 			if (err)
1041 				goto out;
1042 		}
1043 		err = -EINVAL;
1044 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1045 			goto out;
1046 	}
1047 
1048 	err = -ENODEV;
1049 	if (dev == NULL)
1050 		goto out;
1051 
1052 	if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1053 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1054 		if (IS_ERR(rt->rt6i_nexthop)) {
1055 			err = PTR_ERR(rt->rt6i_nexthop);
1056 			rt->rt6i_nexthop = NULL;
1057 			goto out;
1058 		}
1059 	}
1060 
1061 	rt->rt6i_flags = rtmsg->rtmsg_flags;
1062 
1063 install_route:
1064 	if (rta && rta[RTA_METRICS-1]) {
1065 		int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1066 		struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1067 
1068 		while (RTA_OK(attr, attrlen)) {
1069 			unsigned flavor = attr->rta_type;
1070 			if (flavor) {
1071 				if (flavor > RTAX_MAX) {
1072 					err = -EINVAL;
1073 					goto out;
1074 				}
1075 				rt->u.dst.metrics[flavor-1] =
1076 					*(u32 *)RTA_DATA(attr);
1077 			}
1078 			attr = RTA_NEXT(attr, attrlen);
1079 		}
1080 	}
1081 
1082 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1083 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1084 	if (!rt->u.dst.metrics[RTAX_MTU-1])
1085 		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1086 	if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1087 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1088 	rt->u.dst.dev = dev;
1089 	rt->rt6i_idev = idev;
1090 	return ip6_ins_rt(rt, nlh, _rtattr, req);
1091 
1092 out:
1093 	if (dev)
1094 		dev_put(dev);
1095 	if (idev)
1096 		in6_dev_put(idev);
1097 	if (rt)
1098 		dst_free((struct dst_entry *) rt);
1099 	return err;
1100 }
1101 
1102 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1103 {
1104 	int err;
1105 
1106 	write_lock_bh(&rt6_lock);
1107 
1108 	err = fib6_del(rt, nlh, _rtattr, req);
1109 	dst_release(&rt->u.dst);
1110 
1111 	write_unlock_bh(&rt6_lock);
1112 
1113 	return err;
1114 }
1115 
1116 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1117 {
1118 	struct fib6_node *fn;
1119 	struct rt6_info *rt;
1120 	int err = -ESRCH;
1121 
1122 	read_lock_bh(&rt6_lock);
1123 
1124 	fn = fib6_locate(&ip6_routing_table,
1125 			 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1126 			 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1127 
1128 	if (fn) {
1129 		for (rt = fn->leaf; rt; rt = rt->u.next) {
1130 			if (rtmsg->rtmsg_ifindex &&
1131 			    (rt->rt6i_dev == NULL ||
1132 			     rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1133 				continue;
1134 			if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1135 			    !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1136 				continue;
1137 			if (rtmsg->rtmsg_metric &&
1138 			    rtmsg->rtmsg_metric != rt->rt6i_metric)
1139 				continue;
1140 			dst_hold(&rt->u.dst);
1141 			read_unlock_bh(&rt6_lock);
1142 
1143 			return ip6_del_rt(rt, nlh, _rtattr, req);
1144 		}
1145 	}
1146 	read_unlock_bh(&rt6_lock);
1147 
1148 	return err;
1149 }
1150 
1151 /*
1152  *	Handle redirects
1153  */
1154 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1155 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1156 {
1157 	struct rt6_info *rt, *nrt = NULL;
1158 	int strict;
1159 	struct fib6_node *fn;
1160 	struct netevent_redirect netevent;
1161 
1162 	/*
1163 	 * Get the "current" route for this destination and
1164 	 * check if the redirect has come from approriate router.
1165 	 *
1166 	 * RFC 2461 specifies that redirects should only be
1167 	 * accepted if they come from the nexthop to the target.
1168 	 * Due to the way the routes are chosen, this notion
1169 	 * is a bit fuzzy and one might need to check all possible
1170 	 * routes.
1171 	 */
1172 	strict = ipv6_addr_type(dest) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL);
1173 
1174 	read_lock_bh(&rt6_lock);
1175 	fn = fib6_lookup(&ip6_routing_table, dest, NULL);
1176 restart:
1177 	for (rt = fn->leaf; rt; rt = rt->u.next) {
1178 		/*
1179 		 * Current route is on-link; redirect is always invalid.
1180 		 *
1181 		 * Seems, previous statement is not true. It could
1182 		 * be node, which looks for us as on-link (f.e. proxy ndisc)
1183 		 * But then router serving it might decide, that we should
1184 		 * know truth 8)8) --ANK (980726).
1185 		 */
1186 		if (rt6_check_expired(rt))
1187 			continue;
1188 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1189 			continue;
1190 		if (neigh->dev != rt->rt6i_dev)
1191 			continue;
1192 		if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1193 			continue;
1194 		break;
1195 	}
1196 	if (rt)
1197 		dst_hold(&rt->u.dst);
1198 	else if (strict) {
1199 		while ((fn = fn->parent) != NULL) {
1200 			if (fn->fn_flags & RTN_ROOT)
1201 				break;
1202 			if (fn->fn_flags & RTN_RTINFO)
1203 				goto restart;
1204 		}
1205 	}
1206 	read_unlock_bh(&rt6_lock);
1207 
1208 	if (!rt) {
1209 		if (net_ratelimit())
1210 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1211 			       "for redirect target\n");
1212 		return;
1213 	}
1214 
1215 	/*
1216 	 *	We have finally decided to accept it.
1217 	 */
1218 
1219 	neigh_update(neigh, lladdr, NUD_STALE,
1220 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1221 		     NEIGH_UPDATE_F_OVERRIDE|
1222 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1223 				     NEIGH_UPDATE_F_ISROUTER))
1224 		     );
1225 
1226 	/*
1227 	 * Redirect received -> path was valid.
1228 	 * Look, redirects are sent only in response to data packets,
1229 	 * so that this nexthop apparently is reachable. --ANK
1230 	 */
1231 	dst_confirm(&rt->u.dst);
1232 
1233 	/* Duplicate redirect: silently ignore. */
1234 	if (neigh == rt->u.dst.neighbour)
1235 		goto out;
1236 
1237 	nrt = ip6_rt_copy(rt);
1238 	if (nrt == NULL)
1239 		goto out;
1240 
1241 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1242 	if (on_link)
1243 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1244 
1245 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1246 	nrt->rt6i_dst.plen = 128;
1247 	nrt->u.dst.flags |= DST_HOST;
1248 
1249 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1250 	nrt->rt6i_nexthop = neigh_clone(neigh);
1251 	/* Reset pmtu, it may be better */
1252 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1253 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1254 
1255 	if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1256 		goto out;
1257 
1258 	netevent.old = &rt->u.dst;
1259 	netevent.new = &nrt->u.dst;
1260 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1261 
1262 	if (rt->rt6i_flags&RTF_CACHE) {
1263 		ip6_del_rt(rt, NULL, NULL, NULL);
1264 		return;
1265 	}
1266 
1267 out:
1268         dst_release(&rt->u.dst);
1269 	return;
1270 }
1271 
1272 /*
1273  *	Handle ICMP "packet too big" messages
1274  *	i.e. Path MTU discovery
1275  */
1276 
1277 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1278 			struct net_device *dev, u32 pmtu)
1279 {
1280 	struct rt6_info *rt, *nrt;
1281 	int allfrag = 0;
1282 
1283 	rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1284 	if (rt == NULL)
1285 		return;
1286 
1287 	if (pmtu >= dst_mtu(&rt->u.dst))
1288 		goto out;
1289 
1290 	if (pmtu < IPV6_MIN_MTU) {
1291 		/*
1292 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1293 		 * MTU (1280) and a fragment header should always be included
1294 		 * after a node receiving Too Big message reporting PMTU is
1295 		 * less than the IPv6 Minimum Link MTU.
1296 		 */
1297 		pmtu = IPV6_MIN_MTU;
1298 		allfrag = 1;
1299 	}
1300 
1301 	/* New mtu received -> path was valid.
1302 	   They are sent only in response to data packets,
1303 	   so that this nexthop apparently is reachable. --ANK
1304 	 */
1305 	dst_confirm(&rt->u.dst);
1306 
1307 	/* Host route. If it is static, it would be better
1308 	   not to override it, but add new one, so that
1309 	   when cache entry will expire old pmtu
1310 	   would return automatically.
1311 	 */
1312 	if (rt->rt6i_flags & RTF_CACHE) {
1313 		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1314 		if (allfrag)
1315 			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1316 		dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1317 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1318 		goto out;
1319 	}
1320 
1321 	/* Network route.
1322 	   Two cases are possible:
1323 	   1. It is connected route. Action: COW
1324 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1325 	 */
1326 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1327 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1328 	else
1329 		nrt = rt6_alloc_clone(rt, daddr);
1330 
1331 	if (nrt) {
1332 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1333 		if (allfrag)
1334 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1335 
1336 		/* According to RFC 1981, detecting PMTU increase shouldn't be
1337 		 * happened within 5 mins, the recommended timer is 10 mins.
1338 		 * Here this route expiration time is set to ip6_rt_mtu_expires
1339 		 * which is 10 mins. After 10 mins the decreased pmtu is expired
1340 		 * and detecting PMTU increase will be automatically happened.
1341 		 */
1342 		dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1343 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1344 
1345 		ip6_ins_rt(nrt, NULL, NULL, NULL);
1346 	}
1347 out:
1348 	dst_release(&rt->u.dst);
1349 }
1350 
1351 /*
1352  *	Misc support functions
1353  */
1354 
1355 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1356 {
1357 	struct rt6_info *rt = ip6_dst_alloc();
1358 
1359 	if (rt) {
1360 		rt->u.dst.input = ort->u.dst.input;
1361 		rt->u.dst.output = ort->u.dst.output;
1362 
1363 		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1364 		rt->u.dst.dev = ort->u.dst.dev;
1365 		if (rt->u.dst.dev)
1366 			dev_hold(rt->u.dst.dev);
1367 		rt->rt6i_idev = ort->rt6i_idev;
1368 		if (rt->rt6i_idev)
1369 			in6_dev_hold(rt->rt6i_idev);
1370 		rt->u.dst.lastuse = jiffies;
1371 		rt->rt6i_expires = 0;
1372 
1373 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1374 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1375 		rt->rt6i_metric = 0;
1376 
1377 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1378 #ifdef CONFIG_IPV6_SUBTREES
1379 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1380 #endif
1381 	}
1382 	return rt;
1383 }
1384 
1385 #ifdef CONFIG_IPV6_ROUTE_INFO
1386 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1387 					   struct in6_addr *gwaddr, int ifindex)
1388 {
1389 	struct fib6_node *fn;
1390 	struct rt6_info *rt = NULL;
1391 
1392 	write_lock_bh(&rt6_lock);
1393 	fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1394 	if (!fn)
1395 		goto out;
1396 
1397 	for (rt = fn->leaf; rt; rt = rt->u.next) {
1398 		if (rt->rt6i_dev->ifindex != ifindex)
1399 			continue;
1400 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1401 			continue;
1402 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1403 			continue;
1404 		dst_hold(&rt->u.dst);
1405 		break;
1406 	}
1407 out:
1408 	write_unlock_bh(&rt6_lock);
1409 	return rt;
1410 }
1411 
1412 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1413 					   struct in6_addr *gwaddr, int ifindex,
1414 					   unsigned pref)
1415 {
1416 	struct in6_rtmsg rtmsg;
1417 
1418 	memset(&rtmsg, 0, sizeof(rtmsg));
1419 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1420 	ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1421 	rtmsg.rtmsg_dst_len = prefixlen;
1422 	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1423 	rtmsg.rtmsg_metric = 1024;
1424 	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1425 	/* We should treat it as a default route if prefix length is 0. */
1426 	if (!prefixlen)
1427 		rtmsg.rtmsg_flags |= RTF_DEFAULT;
1428 	rtmsg.rtmsg_ifindex = ifindex;
1429 
1430 	ip6_route_add(&rtmsg, NULL, NULL, NULL);
1431 
1432 	return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1433 }
1434 #endif
1435 
1436 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1437 {
1438 	struct rt6_info *rt;
1439 	struct fib6_node *fn;
1440 
1441 	fn = &ip6_routing_table;
1442 
1443 	write_lock_bh(&rt6_lock);
1444 	for (rt = fn->leaf; rt; rt=rt->u.next) {
1445 		if (dev == rt->rt6i_dev &&
1446 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1447 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1448 			break;
1449 	}
1450 	if (rt)
1451 		dst_hold(&rt->u.dst);
1452 	write_unlock_bh(&rt6_lock);
1453 	return rt;
1454 }
1455 
1456 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1457 				     struct net_device *dev,
1458 				     unsigned int pref)
1459 {
1460 	struct in6_rtmsg rtmsg;
1461 
1462 	memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1463 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1464 	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1465 	rtmsg.rtmsg_metric = 1024;
1466 	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1467 			    RTF_PREF(pref);
1468 
1469 	rtmsg.rtmsg_ifindex = dev->ifindex;
1470 
1471 	ip6_route_add(&rtmsg, NULL, NULL, NULL);
1472 	return rt6_get_dflt_router(gwaddr, dev);
1473 }
1474 
1475 void rt6_purge_dflt_routers(void)
1476 {
1477 	struct rt6_info *rt;
1478 
1479 restart:
1480 	read_lock_bh(&rt6_lock);
1481 	for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1482 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1483 			dst_hold(&rt->u.dst);
1484 
1485 			read_unlock_bh(&rt6_lock);
1486 
1487 			ip6_del_rt(rt, NULL, NULL, NULL);
1488 
1489 			goto restart;
1490 		}
1491 	}
1492 	read_unlock_bh(&rt6_lock);
1493 }
1494 
1495 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1496 {
1497 	struct in6_rtmsg rtmsg;
1498 	int err;
1499 
1500 	switch(cmd) {
1501 	case SIOCADDRT:		/* Add a route */
1502 	case SIOCDELRT:		/* Delete a route */
1503 		if (!capable(CAP_NET_ADMIN))
1504 			return -EPERM;
1505 		err = copy_from_user(&rtmsg, arg,
1506 				     sizeof(struct in6_rtmsg));
1507 		if (err)
1508 			return -EFAULT;
1509 
1510 		rtnl_lock();
1511 		switch (cmd) {
1512 		case SIOCADDRT:
1513 			err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1514 			break;
1515 		case SIOCDELRT:
1516 			err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1517 			break;
1518 		default:
1519 			err = -EINVAL;
1520 		}
1521 		rtnl_unlock();
1522 
1523 		return err;
1524 	};
1525 
1526 	return -EINVAL;
1527 }
1528 
1529 /*
1530  *	Drop the packet on the floor
1531  */
1532 
1533 static int ip6_pkt_discard(struct sk_buff *skb)
1534 {
1535 	IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1536 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1537 	kfree_skb(skb);
1538 	return 0;
1539 }
1540 
1541 static int ip6_pkt_discard_out(struct sk_buff *skb)
1542 {
1543 	skb->dev = skb->dst->dev;
1544 	return ip6_pkt_discard(skb);
1545 }
1546 
1547 /*
1548  *	Allocate a dst for local (unicast / anycast) address.
1549  */
1550 
1551 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1552 				    const struct in6_addr *addr,
1553 				    int anycast)
1554 {
1555 	struct rt6_info *rt = ip6_dst_alloc();
1556 
1557 	if (rt == NULL)
1558 		return ERR_PTR(-ENOMEM);
1559 
1560 	dev_hold(&loopback_dev);
1561 	in6_dev_hold(idev);
1562 
1563 	rt->u.dst.flags = DST_HOST;
1564 	rt->u.dst.input = ip6_input;
1565 	rt->u.dst.output = ip6_output;
1566 	rt->rt6i_dev = &loopback_dev;
1567 	rt->rt6i_idev = idev;
1568 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1569 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1570 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1571 	rt->u.dst.obsolete = -1;
1572 
1573 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1574 	if (anycast)
1575 		rt->rt6i_flags |= RTF_ANYCAST;
1576 	else
1577 		rt->rt6i_flags |= RTF_LOCAL;
1578 	rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1579 	if (rt->rt6i_nexthop == NULL) {
1580 		dst_free((struct dst_entry *) rt);
1581 		return ERR_PTR(-ENOMEM);
1582 	}
1583 
1584 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1585 	rt->rt6i_dst.plen = 128;
1586 
1587 	atomic_set(&rt->u.dst.__refcnt, 1);
1588 
1589 	return rt;
1590 }
1591 
1592 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1593 {
1594 	if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1595 	    rt != &ip6_null_entry) {
1596 		RT6_TRACE("deleted by ifdown %p\n", rt);
1597 		return -1;
1598 	}
1599 	return 0;
1600 }
1601 
1602 void rt6_ifdown(struct net_device *dev)
1603 {
1604 	write_lock_bh(&rt6_lock);
1605 	fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1606 	write_unlock_bh(&rt6_lock);
1607 }
1608 
/* Argument bundle passed through fib6_clean_tree() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
1614 
1615 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1616 {
1617 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1618 	struct inet6_dev *idev;
1619 
1620 	/* In IPv6 pmtu discovery is not optional,
1621 	   so that RTAX_MTU lock cannot disable it.
1622 	   We still use this lock to block changes
1623 	   caused by addrconf/ndisc.
1624 	*/
1625 
1626 	idev = __in6_dev_get(arg->dev);
1627 	if (idev == NULL)
1628 		return 0;
1629 
1630 	/* For administrative MTU increase, there is no way to discover
1631 	   IPv6 PMTU increase, so PMTU increase should be updated here.
1632 	   Since RFC 1981 doesn't include administrative MTU increase
1633 	   update PMTU increase is a MUST. (i.e. jumbo frame)
1634 	 */
1635 	/*
1636 	   If new MTU is less than route PMTU, this new MTU will be the
1637 	   lowest MTU in the path, update the route PMTU to reflect PMTU
1638 	   decreases; if new MTU is greater than route PMTU, and the
1639 	   old MTU is the lowest MTU in the path, update the route PMTU
1640 	   to reflect the increase. In this case if the other nodes' MTU
1641 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
1642 	   PMTU discouvery.
1643 	 */
1644 	if (rt->rt6i_dev == arg->dev &&
1645 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1646             (dst_mtu(&rt->u.dst) > arg->mtu ||
1647              (dst_mtu(&rt->u.dst) < arg->mtu &&
1648 	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1649 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1650 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1651 	return 0;
1652 }
1653 
1654 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1655 {
1656 	struct rt6_mtu_change_arg arg;
1657 
1658 	arg.dev = dev;
1659 	arg.mtu = mtu;
1660 	read_lock_bh(&rt6_lock);
1661 	fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1662 	read_unlock_bh(&rt6_lock);
1663 }
1664 
1665 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1666 			      struct in6_rtmsg *rtmsg)
1667 {
1668 	memset(rtmsg, 0, sizeof(*rtmsg));
1669 
1670 	rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1671 	rtmsg->rtmsg_src_len = r->rtm_src_len;
1672 	rtmsg->rtmsg_flags = RTF_UP;
1673 	if (r->rtm_type == RTN_UNREACHABLE)
1674 		rtmsg->rtmsg_flags |= RTF_REJECT;
1675 
1676 	if (rta[RTA_GATEWAY-1]) {
1677 		if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1678 			return -EINVAL;
1679 		memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1680 		rtmsg->rtmsg_flags |= RTF_GATEWAY;
1681 	}
1682 	if (rta[RTA_DST-1]) {
1683 		if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1684 			return -EINVAL;
1685 		memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1686 	}
1687 	if (rta[RTA_SRC-1]) {
1688 		if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1689 			return -EINVAL;
1690 		memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1691 	}
1692 	if (rta[RTA_OIF-1]) {
1693 		if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1694 			return -EINVAL;
1695 		memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1696 	}
1697 	if (rta[RTA_PRIORITY-1]) {
1698 		if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1699 			return -EINVAL;
1700 		memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1701 	}
1702 	return 0;
1703 }
1704 
1705 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1706 {
1707 	struct rtmsg *r = NLMSG_DATA(nlh);
1708 	struct in6_rtmsg rtmsg;
1709 
1710 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1711 		return -EINVAL;
1712 	return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1713 }
1714 
1715 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1716 {
1717 	struct rtmsg *r = NLMSG_DATA(nlh);
1718 	struct in6_rtmsg rtmsg;
1719 
1720 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1721 		return -EINVAL;
1722 	return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1723 }
1724 
/* Per-call context handed to the FIB walker during a netlink route dump. */
struct rt6_rtnl_dump_arg
{
	struct sk_buff *skb;		/* buffer the dump is written into */
	struct netlink_callback *cb;	/* netlink callback state of the dump */
};
1730 
1731 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1732 			 struct in6_addr *dst, struct in6_addr *src,
1733 			 int iif, int type, u32 pid, u32 seq,
1734 			 int prefix, unsigned int flags)
1735 {
1736 	struct rtmsg *rtm;
1737 	struct nlmsghdr  *nlh;
1738 	unsigned char	 *b = skb->tail;
1739 	struct rta_cacheinfo ci;
1740 
1741 	if (prefix) {	/* user wants prefix routes only */
1742 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1743 			/* success since this is not a prefix route */
1744 			return 1;
1745 		}
1746 	}
1747 
1748 	nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1749 	rtm = NLMSG_DATA(nlh);
1750 	rtm->rtm_family = AF_INET6;
1751 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
1752 	rtm->rtm_src_len = rt->rt6i_src.plen;
1753 	rtm->rtm_tos = 0;
1754 	rtm->rtm_table = RT_TABLE_MAIN;
1755 	if (rt->rt6i_flags&RTF_REJECT)
1756 		rtm->rtm_type = RTN_UNREACHABLE;
1757 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1758 		rtm->rtm_type = RTN_LOCAL;
1759 	else
1760 		rtm->rtm_type = RTN_UNICAST;
1761 	rtm->rtm_flags = 0;
1762 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1763 	rtm->rtm_protocol = rt->rt6i_protocol;
1764 	if (rt->rt6i_flags&RTF_DYNAMIC)
1765 		rtm->rtm_protocol = RTPROT_REDIRECT;
1766 	else if (rt->rt6i_flags & RTF_ADDRCONF)
1767 		rtm->rtm_protocol = RTPROT_KERNEL;
1768 	else if (rt->rt6i_flags&RTF_DEFAULT)
1769 		rtm->rtm_protocol = RTPROT_RA;
1770 
1771 	if (rt->rt6i_flags&RTF_CACHE)
1772 		rtm->rtm_flags |= RTM_F_CLONED;
1773 
1774 	if (dst) {
1775 		RTA_PUT(skb, RTA_DST, 16, dst);
1776 	        rtm->rtm_dst_len = 128;
1777 	} else if (rtm->rtm_dst_len)
1778 		RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1779 #ifdef CONFIG_IPV6_SUBTREES
1780 	if (src) {
1781 		RTA_PUT(skb, RTA_SRC, 16, src);
1782 	        rtm->rtm_src_len = 128;
1783 	} else if (rtm->rtm_src_len)
1784 		RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1785 #endif
1786 	if (iif)
1787 		RTA_PUT(skb, RTA_IIF, 4, &iif);
1788 	else if (dst) {
1789 		struct in6_addr saddr_buf;
1790 		if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1791 			RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1792 	}
1793 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1794 		goto rtattr_failure;
1795 	if (rt->u.dst.neighbour)
1796 		RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1797 	if (rt->u.dst.dev)
1798 		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1799 	RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1800 	ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1801 	if (rt->rt6i_expires)
1802 		ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1803 	else
1804 		ci.rta_expires = 0;
1805 	ci.rta_used = rt->u.dst.__use;
1806 	ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1807 	ci.rta_error = rt->u.dst.error;
1808 	ci.rta_id = 0;
1809 	ci.rta_ts = 0;
1810 	ci.rta_tsage = 0;
1811 	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1812 	nlh->nlmsg_len = skb->tail - b;
1813 	return skb->len;
1814 
1815 nlmsg_failure:
1816 rtattr_failure:
1817 	skb_trim(skb, b - skb->data);
1818 	return -1;
1819 }
1820 
1821 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1822 {
1823 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1824 	int prefix;
1825 
1826 	if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1827 		struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1828 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1829 	} else
1830 		prefix = 0;
1831 
1832 	return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1833 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1834 		     prefix, NLM_F_MULTI);
1835 }
1836 
1837 static int fib6_dump_node(struct fib6_walker_t *w)
1838 {
1839 	int res;
1840 	struct rt6_info *rt;
1841 
1842 	for (rt = w->leaf; rt; rt = rt->u.next) {
1843 		res = rt6_dump_route(rt, w->args);
1844 		if (res < 0) {
1845 			/* Frame is full, suspend walking */
1846 			w->leaf = rt;
1847 			return 1;
1848 		}
1849 		BUG_TRAP(res!=0);
1850 	}
1851 	w->leaf = NULL;
1852 	return 0;
1853 }
1854 
1855 static void fib6_dump_end(struct netlink_callback *cb)
1856 {
1857 	struct fib6_walker_t *w = (void*)cb->args[0];
1858 
1859 	if (w) {
1860 		cb->args[0] = 0;
1861 		fib6_walker_unlink(w);
1862 		kfree(w);
1863 	}
1864 	cb->done = (void*)cb->args[1];
1865 	cb->args[1] = 0;
1866 }
1867 
1868 static int fib6_dump_done(struct netlink_callback *cb)
1869 {
1870 	fib6_dump_end(cb);
1871 	return cb->done ? cb->done(cb) : 0;
1872 }
1873 
1874 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1875 {
1876 	struct rt6_rtnl_dump_arg arg;
1877 	struct fib6_walker_t *w;
1878 	int res;
1879 
1880 	arg.skb = skb;
1881 	arg.cb = cb;
1882 
1883 	w = (void*)cb->args[0];
1884 	if (w == NULL) {
1885 		/* New dump:
1886 		 *
1887 		 * 1. hook callback destructor.
1888 		 */
1889 		cb->args[1] = (long)cb->done;
1890 		cb->done = fib6_dump_done;
1891 
1892 		/*
1893 		 * 2. allocate and initialize walker.
1894 		 */
1895 		w = kzalloc(sizeof(*w), GFP_ATOMIC);
1896 		if (w == NULL)
1897 			return -ENOMEM;
1898 		RT6_TRACE("dump<%p", w);
1899 		w->root = &ip6_routing_table;
1900 		w->func = fib6_dump_node;
1901 		w->args = &arg;
1902 		cb->args[0] = (long)w;
1903 		read_lock_bh(&rt6_lock);
1904 		res = fib6_walk(w);
1905 		read_unlock_bh(&rt6_lock);
1906 	} else {
1907 		w->args = &arg;
1908 		read_lock_bh(&rt6_lock);
1909 		res = fib6_walk_continue(w);
1910 		read_unlock_bh(&rt6_lock);
1911 	}
1912 #if RT6_DEBUG >= 3
1913 	if (res <= 0 && skb->len == 0)
1914 		RT6_TRACE("%p>dump end\n", w);
1915 #endif
1916 	res = res < 0 ? res : skb->len;
1917 	/* res < 0 is an error. (really, impossible)
1918 	   res == 0 means that dump is complete, but skb still can contain data.
1919 	   res > 0 dump is not complete, but frame is full.
1920 	 */
1921 	/* Destroy walker, if dump of this table is complete. */
1922 	if (res <= 0)
1923 		fib6_dump_end(cb);
1924 	return res;
1925 }
1926 
1927 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1928 {
1929 	struct rtattr **rta = arg;
1930 	int iif = 0;
1931 	int err = -ENOBUFS;
1932 	struct sk_buff *skb;
1933 	struct flowi fl;
1934 	struct rt6_info *rt;
1935 
1936 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1937 	if (skb == NULL)
1938 		goto out;
1939 
1940 	/* Reserve room for dummy headers, this skb can pass
1941 	   through good chunk of routing engine.
1942 	 */
1943 	skb->mac.raw = skb->data;
1944 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1945 
1946 	memset(&fl, 0, sizeof(fl));
1947 	if (rta[RTA_SRC-1])
1948 		ipv6_addr_copy(&fl.fl6_src,
1949 			       (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1950 	if (rta[RTA_DST-1])
1951 		ipv6_addr_copy(&fl.fl6_dst,
1952 			       (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1953 
1954 	if (rta[RTA_IIF-1])
1955 		memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1956 
1957 	if (iif) {
1958 		struct net_device *dev;
1959 		dev = __dev_get_by_index(iif);
1960 		if (!dev) {
1961 			err = -ENODEV;
1962 			goto out_free;
1963 		}
1964 	}
1965 
1966 	fl.oif = 0;
1967 	if (rta[RTA_OIF-1])
1968 		memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1969 
1970 	rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1971 
1972 	skb->dst = &rt->u.dst;
1973 
1974 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1975 	err = rt6_fill_node(skb, rt,
1976 			    &fl.fl6_dst, &fl.fl6_src,
1977 			    iif,
1978 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1979 			    nlh->nlmsg_seq, 0, 0);
1980 	if (err < 0) {
1981 		err = -EMSGSIZE;
1982 		goto out_free;
1983 	}
1984 
1985 	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1986 	if (err > 0)
1987 		err = 0;
1988 out:
1989 	return err;
1990 out_free:
1991 	kfree_skb(skb);
1992 	goto out;
1993 }
1994 
1995 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
1996 			struct netlink_skb_parms *req)
1997 {
1998 	struct sk_buff *skb;
1999 	int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
2000 	u32 pid = current->pid;
2001 	u32 seq = 0;
2002 
2003 	if (req)
2004 		pid = req->pid;
2005 	if (nlh)
2006 		seq = nlh->nlmsg_seq;
2007 
2008 	skb = alloc_skb(size, gfp_any());
2009 	if (!skb) {
2010 		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
2011 		return;
2012 	}
2013 	if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
2014 		kfree_skb(skb);
2015 		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
2016 		return;
2017 	}
2018 	NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2019 	netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2020 }
2021 
2022 /*
2023  *	/proc
2024  */
2025 
2026 #ifdef CONFIG_PROC_FS
2027 
/* Record size that rt6_info_route() and rt6_proc_info() use for offset arithmetic. */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State shared between rt6_proc_info() and its per-route callback. */
struct rt6_proc_arg
{
	char *buffer;	/* output buffer */
	int offset;	/* requested read offset */
	int length;	/* requested read length */
	int skip;	/* routes skipped so far to reach the offset */
	int len;	/* bytes written to buffer so far */
};
2038 
2039 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2040 {
2041 	struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2042 	int i;
2043 
2044 	if (arg->skip < arg->offset / RT6_INFO_LEN) {
2045 		arg->skip++;
2046 		return 0;
2047 	}
2048 
2049 	if (arg->len >= arg->length)
2050 		return 0;
2051 
2052 	for (i=0; i<16; i++) {
2053 		sprintf(arg->buffer + arg->len, "%02x",
2054 			rt->rt6i_dst.addr.s6_addr[i]);
2055 		arg->len += 2;
2056 	}
2057 	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2058 			    rt->rt6i_dst.plen);
2059 
2060 #ifdef CONFIG_IPV6_SUBTREES
2061 	for (i=0; i<16; i++) {
2062 		sprintf(arg->buffer + arg->len, "%02x",
2063 			rt->rt6i_src.addr.s6_addr[i]);
2064 		arg->len += 2;
2065 	}
2066 	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2067 			    rt->rt6i_src.plen);
2068 #else
2069 	sprintf(arg->buffer + arg->len,
2070 		"00000000000000000000000000000000 00 ");
2071 	arg->len += 36;
2072 #endif
2073 
2074 	if (rt->rt6i_nexthop) {
2075 		for (i=0; i<16; i++) {
2076 			sprintf(arg->buffer + arg->len, "%02x",
2077 				rt->rt6i_nexthop->primary_key[i]);
2078 			arg->len += 2;
2079 		}
2080 	} else {
2081 		sprintf(arg->buffer + arg->len,
2082 			"00000000000000000000000000000000");
2083 		arg->len += 32;
2084 	}
2085 	arg->len += sprintf(arg->buffer + arg->len,
2086 			    " %08x %08x %08x %08x %8s\n",
2087 			    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2088 			    rt->u.dst.__use, rt->rt6i_flags,
2089 			    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2090 	return 0;
2091 }
2092 
2093 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2094 {
2095 	struct rt6_proc_arg arg;
2096 	arg.buffer = buffer;
2097 	arg.offset = offset;
2098 	arg.length = length;
2099 	arg.skip = 0;
2100 	arg.len = 0;
2101 
2102 	read_lock_bh(&rt6_lock);
2103 	fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2104 	read_unlock_bh(&rt6_lock);
2105 
2106 	*start = buffer;
2107 	if (offset)
2108 		*start += offset % RT6_INFO_LEN;
2109 
2110 	arg.len -= offset % RT6_INFO_LEN;
2111 
2112 	if (arg.len > length)
2113 		arg.len = length;
2114 	if (arg.len < 0)
2115 		arg.len = 0;
2116 
2117 	return arg.len;
2118 }
2119 
2120 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2121 {
2122 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2123 		      rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2124 		      rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2125 		      rt6_stats.fib_rt_cache,
2126 		      atomic_read(&ip6_dst_ops.entries),
2127 		      rt6_stats.fib_discarded_routes);
2128 
2129 	return 0;
2130 }
2131 
2132 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2133 {
2134 	return single_open(file, rt6_stats_seq_show, NULL);
2135 }
2136 
2137 static struct file_operations rt6_stats_seq_fops = {
2138 	.owner	 = THIS_MODULE,
2139 	.open	 = rt6_stats_seq_open,
2140 	.read	 = seq_read,
2141 	.llseek	 = seq_lseek,
2142 	.release = single_release,
2143 };
2144 #endif	/* CONFIG_PROC_FS */
2145 
2146 #ifdef CONFIG_SYSCTL
2147 
2148 static int flush_delay;
2149 
2150 static
2151 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2152 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2153 {
2154 	if (write) {
2155 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2156 		fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2157 		return 0;
2158 	} else
2159 		return -EINVAL;
2160 }
2161 
2162 ctl_table ipv6_route_table[] = {
2163         {
2164 		.ctl_name	=	NET_IPV6_ROUTE_FLUSH,
2165 		.procname	=	"flush",
2166          	.data		=	&flush_delay,
2167 		.maxlen		=	sizeof(int),
2168 		.mode		=	0200,
2169          	.proc_handler	=	&ipv6_sysctl_rtcache_flush
2170 	},
2171 	{
2172 		.ctl_name	=	NET_IPV6_ROUTE_GC_THRESH,
2173 		.procname	=	"gc_thresh",
2174          	.data		=	&ip6_dst_ops.gc_thresh,
2175 		.maxlen		=	sizeof(int),
2176 		.mode		=	0644,
2177          	.proc_handler	=	&proc_dointvec,
2178 	},
2179 	{
2180 		.ctl_name	=	NET_IPV6_ROUTE_MAX_SIZE,
2181 		.procname	=	"max_size",
2182          	.data		=	&ip6_rt_max_size,
2183 		.maxlen		=	sizeof(int),
2184 		.mode		=	0644,
2185          	.proc_handler	=	&proc_dointvec,
2186 	},
2187 	{
2188 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2189 		.procname	=	"gc_min_interval",
2190          	.data		=	&ip6_rt_gc_min_interval,
2191 		.maxlen		=	sizeof(int),
2192 		.mode		=	0644,
2193          	.proc_handler	=	&proc_dointvec_jiffies,
2194 		.strategy	=	&sysctl_jiffies,
2195 	},
2196 	{
2197 		.ctl_name	=	NET_IPV6_ROUTE_GC_TIMEOUT,
2198 		.procname	=	"gc_timeout",
2199          	.data		=	&ip6_rt_gc_timeout,
2200 		.maxlen		=	sizeof(int),
2201 		.mode		=	0644,
2202          	.proc_handler	=	&proc_dointvec_jiffies,
2203 		.strategy	=	&sysctl_jiffies,
2204 	},
2205 	{
2206 		.ctl_name	=	NET_IPV6_ROUTE_GC_INTERVAL,
2207 		.procname	=	"gc_interval",
2208          	.data		=	&ip6_rt_gc_interval,
2209 		.maxlen		=	sizeof(int),
2210 		.mode		=	0644,
2211          	.proc_handler	=	&proc_dointvec_jiffies,
2212 		.strategy	=	&sysctl_jiffies,
2213 	},
2214 	{
2215 		.ctl_name	=	NET_IPV6_ROUTE_GC_ELASTICITY,
2216 		.procname	=	"gc_elasticity",
2217          	.data		=	&ip6_rt_gc_elasticity,
2218 		.maxlen		=	sizeof(int),
2219 		.mode		=	0644,
2220          	.proc_handler	=	&proc_dointvec_jiffies,
2221 		.strategy	=	&sysctl_jiffies,
2222 	},
2223 	{
2224 		.ctl_name	=	NET_IPV6_ROUTE_MTU_EXPIRES,
2225 		.procname	=	"mtu_expires",
2226          	.data		=	&ip6_rt_mtu_expires,
2227 		.maxlen		=	sizeof(int),
2228 		.mode		=	0644,
2229          	.proc_handler	=	&proc_dointvec_jiffies,
2230 		.strategy	=	&sysctl_jiffies,
2231 	},
2232 	{
2233 		.ctl_name	=	NET_IPV6_ROUTE_MIN_ADVMSS,
2234 		.procname	=	"min_adv_mss",
2235          	.data		=	&ip6_rt_min_advmss,
2236 		.maxlen		=	sizeof(int),
2237 		.mode		=	0644,
2238          	.proc_handler	=	&proc_dointvec_jiffies,
2239 		.strategy	=	&sysctl_jiffies,
2240 	},
2241 	{
2242 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2243 		.procname	=	"gc_min_interval_ms",
2244          	.data		=	&ip6_rt_gc_min_interval,
2245 		.maxlen		=	sizeof(int),
2246 		.mode		=	0644,
2247          	.proc_handler	=	&proc_dointvec_ms_jiffies,
2248 		.strategy	=	&sysctl_ms_jiffies,
2249 	},
2250 	{ .ctl_name = 0 }
2251 };
2252 
2253 #endif
2254 
2255 void __init ip6_route_init(void)
2256 {
2257 	struct proc_dir_entry *p;
2258 
2259 	ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2260 						     sizeof(struct rt6_info),
2261 						     0, SLAB_HWCACHE_ALIGN,
2262 						     NULL, NULL);
2263 	if (!ip6_dst_ops.kmem_cachep)
2264 		panic("cannot create ip6_dst_cache");
2265 
2266 	fib6_init();
2267 #ifdef 	CONFIG_PROC_FS
2268 	p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2269 	if (p)
2270 		p->owner = THIS_MODULE;
2271 
2272 	proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2273 #endif
2274 #ifdef CONFIG_XFRM
2275 	xfrm6_init();
2276 #endif
2277 }
2278 
2279 void ip6_route_cleanup(void)
2280 {
2281 #ifdef CONFIG_PROC_FS
2282 	proc_net_remove("ipv6_route");
2283 	proc_net_remove("rt6_stats");
2284 #endif
2285 #ifdef CONFIG_XFRM
2286 	xfrm6_fini();
2287 #endif
2288 	rt6_ifdown(NULL);
2289 	fib6_gc_cleanup();
2290 	kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2291 }
2292