xref: /linux/net/ipv6/route.c (revision 14b42963f64b98ab61fa9723c03d71aa5ef4f862)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15 
16 /*	Changes:
17  *
18  *	YOSHIFUJI Hideaki @USAGI
19  *		reworked default router selection.
20  *		- respect outgoing interface
21  *		- select from (probably) reachable routers (i.e.
22  *		routers in REACHABLE, STALE, DELAY or PROBE states).
23  *		- always select the same router if it is (probably)
24  *		reachable.  otherwise, round-robin the list.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/init.h>
38 #include <linux/netlink.h>
39 #include <linux/if_arp.h>
40 
41 #ifdef 	CONFIG_PROC_FS
42 #include <linux/proc_fs.h>
43 #include <linux/seq_file.h>
44 #endif
45 
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 
57 #include <asm/uaccess.h>
58 
59 #ifdef CONFIG_SYSCTL
60 #include <linux/sysctl.h>
61 #endif
62 
/*
 * Debug tracing switch.  When RT6_DEBUG is 3 or more, RDBG() and
 * RT6_TRACE() expand to printk() calls; otherwise both expand to nothing.
 */
63 /* Set to 3 to get tracing. */
64 #define RT6_DEBUG 2
65 
66 #if RT6_DEBUG >= 3
67 #define RDBG(x) printk x
68 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
69 #else
70 #define RDBG(x)
71 #define RT6_TRACE(x...) do { ; } while (0)
72 #endif
73 
/*
 * When non-zero, ip6_route_input()/ip6_route_output() clone a matched
 * route that already has a nexthop via rt6_alloc_clone(); when zero they
 * use the matched route directly.
 */
74 #define CLONE_OFFLINK_ROUTE 0
75 
/*
 * Flag bits carried in the "strict" argument of rt6_select() and
 * rt6_score_route():
 *   RT6_SELECT_F_IFACE     - reject routes whose device does not match oif
 *   RT6_SELECT_F_REACHABLE - reject routes for which rt6_check_neigh()
 *                            returns 0
 */
76 #define RT6_SELECT_F_IFACE	0x1
77 #define RT6_SELECT_F_REACHABLE	0x2
78 
/*
 * Tunables.  The HZ-based values are expressed in jiffies.
 * ip6_rt_max_size, ip6_rt_gc_min_interval, ip6_rt_gc_timeout and
 * ip6_rt_gc_elasticity are consumed by ip6_dst_gc(); ip6_rt_min_advmss
 * is the lower bound applied by ipv6_advmss() (presumably the minimum
 * MTU less 20 bytes of TCP and 40 bytes of IPv6 header - confirm).
 */
79 static int ip6_rt_max_size = 4096;
80 static int ip6_rt_gc_min_interval = HZ / 2;
81 static int ip6_rt_gc_timeout = 60*HZ;
82 int ip6_rt_gc_interval = 30*HZ;
83 static int ip6_rt_gc_elasticity = 9;
84 static int ip6_rt_mtu_expires = 10*60*HZ;
85 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
86 
/*
 * Forward declarations: the callbacks installed in ip6_dst_ops and the
 * null entry below, plus helpers that are used before their definition.
 */
87 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
88 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
89 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
90 static void		ip6_dst_destroy(struct dst_entry *);
91 static void		ip6_dst_ifdown(struct dst_entry *,
92 				       struct net_device *dev, int how);
93 static int		 ip6_dst_gc(void);
94 
95 static int		ip6_pkt_discard(struct sk_buff *skb);
96 static int		ip6_pkt_discard_out(struct sk_buff *skb);
97 static void		ip6_link_failure(struct sk_buff *skb);
98 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
99 
/* Helpers used by rt6_route_rcv() to find or create a route-info route. */
100 #ifdef CONFIG_IPV6_ROUTE_INFO
101 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
102 					   struct in6_addr *gwaddr, int ifindex,
103 					   unsigned pref);
104 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
105 					   struct in6_addr *gwaddr, int ifindex);
106 #endif
107 
/*
 * Destination-cache operations for IPv6 routes.  ip6_dst_alloc() hands
 * this table to dst_alloc(), and entry_size makes each allocation large
 * enough for a struct rt6_info.
 */
108 static struct dst_ops ip6_dst_ops = {
109 	.family			=	AF_INET6,
110 	.protocol		=	__constant_htons(ETH_P_IPV6),
111 	.gc			=	ip6_dst_gc,
112 	.gc_thresh		=	1024,
113 	.check			=	ip6_dst_check,
114 	.destroy		=	ip6_dst_destroy,
115 	.ifdown			=	ip6_dst_ifdown,
116 	.negative_advice	=	ip6_negative_advice,
117 	.link_failure		=	ip6_link_failure,
118 	.update_pmtu		=	ip6_rt_update_pmtu,
119 	.entry_size		=	sizeof(struct rt6_info),
120 };
121 
/*
 * Sentinel route returned by the lookup helpers in this file when nothing
 * matches.  It is a reject route (RTF_REJECT | RTF_NONEXTHOP) bound to the
 * loopback device, carries error -ENETUNREACH, discards packets on both
 * input and output, and starts with one reference.  Its path pointer
 * refers back to itself.
 */
122 struct rt6_info ip6_null_entry = {
123 	.u = {
124 		.dst = {
125 			.__refcnt	= ATOMIC_INIT(1),
126 			.__use		= 1,
127 			.dev		= &loopback_dev,
128 			.obsolete	= -1,
129 			.error		= -ENETUNREACH,
130 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
131 			.input		= ip6_pkt_discard,
132 			.output		= ip6_pkt_discard_out,
133 			.ops		= &ip6_dst_ops,
134 			.path		= (struct dst_entry*)&ip6_null_entry,
135 		}
136 	},
137 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
138 	.rt6i_metric	= ~(u32) 0,
139 	.rt6i_ref	= ATOMIC_INIT(1),
140 };
141 
/*
 * Root node of the IPv6 FIB tree used by every lookup, add and delete in
 * this file.  Its leaf is initially the null entry.
 */
142 struct fib6_node ip6_routing_table = {
143 	.leaf		= &ip6_null_entry,
144 	.fn_flags	= RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
145 };
146 
147 /*
 * Protects all the ip6 fib.  Lookups in this file take it for reading;
 * ip6_ins_rt(), ip6_del_rt() and ndisc_dst_alloc() take it for writing.
 */
148 
149 DEFINE_RWLOCK(rt6_lock);
150 
151 
152 /* allocate dst with ip6_dst_ops */
153 static __inline__ struct rt6_info *ip6_dst_alloc(void)
154 {
155 	return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
156 }
157 
158 static void ip6_dst_destroy(struct dst_entry *dst)
159 {
160 	struct rt6_info *rt = (struct rt6_info *)dst;
161 	struct inet6_dev *idev = rt->rt6i_idev;
162 
163 	if (idev != NULL) {
164 		rt->rt6i_idev = NULL;
165 		in6_dev_put(idev);
166 	}
167 }
168 
169 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
170 			   int how)
171 {
172 	struct rt6_info *rt = (struct rt6_info *)dst;
173 	struct inet6_dev *idev = rt->rt6i_idev;
174 
175 	if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
176 		struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
177 		if (loopback_idev != NULL) {
178 			rt->rt6i_idev = loopback_idev;
179 			in6_dev_put(idev);
180 		}
181 	}
182 }
183 
184 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
185 {
186 	return (rt->rt6i_flags & RTF_EXPIRES &&
187 		time_after(jiffies, rt->rt6i_expires));
188 }
189 
190 /*
191  *	Route lookup. Any rt6_lock is implied.
192  */
193 
194 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
195 						    int oif,
196 						    int strict)
197 {
198 	struct rt6_info *local = NULL;
199 	struct rt6_info *sprt;
200 
201 	if (oif) {
202 		for (sprt = rt; sprt; sprt = sprt->u.next) {
203 			struct net_device *dev = sprt->rt6i_dev;
204 			if (dev->ifindex == oif)
205 				return sprt;
206 			if (dev->flags & IFF_LOOPBACK) {
207 				if (sprt->rt6i_idev == NULL ||
208 				    sprt->rt6i_idev->dev->ifindex != oif) {
209 					if (strict && oif)
210 						continue;
211 					if (local && (!oif ||
212 						      local->rt6i_idev->dev->ifindex == oif))
213 						continue;
214 				}
215 				local = sprt;
216 			}
217 		}
218 
219 		if (local)
220 			return local;
221 
222 		if (strict)
223 			return &ip6_null_entry;
224 	}
225 	return rt;
226 }
227 
/*
 * rt6_probe - router reachability probing.
 *
 * With CONFIG_IPV6_ROUTER_PREF: if @rt has a nexthop neighbour that is
 * not in a NUD_VALID state, and more than the interface's
 * rtr_probe_interval has elapsed since neigh->updated, stamp
 * neigh->updated with the current time and send a Neighbour Solicitation
 * for the neighbour's address to its solicited-node multicast address.
 * A NULL @rt or a route without nexthop is ignored.
 *
 * Without CONFIG_IPV6_ROUTER_PREF this is a no-op.
 */
228 #ifdef CONFIG_IPV6_ROUTER_PREF
229 static void rt6_probe(struct rt6_info *rt)
230 {
231 	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
232 	/*
233 	 * Okay, this does not seem to be appropriate
234 	 * for now, however, we need to check if it
235 	 * is really so; aka Router Reachability Probing.
236 	 *
237 	 * Router Reachability Probe MUST be rate-limited
238 	 * to no more than one per minute.
239 	 */
	/* Cheap unlocked check first; re-checked under the lock below. */
240 	if (!neigh || (neigh->nud_state & NUD_VALID))
241 		return;
242 	read_lock_bh(&neigh->lock);
243 	if (!(neigh->nud_state & NUD_VALID) &&
244 	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
245 		struct in6_addr mcaddr;
246 		struct in6_addr *target;
247 
		/*
		 * NOTE(review): neigh->updated is written while only the
		 * read side of neigh->lock is held - confirm this is
		 * intended.
		 */
248 		neigh->updated = jiffies;
249 		read_unlock_bh(&neigh->lock);
250 
		/* The lock is dropped before the solicitation is sent. */
251 		target = (struct in6_addr *)&neigh->primary_key;
252 		addrconf_addr_solict_mult(target, &mcaddr);
253 		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
254 	} else
255 		read_unlock_bh(&neigh->lock);
256 }
257 #else
258 static inline void rt6_probe(struct rt6_info *rt)
259 {
260 	return;
261 }
262 #endif
263 
264 /*
265  * Default Router Selection (RFC 2461 6.3.6)
266  */
267 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
268 {
269 	struct net_device *dev = rt->rt6i_dev;
270 	if (!oif || dev->ifindex == oif)
271 		return 2;
272 	if ((dev->flags & IFF_LOOPBACK) &&
273 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
274 		return 1;
275 	return 0;
276 }
277 
278 static int inline rt6_check_neigh(struct rt6_info *rt)
279 {
280 	struct neighbour *neigh = rt->rt6i_nexthop;
281 	int m = 0;
282 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
283 	    !(rt->rt6i_flags & RTF_GATEWAY))
284 		m = 1;
285 	else if (neigh) {
286 		read_lock_bh(&neigh->lock);
287 		if (neigh->nud_state & NUD_VALID)
288 			m = 2;
289 		read_unlock_bh(&neigh->lock);
290 	}
291 	return m;
292 }
293 
294 static int rt6_score_route(struct rt6_info *rt, int oif,
295 			   int strict)
296 {
297 	int m, n;
298 
299 	m = rt6_check_dev(rt, oif);
300 	if (!m && (strict & RT6_SELECT_F_IFACE))
301 		return -1;
302 #ifdef CONFIG_IPV6_ROUTER_PREF
303 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
304 #endif
305 	n = rt6_check_neigh(rt);
306 	if (n > 1)
307 		m |= 16;
308 	else if (!n && strict & RT6_SELECT_F_REACHABLE)
309 		return -1;
310 	return m;
311 }
312 
/*
 * rt6_select - pick the best route among the entries at *head.
 * @head:   address of the list head (a fib6 node's leaf pointer)
 * @oif:    requested interface index, 0 for any
 * @strict: RT6_SELECT_F_* flags forwarded to rt6_score_route()
 *
 * Only entries sharing the metric of the first entry are considered, and
 * expired entries are skipped.  The entry with the highest
 * rt6_score_route() result wins; entries that lose are handed to
 * rt6_probe().  If nothing matched, RT6_SELECT_F_REACHABLE was requested
 * and more than one candidate was seen, the first entry is moved behind
 * the last candidate (round-robin) so the next lookup starts elsewhere.
 *
 * Returns the selected route, or &ip6_null_entry when none qualifies.
 *
 * NOTE(review): the trace statement tolerates head == NULL but *head is
 * dereferenced unconditionally, and rt0->rt6i_metric is read without a
 * NULL check - callers presumably always pass a non-empty leaf; confirm.
 */
313 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
314 				   int strict)
315 {
316 	struct rt6_info *match = NULL, *last = NULL;
317 	struct rt6_info *rt, *rt0 = *head;
318 	u32 metric;
319 	int mpri = -1;
320 
321 	RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
322 		  __FUNCTION__, head, head ? *head : NULL, oif);
323 
	/*
	 * Stop at the end of the list, at the first entry with a different
	 * metric, or if the walk comes back to rt0 after at least one
	 * candidate has been recorded in "last".
	 */
324 	for (rt = rt0, metric = rt0->rt6i_metric;
325 	     rt && rt->rt6i_metric == metric && (!last || rt != rt0);
326 	     rt = rt->u.next) {
327 		int m;
328 
329 		if (rt6_check_expired(rt))
330 			continue;
331 
332 		last = rt;
333 
334 		m = rt6_score_route(rt, oif, strict);
335 		if (m < 0)
336 			continue;
337 
338 		if (m > mpri) {
			/* previous best (possibly NULL) lost: probe it */
339 			rt6_probe(match);
340 			match = rt;
341 			mpri = m;
342 		} else {
343 			rt6_probe(rt);
344 		}
345 	}
346 
347 	if (!match &&
348 	    (strict & RT6_SELECT_F_REACHABLE) &&
349 	    last && last != rt0) {
350 		/* no entries matched; do round-robin */
		/*
		 * The visible callers hold rt6_lock only for reading, so
		 * the relinking is serialised by this function-local
		 * spinlock.
		 */
351 		static DEFINE_SPINLOCK(lock);
352 		spin_lock(&lock);
353 		*head = rt0->u.next;
354 		rt0->u.next = last->u.next;
355 		last->u.next = rt0;
356 		spin_unlock(&lock);
357 	}
358 
359 	RT6_TRACE("%s() => %p, score=%d\n",
360 		  __FUNCTION__, match, mpri);
361 
362 	return (match ? match : &ip6_null_entry);
363 }
364 
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - handle a received Route Information option (RFC 4191).
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router, used as the gateway
 *
 * Validates the option, then looks up the matching route-info route for
 * (prefix, gateway, interface).  A zero lifetime deletes an existing
 * route; a non-zero lifetime creates the route if it is missing or
 * refreshes the preference of an existing one.  The expiry is then set
 * from the lifetime (0xffffffff means "never expires").
 *
 * Returns 0 on success, -EINVAL when the option is malformed or carries
 * the reserved preference value, in which case RFC 4191 section 2.3
 * requires the option to be ignored.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	/*
	 * Unlike the default-router preference in the RA header (where the
	 * reserved value is treated as Medium), a Route Information option
	 * with the reserved preference MUST be ignored.
	 */
	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	/* The lifetime arrives in network byte order. */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt, NULL, NULL, NULL);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
442 
443 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
444 			    int oif, int strict)
445 {
446 	struct fib6_node *fn;
447 	struct rt6_info *rt;
448 
449 	read_lock_bh(&rt6_lock);
450 	fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
451 	rt = rt6_device_match(fn->leaf, oif, strict);
452 	dst_hold(&rt->u.dst);
453 	rt->u.dst.__use++;
454 	read_unlock_bh(&rt6_lock);
455 
456 	rt->u.dst.lastuse = jiffies;
457 	if (rt->u.dst.error == 0)
458 		return rt;
459 	dst_release(&rt->u.dst);
460 	return NULL;
461 }
462 
463 /* ip6_ins_rt is called with FREE rt6_lock.
464    It takes new route entry, the addition fails by any reason the
465    route is freed. In any case, if caller does not hold it, it may
466    be destroyed.
467  */
468 
469 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
470 		void *_rtattr, struct netlink_skb_parms *req)
471 {
472 	int err;
473 
474 	write_lock_bh(&rt6_lock);
475 	err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
476 	write_unlock_bh(&rt6_lock);
477 
478 	return err;
479 }
480 
481 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
482 				      struct in6_addr *saddr)
483 {
484 	struct rt6_info *rt;
485 
486 	/*
487 	 *	Clone the route.
488 	 */
489 
490 	rt = ip6_rt_copy(ort);
491 
492 	if (rt) {
493 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
494 			if (rt->rt6i_dst.plen != 128 &&
495 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
496 				rt->rt6i_flags |= RTF_ANYCAST;
497 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
498 		}
499 
500 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
501 		rt->rt6i_dst.plen = 128;
502 		rt->rt6i_flags |= RTF_CACHE;
503 		rt->u.dst.flags |= DST_HOST;
504 
505 #ifdef CONFIG_IPV6_SUBTREES
506 		if (rt->rt6i_src.plen && saddr) {
507 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
508 			rt->rt6i_src.plen = 128;
509 		}
510 #endif
511 
512 		rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
513 
514 	}
515 
516 	return rt;
517 }
518 
519 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
520 {
521 	struct rt6_info *rt = ip6_rt_copy(ort);
522 	if (rt) {
523 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
524 		rt->rt6i_dst.plen = 128;
525 		rt->rt6i_flags |= RTF_CACHE;
526 		if (rt->rt6i_flags & RTF_REJECT)
527 			rt->u.dst.error = ort->u.dst.error;
528 		rt->u.dst.flags |= DST_HOST;
529 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
530 	}
531 	return rt;
532 }
533 
/*
 * BACKTRACK - climb the FIB tree after a failed selection.
 *
 * Expects locals "rt" and "fn" and labels "out" and "restart" in the
 * expanding function.  When rt is the null entry, walk fn up through its
 * parents: reaching a node flagged RTN_ROOT jumps to "out"; the first
 * ancestor flagged RTN_RTINFO jumps to "restart" with fn pointing at it.
 * If the parent chain ends first, execution simply falls through.
 */
534 #define BACKTRACK() \
535 if (rt == &ip6_null_entry) { \
536        while ((fn = fn->parent) != NULL) { \
537 		if (fn->fn_flags & RTN_ROOT) { \
538 			goto out; \
539 		} \
540 		if (fn->fn_flags & RTN_RTINFO) \
541 			goto restart; \
542 	} \
543 }
544 
545 
/*
 * ip6_route_input - attach a route to a received packet.
 * @skb: packet whose IPv6 header supplies destination and source
 *
 * Selects a route for the packet's destination using the receiving
 * device's ifindex as the interface hint, first requiring a reachable
 * nexthop and then, if that fails, without that requirement.  When the
 * chosen route is neither cached nor the null entry and has no nexthop,
 * a per-destination copy is created with rt6_alloc_cow() and inserted
 * into the table; insertion failures are retried a limited number of
 * times.  The resulting route (possibly the null entry) is stored in
 * skb->dst with a reference held.
 */
546 void ip6_route_input(struct sk_buff *skb)
547 {
548 	struct fib6_node *fn;
549 	struct rt6_info *rt, *nrt;
550 	int strict;
551 	int attempts = 3;
552 	int err;
553 	int reachable = RT6_SELECT_F_REACHABLE;
554 
	/* multicast and link-local destinations require a matching interface */
555 	strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
556 
557 relookup:
558 	read_lock_bh(&rt6_lock);
559 
560 restart_2:
561 	fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
562 			 &skb->nh.ipv6h->saddr);
563 
564 restart:
565 	rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
	/* nothing usable at this node: try less specific ancestors */
566 	BACKTRACK();
	/* the null entry and already-cached routes are used as they are */
567 	if (rt == &ip6_null_entry ||
568 	    rt->rt6i_flags & RTF_CACHE)
569 		goto out;
570 
	/* keep the route alive while the lock is dropped for allocation */
571 	dst_hold(&rt->u.dst);
572 	read_unlock_bh(&rt6_lock);
573 
574 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
575 		nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
576 	else {
577 #if CLONE_OFFLINK_ROUTE
578 		nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
579 #else
		/* use the matched route directly; the reference taken above is kept */
580 		goto out2;
581 #endif
582 	}
583 
584 	dst_release(&rt->u.dst);
	/* fall back to the null entry if the copy could not be allocated */
585 	rt = nrt ? : &ip6_null_entry;
586 
587 	dst_hold(&rt->u.dst);
588 	if (nrt) {
589 		err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
590 		if (!err)
591 			goto out2;
592 	}
593 
594 	if (--attempts <= 0)
595 		goto out2;
596 
597 	/*
598 	 * Race condition! In the gap, when rt6_lock was
599 	 * released someone could insert this route.  Relookup.
600 	 */
601 	dst_release(&rt->u.dst);
602 	goto relookup;
603 
604 out:
	/* first pass demanded a reachable router; retry once without that */
605 	if (reachable) {
606 		reachable = 0;
607 		goto restart_2;
608 	}
609 	dst_hold(&rt->u.dst);
610 	read_unlock_bh(&rt6_lock);
611 out2:
612 	rt->u.dst.lastuse = jiffies;
613 	rt->u.dst.__use++;
614 	skb->dst = (struct dst_entry *) rt;
615 	return;
616 }
617 
/*
 * ip6_route_output - find a route for a locally generated flow.
 * @sk: socket on whose behalf the lookup is made (not used in this body)
 * @fl: flow description; fl6_dst, fl6_src and oif drive the lookup
 *
 * Same algorithm as ip6_route_input(), keyed on the flow instead of a
 * packet header: select with the reachability requirement first, then
 * without it; create and insert a per-destination copy via
 * rt6_alloc_cow() when the chosen route has no nexthop; retry a limited
 * number of times if the insertion fails.
 *
 * Returns the dst entry of the chosen route (possibly the null entry)
 * with a reference held.
 */
618 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
619 {
620 	struct fib6_node *fn;
621 	struct rt6_info *rt, *nrt;
622 	int strict;
623 	int attempts = 3;
624 	int err;
625 	int reachable = RT6_SELECT_F_REACHABLE;
626 
	/* multicast and link-local destinations require a matching interface */
627 	strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
628 
629 relookup:
630 	read_lock_bh(&rt6_lock);
631 
632 restart_2:
633 	fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
634 
635 restart:
636 	rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
	/* nothing usable at this node: try less specific ancestors */
637 	BACKTRACK();
	/* the null entry and already-cached routes are used as they are */
638 	if (rt == &ip6_null_entry ||
639 	    rt->rt6i_flags & RTF_CACHE)
640 		goto out;
641 
	/* keep the route alive while the lock is dropped for allocation */
642 	dst_hold(&rt->u.dst);
643 	read_unlock_bh(&rt6_lock);
644 
645 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
646 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
647 	else {
648 #if CLONE_OFFLINK_ROUTE
649 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
650 #else
		/* use the matched route directly; the reference taken above is kept */
651 		goto out2;
652 #endif
653 	}
654 
655 	dst_release(&rt->u.dst);
	/* fall back to the null entry if the copy could not be allocated */
656 	rt = nrt ? : &ip6_null_entry;
657 
658 	dst_hold(&rt->u.dst);
659 	if (nrt) {
660 		err = ip6_ins_rt(nrt, NULL, NULL, NULL);
661 		if (!err)
662 			goto out2;
663 	}
664 
665 	if (--attempts <= 0)
666 		goto out2;
667 
668 	/*
669 	 * Race condition! In the gap, when rt6_lock was
670 	 * released someone could insert this route.  Relookup.
671 	 */
672 	dst_release(&rt->u.dst);
673 	goto relookup;
674 
675 out:
	/* first pass demanded a reachable router; retry once without that */
676 	if (reachable) {
677 		reachable = 0;
678 		goto restart_2;
679 	}
680 	dst_hold(&rt->u.dst);
681 	read_unlock_bh(&rt6_lock);
682 out2:
683 	rt->u.dst.lastuse = jiffies;
684 	rt->u.dst.__use++;
685 	return &rt->u.dst;
686 }
687 
688 
689 /*
690  *	Destination cache support functions
691  */
692 
693 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
694 {
695 	struct rt6_info *rt;
696 
697 	rt = (struct rt6_info *) dst;
698 
699 	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
700 		return dst;
701 
702 	return NULL;
703 }
704 
705 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
706 {
707 	struct rt6_info *rt = (struct rt6_info *) dst;
708 
709 	if (rt) {
710 		if (rt->rt6i_flags & RTF_CACHE)
711 			ip6_del_rt(rt, NULL, NULL, NULL);
712 		else
713 			dst_release(dst);
714 	}
715 	return NULL;
716 }
717 
718 static void ip6_link_failure(struct sk_buff *skb)
719 {
720 	struct rt6_info *rt;
721 
722 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
723 
724 	rt = (struct rt6_info *) skb->dst;
725 	if (rt) {
726 		if (rt->rt6i_flags&RTF_CACHE) {
727 			dst_set_expires(&rt->u.dst, 0);
728 			rt->rt6i_flags |= RTF_EXPIRES;
729 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
730 			rt->rt6i_node->fn_sernum = -1;
731 	}
732 }
733 
734 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
735 {
736 	struct rt6_info *rt6 = (struct rt6_info*)dst;
737 
738 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
739 		rt6->rt6i_flags |= RTF_MODIFIED;
740 		if (mtu < IPV6_MIN_MTU) {
741 			mtu = IPV6_MIN_MTU;
742 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
743 		}
744 		dst->metrics[RTAX_MTU-1] = mtu;
745 	}
746 }
747 
748 /*
 * Singly linked list (through dst->next) of the entries created by
 * ndisc_dst_alloc() and reaped by ndisc_dst_gc().
 * Protected by rt6_lock.
 */
749 static struct dst_entry *ndisc_dst_gc_list;
/* Defined further down; needed by ndisc_dst_alloc(). */
750 static int ipv6_get_mtu(struct net_device *dev);
751 
752 static inline unsigned int ipv6_advmss(unsigned int mtu)
753 {
754 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
755 
756 	if (mtu < ip6_rt_min_advmss)
757 		mtu = ip6_rt_min_advmss;
758 
759 	/*
760 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
761 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
762 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
763 	 * rely only on pmtu discovery"
764 	 */
765 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
766 		mtu = IPV6_MAXPLEN;
767 	return mtu;
768 }
769 
/*
 * ndisc_dst_alloc - create a standalone dst entry for neighbour discovery.
 * @dev:    outgoing device; a device reference is taken
 * @neigh:  neighbour to use as nexthop, or NULL to look one up for @addr
 * @addr:   address used for the neighbour lookup when @neigh is NULL
 * @output: output function installed on the new entry
 *
 * The entry starts with a reference count of 1, hop limit 255 and the
 * device's IPv6 MTU.  It is not inserted into the routing table; it is
 * linked on ndisc_dst_gc_list and the FIB garbage collector is kicked.
 *
 * Returns the new dst entry, or NULL when the device has no inet6_dev or
 * the allocation fails.
 */
770 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
771 				  struct neighbour *neigh,
772 				  struct in6_addr *addr,
773 				  int (*output)(struct sk_buff *))
774 {
775 	struct rt6_info *rt;
776 	struct inet6_dev *idev = in6_dev_get(dev);
777 
778 	if (unlikely(idev == NULL))
779 		return NULL;
780 
781 	rt = ip6_dst_alloc();
782 	if (unlikely(rt == NULL)) {
		/* rt is NULL here, so the common exit returns NULL */
783 		in6_dev_put(idev);
784 		goto out;
785 	}
786 
787 	dev_hold(dev);
	/* either take a reference on the caller's neighbour or look one up */
788 	if (neigh)
789 		neigh_hold(neigh);
790 	else
791 		neigh = ndisc_get_neigh(dev, addr);
792 
793 	rt->rt6i_dev	  = dev;
794 	rt->rt6i_idev     = idev;
795 	rt->rt6i_nexthop  = neigh;
796 	atomic_set(&rt->u.dst.__refcnt, 1);
797 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
798 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
799 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
800 	rt->u.dst.output  = output;
801 
802 #if 0	/* there's no chance to use these for ndisc */
803 	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
804 				? DST_HOST
805 				: 0;
806 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
807 	rt->rt6i_dst.plen = 128;
808 #endif
809 
	/* push onto the head of the gc list */
810 	write_lock_bh(&rt6_lock);
811 	rt->u.dst.next = ndisc_dst_gc_list;
812 	ndisc_dst_gc_list = &rt->u.dst;
813 	write_unlock_bh(&rt6_lock);
814 
815 	fib6_force_start_gc();
816 
817 out:
818 	return (struct dst_entry *)rt;
819 }
820 
821 int ndisc_dst_gc(int *more)
822 {
823 	struct dst_entry *dst, *next, **pprev;
824 	int freed;
825 
826 	next = NULL;
827 	pprev = &ndisc_dst_gc_list;
828 	freed = 0;
829 	while ((dst = *pprev) != NULL) {
830 		if (!atomic_read(&dst->__refcnt)) {
831 			*pprev = dst->next;
832 			dst_free(dst);
833 			freed++;
834 		} else {
835 			pprev = &dst->next;
836 			(*more)++;
837 		}
838 	}
839 
840 	return freed;
841 }
842 
843 static int ip6_dst_gc(void)
844 {
845 	static unsigned expire = 30*HZ;
846 	static unsigned long last_gc;
847 	unsigned long now = jiffies;
848 
849 	if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
850 	    atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
851 		goto out;
852 
853 	expire++;
854 	fib6_run_gc(expire);
855 	last_gc = now;
856 	if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
857 		expire = ip6_rt_gc_timeout>>1;
858 
859 out:
860 	expire -= expire>>ip6_rt_gc_elasticity;
861 	return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
862 }
863 
864 /* Clean host part of a prefix. Not necessary in radix tree,
865    but results in cleaner routing tables.
866 
867    Remove it only when all the things will work!
868  */
869 
870 static int ipv6_get_mtu(struct net_device *dev)
871 {
872 	int mtu = IPV6_MIN_MTU;
873 	struct inet6_dev *idev;
874 
875 	idev = in6_dev_get(dev);
876 	if (idev) {
877 		mtu = idev->cnf.mtu6;
878 		in6_dev_put(idev);
879 	}
880 	return mtu;
881 }
882 
883 int ipv6_get_hoplimit(struct net_device *dev)
884 {
885 	int hoplimit = ipv6_devconf.hop_limit;
886 	struct inet6_dev *idev;
887 
888 	idev = in6_dev_get(dev);
889 	if (idev) {
890 		hoplimit = idev->cnf.hop_limit;
891 		in6_dev_put(idev);
892 	}
893 	return hoplimit;
894 }
895 
896 /*
897  *
898  */
899 
/*
 * ip6_route_add - build a route from a request and insert it.
 * @rtmsg:   route description (prefixes, gateway, ifindex, metric, flags)
 * @nlh:     netlink header of the request, or NULL; supplies the protocol
 * @_rtattr: array of struct rtattr pointers, or NULL; RTA_METRICS is read
 * @req:     netlink request parameters, passed through to ip6_ins_rt()
 *
 * Reject routes, and routes through a loopback device to a non-loopback
 * destination, are installed as reject routes on the loopback device.
 * Gateway routes whose gateway is not a link-local unicast address are
 * accepted only if the gateway is itself reachable through a non-gateway
 * route on the same device.
 *
 * Returns the result of ip6_ins_rt() once the route is built, or a
 * negative errno (-EINVAL, -ENODEV, -ENOMEM, -EHOSTUNREACH, or the
 * neighbour lookup error); on those error paths the device, inet6_dev
 * and route taken so far are released.
 */
900 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
901 		void *_rtattr, struct netlink_skb_parms *req)
902 {
903 	int err;
904 	struct rtmsg *r;
905 	struct rtattr **rta;
906 	struct rt6_info *rt = NULL;
907 	struct net_device *dev = NULL;
908 	struct inet6_dev *idev = NULL;
909 	int addr_type;
910 
911 	rta = (struct rtattr **) _rtattr;
912 
913 	if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
914 		return -EINVAL;
	/* source-specific routes need CONFIG_IPV6_SUBTREES */
915 #ifndef CONFIG_IPV6_SUBTREES
916 	if (rtmsg->rtmsg_src_len)
917 		return -EINVAL;
918 #endif
	/* resolve the requested interface; both references are dropped at "out" */
919 	if (rtmsg->rtmsg_ifindex) {
920 		err = -ENODEV;
921 		dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
922 		if (!dev)
923 			goto out;
924 		idev = in6_dev_get(dev);
925 		if (!idev)
926 			goto out;
927 	}
928 
	/* a zero metric selects the default user priority */
929 	if (rtmsg->rtmsg_metric == 0)
930 		rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
931 
932 	rt = ip6_dst_alloc();
933 
934 	if (rt == NULL) {
935 		err = -ENOMEM;
936 		goto out;
937 	}
938 
939 	rt->u.dst.obsolete = -1;
940 	rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
941 	if (nlh && (r = NLMSG_DATA(nlh))) {
942 		rt->rt6i_protocol = r->rtm_protocol;
943 	} else {
944 		rt->rt6i_protocol = RTPROT_BOOT;
945 	}
946 
947 	addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
948 
949 	if (addr_type & IPV6_ADDR_MULTICAST)
950 		rt->u.dst.input = ip6_mc_input;
951 	else
952 		rt->u.dst.input = ip6_forward;
953 
954 	rt->u.dst.output = ip6_output;
955 
	/* store the destination prefix trimmed to its prefix length */
956 	ipv6_addr_prefix(&rt->rt6i_dst.addr,
957 			 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
958 	rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
959 	if (rt->rt6i_dst.plen == 128)
960 	       rt->u.dst.flags = DST_HOST;
961 
962 #ifdef CONFIG_IPV6_SUBTREES
963 	ipv6_addr_prefix(&rt->rt6i_src.addr,
964 			 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
965 	rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
966 #endif
967 
968 	rt->rt6i_metric = rtmsg->rtmsg_metric;
969 
970 	/* We cannot add true routes via loopback here,
971 	   they would result in kernel looping; promote them to reject routes
972 	 */
973 	if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
974 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
975 		/* hold loopback dev/idev if we haven't done so. */
976 		if (dev != &loopback_dev) {
977 			if (dev) {
978 				dev_put(dev);
979 				in6_dev_put(idev);
980 			}
981 			dev = &loopback_dev;
982 			dev_hold(dev);
983 			idev = in6_dev_get(dev);
984 			if (!idev) {
985 				err = -ENODEV;
986 				goto out;
987 			}
988 		}
989 		rt->u.dst.output = ip6_pkt_discard_out;
990 		rt->u.dst.input = ip6_pkt_discard;
991 		rt->u.dst.error = -ENETUNREACH;
992 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
993 		goto install_route;
994 	}
995 
996 	if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
997 		struct in6_addr *gw_addr;
998 		int gwa_type;
999 
1000 		gw_addr = &rtmsg->rtmsg_gateway;
1001 		ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
1002 		gwa_type = ipv6_addr_type(gw_addr);
1003 
1004 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1005 			struct rt6_info *grt;
1006 
1007 			/* IPv6 strictly inhibits using not link-local
1008 			   addresses as nexthop address.
1009 			   Otherwise, router will not able to send redirects.
1010 			   It is very good, but in some (rare!) circumstances
1011 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1012 			   some exceptions. --ANK
1013 			 */
1014 			err = -EINVAL;
1015 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1016 				goto out;
1017 
			/* the gateway itself must be reachable (strict interface match) */
1018 			grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1019 
1020 			err = -EHOSTUNREACH;
1021 			if (grt == NULL)
1022 				goto out;
1023 			if (dev) {
1024 				if (dev != grt->rt6i_dev) {
1025 					dst_release(&grt->u.dst);
1026 					goto out;
1027 				}
1028 			} else {
				/* no interface requested: adopt the one leading to the gateway */
1029 				dev = grt->rt6i_dev;
1030 				idev = grt->rt6i_idev;
1031 				dev_hold(dev);
1032 				in6_dev_hold(grt->rt6i_idev);
1033 			}
			/* accept only if the gateway is reached without another gateway */
1034 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1035 				err = 0;
1036 			dst_release(&grt->u.dst);
1037 
1038 			if (err)
1039 				goto out;
1040 		}
1041 		err = -EINVAL;
1042 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1043 			goto out;
1044 	}
1045 
1046 	err = -ENODEV;
1047 	if (dev == NULL)
1048 		goto out;
1049 
	/* bind a neighbour entry for the gateway address on this device */
1050 	if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1051 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1052 		if (IS_ERR(rt->rt6i_nexthop)) {
1053 			err = PTR_ERR(rt->rt6i_nexthop);
1054 			rt->rt6i_nexthop = NULL;
1055 			goto out;
1056 		}
1057 	}
1058 
1059 	rt->rt6i_flags = rtmsg->rtmsg_flags;
1060 
1061 install_route:
	/* copy caller-supplied metrics; a type above RTAX_MAX is rejected */
1062 	if (rta && rta[RTA_METRICS-1]) {
1063 		int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1064 		struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1065 
1066 		while (RTA_OK(attr, attrlen)) {
1067 			unsigned flavor = attr->rta_type;
1068 			if (flavor) {
1069 				if (flavor > RTAX_MAX) {
1070 					err = -EINVAL;
1071 					goto out;
1072 				}
1073 				rt->u.dst.metrics[flavor-1] =
1074 					*(u32 *)RTA_DATA(attr);
1075 			}
1076 			attr = RTA_NEXT(attr, attrlen);
1077 		}
1078 	}
1079 
	/* fill in defaults for metrics the caller left at zero */
1080 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1081 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1082 	if (!rt->u.dst.metrics[RTAX_MTU-1])
1083 		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1084 	if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1085 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
	/* the route now owns the dev and idev references */
1086 	rt->u.dst.dev = dev;
1087 	rt->rt6i_idev = idev;
1088 	return ip6_ins_rt(rt, nlh, _rtattr, req);
1089 
1090 out:
1091 	if (dev)
1092 		dev_put(dev);
1093 	if (idev)
1094 		in6_dev_put(idev);
1095 	if (rt)
1096 		dst_free((struct dst_entry *) rt);
1097 	return err;
1098 }
1099 
1100 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1101 {
1102 	int err;
1103 
1104 	write_lock_bh(&rt6_lock);
1105 
1106 	err = fib6_del(rt, nlh, _rtattr, req);
1107 	dst_release(&rt->u.dst);
1108 
1109 	write_unlock_bh(&rt6_lock);
1110 
1111 	return err;
1112 }
1113 
1114 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1115 {
1116 	struct fib6_node *fn;
1117 	struct rt6_info *rt;
1118 	int err = -ESRCH;
1119 
1120 	read_lock_bh(&rt6_lock);
1121 
1122 	fn = fib6_locate(&ip6_routing_table,
1123 			 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1124 			 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1125 
1126 	if (fn) {
1127 		for (rt = fn->leaf; rt; rt = rt->u.next) {
1128 			if (rtmsg->rtmsg_ifindex &&
1129 			    (rt->rt6i_dev == NULL ||
1130 			     rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1131 				continue;
1132 			if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1133 			    !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1134 				continue;
1135 			if (rtmsg->rtmsg_metric &&
1136 			    rtmsg->rtmsg_metric != rt->rt6i_metric)
1137 				continue;
1138 			dst_hold(&rt->u.dst);
1139 			read_unlock_bh(&rt6_lock);
1140 
1141 			return ip6_del_rt(rt, nlh, _rtattr, req);
1142 		}
1143 	}
1144 	read_unlock_bh(&rt6_lock);
1145 
1146 	return err;
1147 }
1148 
1149 /*
1150  *	Handle redirects
1151  */
1152 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1153 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1154 {
1155 	struct rt6_info *rt, *nrt = NULL;
1156 	int strict;
1157 	struct fib6_node *fn;
1158 
1159 	/*
1160 	 * Get the "current" route for this destination and
1161 	 * check if the redirect has come from appropriate router.
1162 	 *
1163 	 * RFC 2461 specifies that redirects should only be
1164 	 * accepted if they come from the nexthop to the target.
1165 	 * Due to the way the routes are chosen, this notion
1166 	 * is a bit fuzzy and one might need to check all possible
1167 	 * routes.
1168 	 */
1169 	strict = ipv6_addr_type(dest) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL);
1170 
1171 	read_lock_bh(&rt6_lock);
1172 	fn = fib6_lookup(&ip6_routing_table, dest, NULL);
1173 restart:
1174 	for (rt = fn->leaf; rt; rt = rt->u.next) {
1175 		/*
1176 		 * Current route is on-link; redirect is always invalid.
1177 		 *
1178 		 * Seems, previous statement is not true. It could
1179 		 * be node, which looks for us as on-link (f.e. proxy ndisc)
1180 		 * But then router serving it might decide, that we should
1181 		 * know truth 8)8) --ANK (980726).
1182 		 */
1183 		if (rt6_check_expired(rt))
1184 			continue;
1185 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1186 			continue;
1187 		if (neigh->dev != rt->rt6i_dev)
1188 			continue;
1189 		if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1190 			continue;
1191 		break;
1192 	}
1193 	if (rt)
1194 		dst_hold(&rt->u.dst);
1195 	else if (strict) {
1196 		while ((fn = fn->parent) != NULL) {
1197 			if (fn->fn_flags & RTN_ROOT)
1198 				break;
1199 			if (fn->fn_flags & RTN_RTINFO)
1200 				goto restart;
1201 		}
1202 	}
1203 	read_unlock_bh(&rt6_lock);
1204 
1205 	if (!rt) {
1206 		if (net_ratelimit())
1207 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1208 			       "for redirect target\n");
1209 		return;
1210 	}
1211 
1212 	/*
1213 	 *	We have finally decided to accept it.
1214 	 */
1215 
1216 	neigh_update(neigh, lladdr, NUD_STALE,
1217 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1218 		     NEIGH_UPDATE_F_OVERRIDE|
1219 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1220 				     NEIGH_UPDATE_F_ISROUTER))
1221 		     );
1222 
1223 	/*
1224 	 * Redirect received -> path was valid.
1225 	 * Look, redirects are sent only in response to data packets,
1226 	 * so that this nexthop apparently is reachable. --ANK
1227 	 */
1228 	dst_confirm(&rt->u.dst);
1229 
1230 	/* Duplicate redirect: silently ignore. */
1231 	if (neigh == rt->u.dst.neighbour)
1232 		goto out;
1233 
1234 	nrt = ip6_rt_copy(rt);
1235 	if (nrt == NULL)
1236 		goto out;
1237 
1238 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1239 	if (on_link)
1240 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1241 
1242 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1243 	nrt->rt6i_dst.plen = 128;
1244 	nrt->u.dst.flags |= DST_HOST;
1245 
1246 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1247 	nrt->rt6i_nexthop = neigh_clone(neigh);
1248 	/* Reset pmtu, it may be better */
1249 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1250 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1251 
1252 	if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1253 		goto out;
1254 
1255 	if (rt->rt6i_flags&RTF_CACHE) {
1256 		ip6_del_rt(rt, NULL, NULL, NULL);
1257 		return;
1258 	}
1259 
1260 out:
1261         dst_release(&rt->u.dst);
1262 	return;
1263 }
1264 
1265 /*
1266  *	Handle ICMP "packet too big" messages
1267  *	i.e. Path MTU discovery
1268  */
1269 
1270 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1271 			struct net_device *dev, u32 pmtu)
1272 {
1273 	struct rt6_info *rt, *nrt;
1274 	int allfrag = 0;
1275 
1276 	rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1277 	if (rt == NULL)
1278 		return;
1279 
1280 	if (pmtu >= dst_mtu(&rt->u.dst))
1281 		goto out;
1282 
1283 	if (pmtu < IPV6_MIN_MTU) {
1284 		/*
1285 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1286 		 * MTU (1280) and a fragment header should always be included
1287 		 * after a node receiving Too Big message reporting PMTU is
1288 		 * less than the IPv6 Minimum Link MTU.
1289 		 */
1290 		pmtu = IPV6_MIN_MTU;
1291 		allfrag = 1;
1292 	}
1293 
1294 	/* New mtu received -> path was valid.
1295 	   They are sent only in response to data packets,
1296 	   so that this nexthop apparently is reachable. --ANK
1297 	 */
1298 	dst_confirm(&rt->u.dst);
1299 
1300 	/* Host route. If it is static, it would be better
1301 	   not to override it, but add new one, so that
1302 	   when cache entry will expire old pmtu
1303 	   would return automatically.
1304 	 */
1305 	if (rt->rt6i_flags & RTF_CACHE) {
1306 		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1307 		if (allfrag)
1308 			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1309 		dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1310 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1311 		goto out;
1312 	}
1313 
1314 	/* Network route.
1315 	   Two cases are possible:
1316 	   1. It is connected route. Action: COW
1317 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1318 	 */
1319 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1320 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1321 	else
1322 		nrt = rt6_alloc_clone(rt, daddr);
1323 
1324 	if (nrt) {
1325 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1326 		if (allfrag)
1327 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1328 
1329 		/* According to RFC 1981, detecting PMTU increase shouldn't be
1330 		 * happened within 5 mins, the recommended timer is 10 mins.
1331 		 * Here this route expiration time is set to ip6_rt_mtu_expires
1332 		 * which is 10 mins. After 10 mins the decreased pmtu is expired
1333 		 * and detecting PMTU increase will be automatically happened.
1334 		 */
1335 		dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1336 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1337 
1338 		ip6_ins_rt(nrt, NULL, NULL, NULL);
1339 	}
1340 out:
1341 	dst_release(&rt->u.dst);
1342 }
1343 
1344 /*
1345  *	Misc support functions
1346  */
1347 
1348 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1349 {
1350 	struct rt6_info *rt = ip6_dst_alloc();
1351 
1352 	if (rt) {
1353 		rt->u.dst.input = ort->u.dst.input;
1354 		rt->u.dst.output = ort->u.dst.output;
1355 
1356 		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1357 		rt->u.dst.dev = ort->u.dst.dev;
1358 		if (rt->u.dst.dev)
1359 			dev_hold(rt->u.dst.dev);
1360 		rt->rt6i_idev = ort->rt6i_idev;
1361 		if (rt->rt6i_idev)
1362 			in6_dev_hold(rt->rt6i_idev);
1363 		rt->u.dst.lastuse = jiffies;
1364 		rt->rt6i_expires = 0;
1365 
1366 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1367 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1368 		rt->rt6i_metric = 0;
1369 
1370 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1371 #ifdef CONFIG_IPV6_SUBTREES
1372 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1373 #endif
1374 	}
1375 	return rt;
1376 }
1377 
1378 #ifdef CONFIG_IPV6_ROUTE_INFO
1379 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1380 					   struct in6_addr *gwaddr, int ifindex)
1381 {
1382 	struct fib6_node *fn;
1383 	struct rt6_info *rt = NULL;
1384 
1385 	write_lock_bh(&rt6_lock);
1386 	fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1387 	if (!fn)
1388 		goto out;
1389 
1390 	for (rt = fn->leaf; rt; rt = rt->u.next) {
1391 		if (rt->rt6i_dev->ifindex != ifindex)
1392 			continue;
1393 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1394 			continue;
1395 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1396 			continue;
1397 		dst_hold(&rt->u.dst);
1398 		break;
1399 	}
1400 out:
1401 	write_unlock_bh(&rt6_lock);
1402 	return rt;
1403 }
1404 
1405 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1406 					   struct in6_addr *gwaddr, int ifindex,
1407 					   unsigned pref)
1408 {
1409 	struct in6_rtmsg rtmsg;
1410 
1411 	memset(&rtmsg, 0, sizeof(rtmsg));
1412 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1413 	ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1414 	rtmsg.rtmsg_dst_len = prefixlen;
1415 	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1416 	rtmsg.rtmsg_metric = 1024;
1417 	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1418 	/* We should treat it as a default route if prefix length is 0. */
1419 	if (!prefixlen)
1420 		rtmsg.rtmsg_flags |= RTF_DEFAULT;
1421 	rtmsg.rtmsg_ifindex = ifindex;
1422 
1423 	ip6_route_add(&rtmsg, NULL, NULL, NULL);
1424 
1425 	return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1426 }
1427 #endif
1428 
1429 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1430 {
1431 	struct rt6_info *rt;
1432 	struct fib6_node *fn;
1433 
1434 	fn = &ip6_routing_table;
1435 
1436 	write_lock_bh(&rt6_lock);
1437 	for (rt = fn->leaf; rt; rt=rt->u.next) {
1438 		if (dev == rt->rt6i_dev &&
1439 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1440 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1441 			break;
1442 	}
1443 	if (rt)
1444 		dst_hold(&rt->u.dst);
1445 	write_unlock_bh(&rt6_lock);
1446 	return rt;
1447 }
1448 
1449 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1450 				     struct net_device *dev,
1451 				     unsigned int pref)
1452 {
1453 	struct in6_rtmsg rtmsg;
1454 
1455 	memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1456 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1457 	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1458 	rtmsg.rtmsg_metric = 1024;
1459 	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1460 			    RTF_PREF(pref);
1461 
1462 	rtmsg.rtmsg_ifindex = dev->ifindex;
1463 
1464 	ip6_route_add(&rtmsg, NULL, NULL, NULL);
1465 	return rt6_get_dflt_router(gwaddr, dev);
1466 }
1467 
1468 void rt6_purge_dflt_routers(void)
1469 {
1470 	struct rt6_info *rt;
1471 
1472 restart:
1473 	read_lock_bh(&rt6_lock);
1474 	for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1475 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1476 			dst_hold(&rt->u.dst);
1477 
1478 			read_unlock_bh(&rt6_lock);
1479 
1480 			ip6_del_rt(rt, NULL, NULL, NULL);
1481 
1482 			goto restart;
1483 		}
1484 	}
1485 	read_unlock_bh(&rt6_lock);
1486 }
1487 
1488 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1489 {
1490 	struct in6_rtmsg rtmsg;
1491 	int err;
1492 
1493 	switch(cmd) {
1494 	case SIOCADDRT:		/* Add a route */
1495 	case SIOCDELRT:		/* Delete a route */
1496 		if (!capable(CAP_NET_ADMIN))
1497 			return -EPERM;
1498 		err = copy_from_user(&rtmsg, arg,
1499 				     sizeof(struct in6_rtmsg));
1500 		if (err)
1501 			return -EFAULT;
1502 
1503 		rtnl_lock();
1504 		switch (cmd) {
1505 		case SIOCADDRT:
1506 			err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1507 			break;
1508 		case SIOCDELRT:
1509 			err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1510 			break;
1511 		default:
1512 			err = -EINVAL;
1513 		}
1514 		rtnl_unlock();
1515 
1516 		return err;
1517 	};
1518 
1519 	return -EINVAL;
1520 }
1521 
1522 /*
1523  *	Drop the packet on the floor
1524  */
1525 
1526 static int ip6_pkt_discard(struct sk_buff *skb)
1527 {
1528 	IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1529 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1530 	kfree_skb(skb);
1531 	return 0;
1532 }
1533 
1534 static int ip6_pkt_discard_out(struct sk_buff *skb)
1535 {
1536 	skb->dev = skb->dst->dev;
1537 	return ip6_pkt_discard(skb);
1538 }
1539 
1540 /*
1541  *	Allocate a dst for local (unicast / anycast) address.
1542  */
1543 
1544 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1545 				    const struct in6_addr *addr,
1546 				    int anycast)
1547 {
1548 	struct rt6_info *rt = ip6_dst_alloc();
1549 
1550 	if (rt == NULL)
1551 		return ERR_PTR(-ENOMEM);
1552 
1553 	dev_hold(&loopback_dev);
1554 	in6_dev_hold(idev);
1555 
1556 	rt->u.dst.flags = DST_HOST;
1557 	rt->u.dst.input = ip6_input;
1558 	rt->u.dst.output = ip6_output;
1559 	rt->rt6i_dev = &loopback_dev;
1560 	rt->rt6i_idev = idev;
1561 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1562 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1563 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1564 	rt->u.dst.obsolete = -1;
1565 
1566 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1567 	if (anycast)
1568 		rt->rt6i_flags |= RTF_ANYCAST;
1569 	else
1570 		rt->rt6i_flags |= RTF_LOCAL;
1571 	rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1572 	if (rt->rt6i_nexthop == NULL) {
1573 		dst_free((struct dst_entry *) rt);
1574 		return ERR_PTR(-ENOMEM);
1575 	}
1576 
1577 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1578 	rt->rt6i_dst.plen = 128;
1579 
1580 	atomic_set(&rt->u.dst.__refcnt, 1);
1581 
1582 	return rt;
1583 }
1584 
1585 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1586 {
1587 	if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1588 	    rt != &ip6_null_entry) {
1589 		RT6_TRACE("deleted by ifdown %p\n", rt);
1590 		return -1;
1591 	}
1592 	return 0;
1593 }
1594 
1595 void rt6_ifdown(struct net_device *dev)
1596 {
1597 	write_lock_bh(&rt6_lock);
1598 	fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1599 	write_unlock_bh(&rt6_lock);
1600 }
1601 
/* Argument handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU of that device */
};
1607 
1608 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1609 {
1610 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1611 	struct inet6_dev *idev;
1612 
1613 	/* In IPv6 pmtu discovery is not optional,
1614 	   so that RTAX_MTU lock cannot disable it.
1615 	   We still use this lock to block changes
1616 	   caused by addrconf/ndisc.
1617 	*/
1618 
1619 	idev = __in6_dev_get(arg->dev);
1620 	if (idev == NULL)
1621 		return 0;
1622 
1623 	/* For administrative MTU increase, there is no way to discover
1624 	   IPv6 PMTU increase, so PMTU increase should be updated here.
1625 	   Since RFC 1981 doesn't include administrative MTU increase
1626 	   update PMTU increase is a MUST. (i.e. jumbo frame)
1627 	 */
1628 	/*
1629 	   If new MTU is less than route PMTU, this new MTU will be the
1630 	   lowest MTU in the path, update the route PMTU to reflect PMTU
1631 	   decreases; if new MTU is greater than route PMTU, and the
1632 	   old MTU is the lowest MTU in the path, update the route PMTU
1633 	   to reflect the increase. In this case if the other nodes' MTU
1634 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
1635 	   PMTU discouvery.
1636 	 */
1637 	if (rt->rt6i_dev == arg->dev &&
1638 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1639             (dst_mtu(&rt->u.dst) > arg->mtu ||
1640              (dst_mtu(&rt->u.dst) < arg->mtu &&
1641 	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1642 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1643 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1644 	return 0;
1645 }
1646 
1647 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1648 {
1649 	struct rt6_mtu_change_arg arg;
1650 
1651 	arg.dev = dev;
1652 	arg.mtu = mtu;
1653 	read_lock_bh(&rt6_lock);
1654 	fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1655 	read_unlock_bh(&rt6_lock);
1656 }
1657 
1658 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1659 			      struct in6_rtmsg *rtmsg)
1660 {
1661 	memset(rtmsg, 0, sizeof(*rtmsg));
1662 
1663 	rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1664 	rtmsg->rtmsg_src_len = r->rtm_src_len;
1665 	rtmsg->rtmsg_flags = RTF_UP;
1666 	if (r->rtm_type == RTN_UNREACHABLE)
1667 		rtmsg->rtmsg_flags |= RTF_REJECT;
1668 
1669 	if (rta[RTA_GATEWAY-1]) {
1670 		if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1671 			return -EINVAL;
1672 		memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1673 		rtmsg->rtmsg_flags |= RTF_GATEWAY;
1674 	}
1675 	if (rta[RTA_DST-1]) {
1676 		if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1677 			return -EINVAL;
1678 		memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1679 	}
1680 	if (rta[RTA_SRC-1]) {
1681 		if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1682 			return -EINVAL;
1683 		memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1684 	}
1685 	if (rta[RTA_OIF-1]) {
1686 		if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1687 			return -EINVAL;
1688 		memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1689 	}
1690 	if (rta[RTA_PRIORITY-1]) {
1691 		if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1692 			return -EINVAL;
1693 		memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1694 	}
1695 	return 0;
1696 }
1697 
1698 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1699 {
1700 	struct rtmsg *r = NLMSG_DATA(nlh);
1701 	struct in6_rtmsg rtmsg;
1702 
1703 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1704 		return -EINVAL;
1705 	return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1706 }
1707 
1708 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1709 {
1710 	struct rtmsg *r = NLMSG_DATA(nlh);
1711 	struct in6_rtmsg rtmsg;
1712 
1713 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1714 		return -EINVAL;
1715 	return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1716 }
1717 
/* Per-call context passed through the fib walker to rt6_dump_route(). */
struct rt6_rtnl_dump_arg
{
	struct sk_buff *skb;		/* reply buffer being filled */
	struct netlink_callback *cb;	/* the dump request's callback state */
};
1723 
/*
 *	Append one route message describing @rt to @skb.
 *
 *	@dst/@src:  when non-NULL they are emitted as RTA_DST/RTA_SRC and
 *		    the corresponding length in the header is forced to 128;
 *		    otherwise the route's own keys are used (and only
 *		    emitted when their prefix length is non-zero).
 *	@iif:	    emitted as RTA_IIF when non-zero; when zero and @dst is
 *		    given, a preferred source address is looked up instead.
 *	@type, @pid, @seq, @flags: passed through to NLMSG_NEW().
 *	@prefix:    non-zero restricts output to RTF_PREFIX_RT routes.
 *
 *	Returns 1 when the route is filtered out by @prefix, skb->len when
 *	the message was added, and -1 after trimming @skb back to its
 *	original length on failure.
 *
 *	NOTE(review): NLMSG_NEW() and RTA_PUT() presumably jump to the
 *	nlmsg_failure / rtattr_failure labels when @skb runs out of room
 *	(the labels have only one visible goto) -- confirm against the
 *	macro definitions.
 */
static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 pid, u32 seq,
			 int prefix, unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr  *nlh;
	/* Start of this message; used for the length and for rollback. */
	unsigned char	 *b = skb->tail;
	struct rta_cacheinfo ci;

	if (prefix) {	/* user wants prefix routes only */
		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
	rtm = NLMSG_DATA(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	rtm->rtm_table = RT_TABLE_MAIN;
	/* Route type: reject routes first, then loopback-device routes. */
	if (rt->rt6i_flags&RTF_REJECT)
		rtm->rtm_type = RTN_UNREACHABLE;
	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	/* The stored protocol is overridden by the first matching flag. */
	rtm->rtm_protocol = rt->rt6i_protocol;
	if (rt->rt6i_flags&RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (rt->rt6i_flags & RTF_ADDRCONF)
		rtm->rtm_protocol = RTPROT_KERNEL;
	else if (rt->rt6i_flags&RTF_DEFAULT)
		rtm->rtm_protocol = RTPROT_RA;

	if (rt->rt6i_flags&RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* An explicit destination is reported as a full /128. */
	if (dst) {
		RTA_PUT(skb, RTA_DST, 16, dst);
	        rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		RTA_PUT(skb, RTA_SRC, 16, src);
	        rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len)
		RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
#endif
	if (iif)
		RTA_PUT(skb, RTA_IIF, 4, &iif);
	else if (dst) {
		/* No input interface: report the source address to use. */
		struct in6_addr saddr_buf;
		if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
			RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
	}
	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto rtattr_failure;
	/* The gateway is taken from the attached neighbour entry, if any. */
	if (rt->u.dst.neighbour)
		RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
	if (rt->u.dst.dev)
		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
	RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
	/* Cache info: times are relative to now, converted from jiffies. */
	ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
	if (rt->rt6i_expires)
		ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
	else
		ci.rta_expires = 0;
	ci.rta_used = rt->u.dst.__use;
	ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
	ci.rta_error = rt->u.dst.error;
	ci.rta_id = 0;
	ci.rta_ts = 0;
	ci.rta_tsage = 0;
	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
	/* Final length covers the header plus every attribute added above. */
	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	/* Drop the partially built message again. */
	skb_trim(skb, b - skb->data);
	return -1;
}
1813 
1814 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1815 {
1816 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1817 	int prefix;
1818 
1819 	if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1820 		struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1821 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1822 	} else
1823 		prefix = 0;
1824 
1825 	return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1826 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1827 		     prefix, NLM_F_MULTI);
1828 }
1829 
1830 static int fib6_dump_node(struct fib6_walker_t *w)
1831 {
1832 	int res;
1833 	struct rt6_info *rt;
1834 
1835 	for (rt = w->leaf; rt; rt = rt->u.next) {
1836 		res = rt6_dump_route(rt, w->args);
1837 		if (res < 0) {
1838 			/* Frame is full, suspend walking */
1839 			w->leaf = rt;
1840 			return 1;
1841 		}
1842 		BUG_TRAP(res!=0);
1843 	}
1844 	w->leaf = NULL;
1845 	return 0;
1846 }
1847 
1848 static void fib6_dump_end(struct netlink_callback *cb)
1849 {
1850 	struct fib6_walker_t *w = (void*)cb->args[0];
1851 
1852 	if (w) {
1853 		cb->args[0] = 0;
1854 		fib6_walker_unlink(w);
1855 		kfree(w);
1856 	}
1857 	cb->done = (void*)cb->args[1];
1858 	cb->args[1] = 0;
1859 }
1860 
1861 static int fib6_dump_done(struct netlink_callback *cb)
1862 {
1863 	fib6_dump_end(cb);
1864 	return cb->done ? cb->done(cb) : 0;
1865 }
1866 
1867 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1868 {
1869 	struct rt6_rtnl_dump_arg arg;
1870 	struct fib6_walker_t *w;
1871 	int res;
1872 
1873 	arg.skb = skb;
1874 	arg.cb = cb;
1875 
1876 	w = (void*)cb->args[0];
1877 	if (w == NULL) {
1878 		/* New dump:
1879 		 *
1880 		 * 1. hook callback destructor.
1881 		 */
1882 		cb->args[1] = (long)cb->done;
1883 		cb->done = fib6_dump_done;
1884 
1885 		/*
1886 		 * 2. allocate and initialize walker.
1887 		 */
1888 		w = kzalloc(sizeof(*w), GFP_ATOMIC);
1889 		if (w == NULL)
1890 			return -ENOMEM;
1891 		RT6_TRACE("dump<%p", w);
1892 		w->root = &ip6_routing_table;
1893 		w->func = fib6_dump_node;
1894 		w->args = &arg;
1895 		cb->args[0] = (long)w;
1896 		read_lock_bh(&rt6_lock);
1897 		res = fib6_walk(w);
1898 		read_unlock_bh(&rt6_lock);
1899 	} else {
1900 		w->args = &arg;
1901 		read_lock_bh(&rt6_lock);
1902 		res = fib6_walk_continue(w);
1903 		read_unlock_bh(&rt6_lock);
1904 	}
1905 #if RT6_DEBUG >= 3
1906 	if (res <= 0 && skb->len == 0)
1907 		RT6_TRACE("%p>dump end\n", w);
1908 #endif
1909 	res = res < 0 ? res : skb->len;
1910 	/* res < 0 is an error. (really, impossible)
1911 	   res == 0 means that dump is complete, but skb still can contain data.
1912 	   res > 0 dump is not complete, but frame is full.
1913 	 */
1914 	/* Destroy walker, if dump of this table is complete. */
1915 	if (res <= 0)
1916 		fib6_dump_end(cb);
1917 	return res;
1918 }
1919 
1920 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1921 {
1922 	struct rtattr **rta = arg;
1923 	int iif = 0;
1924 	int err = -ENOBUFS;
1925 	struct sk_buff *skb;
1926 	struct flowi fl;
1927 	struct rt6_info *rt;
1928 
1929 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1930 	if (skb == NULL)
1931 		goto out;
1932 
1933 	/* Reserve room for dummy headers, this skb can pass
1934 	   through good chunk of routing engine.
1935 	 */
1936 	skb->mac.raw = skb->data;
1937 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1938 
1939 	memset(&fl, 0, sizeof(fl));
1940 	if (rta[RTA_SRC-1])
1941 		ipv6_addr_copy(&fl.fl6_src,
1942 			       (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1943 	if (rta[RTA_DST-1])
1944 		ipv6_addr_copy(&fl.fl6_dst,
1945 			       (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1946 
1947 	if (rta[RTA_IIF-1])
1948 		memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1949 
1950 	if (iif) {
1951 		struct net_device *dev;
1952 		dev = __dev_get_by_index(iif);
1953 		if (!dev) {
1954 			err = -ENODEV;
1955 			goto out_free;
1956 		}
1957 	}
1958 
1959 	fl.oif = 0;
1960 	if (rta[RTA_OIF-1])
1961 		memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1962 
1963 	rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1964 
1965 	skb->dst = &rt->u.dst;
1966 
1967 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1968 	err = rt6_fill_node(skb, rt,
1969 			    &fl.fl6_dst, &fl.fl6_src,
1970 			    iif,
1971 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1972 			    nlh->nlmsg_seq, 0, 0);
1973 	if (err < 0) {
1974 		err = -EMSGSIZE;
1975 		goto out_free;
1976 	}
1977 
1978 	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1979 	if (err > 0)
1980 		err = 0;
1981 out:
1982 	return err;
1983 out_free:
1984 	kfree_skb(skb);
1985 	goto out;
1986 }
1987 
1988 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
1989 			struct netlink_skb_parms *req)
1990 {
1991 	struct sk_buff *skb;
1992 	int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
1993 	u32 pid = current->pid;
1994 	u32 seq = 0;
1995 
1996 	if (req)
1997 		pid = req->pid;
1998 	if (nlh)
1999 		seq = nlh->nlmsg_seq;
2000 
2001 	skb = alloc_skb(size, gfp_any());
2002 	if (!skb) {
2003 		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
2004 		return;
2005 	}
2006 	if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
2007 		kfree_skb(skb);
2008 		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
2009 		return;
2010 	}
2011 	NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2012 	netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2013 }
2014 
2015 /*
2016  *	/proc
2017  */
2018 
2019 #ifdef CONFIG_PROC_FS
2020 
/* Record size used by the offset arithmetic of the ipv6_route proc file. */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg
{
	char *buffer;	/* output buffer */
	int offset;	/* read offset requested by the caller */
	int length;	/* number of bytes requested by the caller */
	int skip;	/* records skipped so far to reach the offset */
	int len;	/* bytes written into buffer so far */
};
2031 
2032 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2033 {
2034 	struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2035 	int i;
2036 
2037 	if (arg->skip < arg->offset / RT6_INFO_LEN) {
2038 		arg->skip++;
2039 		return 0;
2040 	}
2041 
2042 	if (arg->len >= arg->length)
2043 		return 0;
2044 
2045 	for (i=0; i<16; i++) {
2046 		sprintf(arg->buffer + arg->len, "%02x",
2047 			rt->rt6i_dst.addr.s6_addr[i]);
2048 		arg->len += 2;
2049 	}
2050 	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2051 			    rt->rt6i_dst.plen);
2052 
2053 #ifdef CONFIG_IPV6_SUBTREES
2054 	for (i=0; i<16; i++) {
2055 		sprintf(arg->buffer + arg->len, "%02x",
2056 			rt->rt6i_src.addr.s6_addr[i]);
2057 		arg->len += 2;
2058 	}
2059 	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2060 			    rt->rt6i_src.plen);
2061 #else
2062 	sprintf(arg->buffer + arg->len,
2063 		"00000000000000000000000000000000 00 ");
2064 	arg->len += 36;
2065 #endif
2066 
2067 	if (rt->rt6i_nexthop) {
2068 		for (i=0; i<16; i++) {
2069 			sprintf(arg->buffer + arg->len, "%02x",
2070 				rt->rt6i_nexthop->primary_key[i]);
2071 			arg->len += 2;
2072 		}
2073 	} else {
2074 		sprintf(arg->buffer + arg->len,
2075 			"00000000000000000000000000000000");
2076 		arg->len += 32;
2077 	}
2078 	arg->len += sprintf(arg->buffer + arg->len,
2079 			    " %08x %08x %08x %08x %8s\n",
2080 			    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2081 			    rt->u.dst.__use, rt->rt6i_flags,
2082 			    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2083 	return 0;
2084 }
2085 
2086 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2087 {
2088 	struct rt6_proc_arg arg;
2089 	arg.buffer = buffer;
2090 	arg.offset = offset;
2091 	arg.length = length;
2092 	arg.skip = 0;
2093 	arg.len = 0;
2094 
2095 	read_lock_bh(&rt6_lock);
2096 	fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2097 	read_unlock_bh(&rt6_lock);
2098 
2099 	*start = buffer;
2100 	if (offset)
2101 		*start += offset % RT6_INFO_LEN;
2102 
2103 	arg.len -= offset % RT6_INFO_LEN;
2104 
2105 	if (arg.len > length)
2106 		arg.len = length;
2107 	if (arg.len < 0)
2108 		arg.len = 0;
2109 
2110 	return arg.len;
2111 }
2112 
2113 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2114 {
2115 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2116 		      rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2117 		      rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2118 		      rt6_stats.fib_rt_cache,
2119 		      atomic_read(&ip6_dst_ops.entries),
2120 		      rt6_stats.fib_discarded_routes);
2121 
2122 	return 0;
2123 }
2124 
2125 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2126 {
2127 	return single_open(file, rt6_stats_seq_show, NULL);
2128 }
2129 
2130 static struct file_operations rt6_stats_seq_fops = {
2131 	.owner	 = THIS_MODULE,
2132 	.open	 = rt6_stats_seq_open,
2133 	.read	 = seq_read,
2134 	.llseek	 = seq_lseek,
2135 	.release = single_release,
2136 };
2137 #endif	/* CONFIG_PROC_FS */
2138 
2139 #ifdef CONFIG_SYSCTL
2140 
2141 static int flush_delay;
2142 
2143 static
2144 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2145 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2146 {
2147 	if (write) {
2148 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2149 		fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2150 		return 0;
2151 	} else
2152 		return -EINVAL;
2153 }
2154 
2155 ctl_table ipv6_route_table[] = {
2156         {
2157 		.ctl_name	=	NET_IPV6_ROUTE_FLUSH,
2158 		.procname	=	"flush",
2159          	.data		=	&flush_delay,
2160 		.maxlen		=	sizeof(int),
2161 		.mode		=	0200,
2162          	.proc_handler	=	&ipv6_sysctl_rtcache_flush
2163 	},
2164 	{
2165 		.ctl_name	=	NET_IPV6_ROUTE_GC_THRESH,
2166 		.procname	=	"gc_thresh",
2167          	.data		=	&ip6_dst_ops.gc_thresh,
2168 		.maxlen		=	sizeof(int),
2169 		.mode		=	0644,
2170          	.proc_handler	=	&proc_dointvec,
2171 	},
2172 	{
2173 		.ctl_name	=	NET_IPV6_ROUTE_MAX_SIZE,
2174 		.procname	=	"max_size",
2175          	.data		=	&ip6_rt_max_size,
2176 		.maxlen		=	sizeof(int),
2177 		.mode		=	0644,
2178          	.proc_handler	=	&proc_dointvec,
2179 	},
2180 	{
2181 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2182 		.procname	=	"gc_min_interval",
2183          	.data		=	&ip6_rt_gc_min_interval,
2184 		.maxlen		=	sizeof(int),
2185 		.mode		=	0644,
2186          	.proc_handler	=	&proc_dointvec_jiffies,
2187 		.strategy	=	&sysctl_jiffies,
2188 	},
2189 	{
2190 		.ctl_name	=	NET_IPV6_ROUTE_GC_TIMEOUT,
2191 		.procname	=	"gc_timeout",
2192          	.data		=	&ip6_rt_gc_timeout,
2193 		.maxlen		=	sizeof(int),
2194 		.mode		=	0644,
2195          	.proc_handler	=	&proc_dointvec_jiffies,
2196 		.strategy	=	&sysctl_jiffies,
2197 	},
2198 	{
2199 		.ctl_name	=	NET_IPV6_ROUTE_GC_INTERVAL,
2200 		.procname	=	"gc_interval",
2201          	.data		=	&ip6_rt_gc_interval,
2202 		.maxlen		=	sizeof(int),
2203 		.mode		=	0644,
2204          	.proc_handler	=	&proc_dointvec_jiffies,
2205 		.strategy	=	&sysctl_jiffies,
2206 	},
2207 	{
2208 		.ctl_name	=	NET_IPV6_ROUTE_GC_ELASTICITY,
2209 		.procname	=	"gc_elasticity",
2210          	.data		=	&ip6_rt_gc_elasticity,
2211 		.maxlen		=	sizeof(int),
2212 		.mode		=	0644,
2213          	.proc_handler	=	&proc_dointvec_jiffies,
2214 		.strategy	=	&sysctl_jiffies,
2215 	},
2216 	{
2217 		.ctl_name	=	NET_IPV6_ROUTE_MTU_EXPIRES,
2218 		.procname	=	"mtu_expires",
2219          	.data		=	&ip6_rt_mtu_expires,
2220 		.maxlen		=	sizeof(int),
2221 		.mode		=	0644,
2222          	.proc_handler	=	&proc_dointvec_jiffies,
2223 		.strategy	=	&sysctl_jiffies,
2224 	},
2225 	{
2226 		.ctl_name	=	NET_IPV6_ROUTE_MIN_ADVMSS,
2227 		.procname	=	"min_adv_mss",
2228          	.data		=	&ip6_rt_min_advmss,
2229 		.maxlen		=	sizeof(int),
2230 		.mode		=	0644,
2231          	.proc_handler	=	&proc_dointvec_jiffies,
2232 		.strategy	=	&sysctl_jiffies,
2233 	},
2234 	{
2235 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2236 		.procname	=	"gc_min_interval_ms",
2237          	.data		=	&ip6_rt_gc_min_interval,
2238 		.maxlen		=	sizeof(int),
2239 		.mode		=	0644,
2240          	.proc_handler	=	&proc_dointvec_ms_jiffies,
2241 		.strategy	=	&sysctl_ms_jiffies,
2242 	},
2243 	{ .ctl_name = 0 }
2244 };
2245 
2246 #endif
2247 
2248 void __init ip6_route_init(void)
2249 {
2250 	struct proc_dir_entry *p;
2251 
2252 	ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2253 						     sizeof(struct rt6_info),
2254 						     0, SLAB_HWCACHE_ALIGN,
2255 						     NULL, NULL);
2256 	if (!ip6_dst_ops.kmem_cachep)
2257 		panic("cannot create ip6_dst_cache");
2258 
2259 	fib6_init();
2260 #ifdef 	CONFIG_PROC_FS
2261 	p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2262 	if (p)
2263 		p->owner = THIS_MODULE;
2264 
2265 	proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2266 #endif
2267 #ifdef CONFIG_XFRM
2268 	xfrm6_init();
2269 #endif
2270 }
2271 
2272 void ip6_route_cleanup(void)
2273 {
2274 #ifdef CONFIG_PROC_FS
2275 	proc_net_remove("ipv6_route");
2276 	proc_net_remove("rt6_stats");
2277 #endif
2278 #ifdef CONFIG_XFRM
2279 	xfrm6_fini();
2280 #endif
2281 	rt6_ifdown(NULL);
2282 	fib6_gc_cleanup();
2283 	kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2284 }
2285