xref: /linux/net/ipv6/route.c (revision f3d9478b2ce468c3115b02ecae7e975990697f15)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15 
16 /*	Changes:
17  *
18  *	YOSHIFUJI Hideaki @USAGI
19  *		reworked default router selection.
20  *		- respect outgoing interface
21  *		- select from (probably) reachable routers (i.e.
22  *		routers in REACHABLE, STALE, DELAY or PROBE states).
23  *		- always select the same router if it is (probably)
24  *		reachable.  otherwise, round-robin the list.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/config.h>
29 #include <linux/errno.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/init.h>
39 #include <linux/netlink.h>
40 #include <linux/if_arp.h>
41 
42 #ifdef 	CONFIG_PROC_FS
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #endif
46 
47 #include <net/snmp.h>
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
53 #include <net/tcp.h>
54 #include <linux/rtnetlink.h>
55 #include <net/dst.h>
56 #include <net/xfrm.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
/* Set to 3 to get tracing. */
#define RT6_DEBUG 2

/*
 * Debug helpers.  Both macros call printk() only when RT6_DEBUG >= 3;
 * otherwise RDBG() expands to nothing and RT6_TRACE() to an empty statement.
 */
#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif

/*
 * When non-zero, ip6_route_input()/ip6_route_output() clone routes that
 * already have a nexthop via rt6_alloc_clone(); when zero they use the
 * matched route directly.
 */
#define CLONE_OFFLINK_ROUTE 0

/* Flag bits tested in the "strict" argument of rt6_score_route(). */
#define RT6_SELECT_F_IFACE	0x1	/* reject routes whose device check scores 0 */
#define RT6_SELECT_F_REACHABLE	0x2	/* reject routes whose neighbour check scores 0 */
79 
/*
 * Tunables.  Time values are expressed in jiffies (multiples of HZ).
 * NOTE(review): presumably exported through sysctl further down the file;
 * that code is not visible here - confirm.
 */
/* ip6_dst_gc() reports pressure while the entry count exceeds this. */
static int ip6_rt_max_size = 4096;
/* Minimum spacing between two GC runs in ip6_dst_gc() unless over max_size. */
static int ip6_rt_gc_min_interval = HZ / 2;
/* Half of this becomes the GC expiry once entries drop below gc_thresh. */
static int ip6_rt_gc_timeout = 60*HZ;
/* Not referenced in the visible part of this file; non-static, so used elsewhere. */
int ip6_rt_gc_interval = 30*HZ;
/* Shift used by ip6_dst_gc() to decay its expiry value on every call. */
static int ip6_rt_gc_elasticity = 9;
/* Not referenced in the visible part of this file. */
static int ip6_rt_mtu_expires = 10*60*HZ;
/* Lower bound applied by ipv6_advmss(). */
static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87 
/*
 * Forward declarations.  The dst_ops callbacks and the discard handlers are
 * referenced by the static initialisers of ip6_dst_ops and ip6_null_entry
 * below, before their definitions appear.
 */
static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(void);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

/* Helpers used by rt6_route_rcv(); only built with CONFIG_IPV6_ROUTE_INFO. */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
					   struct in6_addr *gwaddr, int ifindex,
					   unsigned pref);
static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
					   struct in6_addr *gwaddr, int ifindex);
#endif
108 
/*
 * dst operations table for IPv6 routes.  Every rt6_info allocated through
 * ip6_dst_alloc() is created against this table, and ip6_dst_gc() reads its
 * entry counter and gc_thresh.
 */
static struct dst_ops ip6_dst_ops = {
	.family			=	AF_INET6,
	.protocol		=	__constant_htons(ETH_P_IPV6),
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.entry_size		=	sizeof(struct rt6_info),
};
122 
/*
 * Statically allocated "no route" entry.  The lookup helpers in this file
 * return it instead of NULL when nothing matches: it is bound to the
 * loopback device, carries -ENETUNREACH as its dst error and discards
 * packets in both directions.
 */
struct rt6_info ip6_null_entry = {
	.u = {
		.dst = {
			.__refcnt	= ATOMIC_INIT(1),
			.__use		= 1,
			.dev		= &loopback_dev,
			.obsolete	= -1,
			.error		= -ENETUNREACH,
			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
			.input		= ip6_pkt_discard,
			.output		= ip6_pkt_discard_out,
			.ops		= &ip6_dst_ops,
			.path		= (struct dst_entry*)&ip6_null_entry,
		}
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
142 
/*
 * Root node of the IPv6 FIB used by every lookup/insert in this file.
 * Its leaf initially points at ip6_null_entry.
 */
struct fib6_node ip6_routing_table = {
	.leaf		= &ip6_null_entry,
	.fn_flags	= RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
};
147 
/*
 * Protects all the ip6 fib.
 * In this file lookups take it for reading and fib6_add()/fib6_del() are
 * called with it held for writing; it also guards ndisc_dst_gc_list.
 */

DEFINE_RWLOCK(rt6_lock);
151 
152 
153 /* allocate dst with ip6_dst_ops */
154 static __inline__ struct rt6_info *ip6_dst_alloc(void)
155 {
156 	return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
157 }
158 
159 static void ip6_dst_destroy(struct dst_entry *dst)
160 {
161 	struct rt6_info *rt = (struct rt6_info *)dst;
162 	struct inet6_dev *idev = rt->rt6i_idev;
163 
164 	if (idev != NULL) {
165 		rt->rt6i_idev = NULL;
166 		in6_dev_put(idev);
167 	}
168 }
169 
170 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
171 			   int how)
172 {
173 	struct rt6_info *rt = (struct rt6_info *)dst;
174 	struct inet6_dev *idev = rt->rt6i_idev;
175 
176 	if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
177 		struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
178 		if (loopback_idev != NULL) {
179 			rt->rt6i_idev = loopback_idev;
180 			in6_dev_put(idev);
181 		}
182 	}
183 }
184 
185 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
186 {
187 	return (rt->rt6i_flags & RTF_EXPIRES &&
188 		time_after(jiffies, rt->rt6i_expires));
189 }
190 
191 /*
192  *	Route lookup. Any rt6_lock is implied.
193  */
194 
195 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
196 						    int oif,
197 						    int strict)
198 {
199 	struct rt6_info *local = NULL;
200 	struct rt6_info *sprt;
201 
202 	if (oif) {
203 		for (sprt = rt; sprt; sprt = sprt->u.next) {
204 			struct net_device *dev = sprt->rt6i_dev;
205 			if (dev->ifindex == oif)
206 				return sprt;
207 			if (dev->flags & IFF_LOOPBACK) {
208 				if (sprt->rt6i_idev == NULL ||
209 				    sprt->rt6i_idev->dev->ifindex != oif) {
210 					if (strict && oif)
211 						continue;
212 					if (local && (!oif ||
213 						      local->rt6i_idev->dev->ifindex == oif))
214 						continue;
215 				}
216 				local = sprt;
217 			}
218 		}
219 
220 		if (local)
221 			return local;
222 
223 		if (strict)
224 			return &ip6_null_entry;
225 	}
226 	return rt;
227 }
228 
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the route's nexthop neighbour is not in a
 * NUD_VALID state and the last update is older than the interface's
 * rtr_probe_interval, send a Neighbour Solicitation to the neighbour's
 * solicited-node multicast address.
 *
 * 'rt' may be NULL (nothing is done).  Nothing is returned.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	/* Cheap unlocked pre-check; re-tested under the lock below. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;
	/*
	 * neigh->updated is modified below, so the write side of the lock is
	 * required: it makes the "interval elapsed?" test and the timestamp
	 * update atomic, so concurrent callers cannot both send a probe.
	 */
	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		write_unlock_bh(&neigh->lock);

		/* Send the solicitation with the neighbour lock dropped. */
		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	} else
		write_unlock_bh(&neigh->lock);
}
#else
/* Probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
	return;
}
#endif
264 
265 /*
266  * Default Router Selection (RFC 2461 6.3.6)
267  */
268 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
269 {
270 	struct net_device *dev = rt->rt6i_dev;
271 	if (!oif || dev->ifindex == oif)
272 		return 2;
273 	if ((dev->flags & IFF_LOOPBACK) &&
274 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
275 		return 1;
276 	return 0;
277 }
278 
279 static int inline rt6_check_neigh(struct rt6_info *rt)
280 {
281 	struct neighbour *neigh = rt->rt6i_nexthop;
282 	int m = 0;
283 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
284 	    !(rt->rt6i_flags & RTF_GATEWAY))
285 		m = 1;
286 	else if (neigh) {
287 		read_lock_bh(&neigh->lock);
288 		if (neigh->nud_state & NUD_VALID)
289 			m = 2;
290 		read_unlock_bh(&neigh->lock);
291 	}
292 	return m;
293 }
294 
295 static int rt6_score_route(struct rt6_info *rt, int oif,
296 			   int strict)
297 {
298 	int m, n;
299 
300 	m = rt6_check_dev(rt, oif);
301 	if (!m && (strict & RT6_SELECT_F_IFACE))
302 		return -1;
303 #ifdef CONFIG_IPV6_ROUTER_PREF
304 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
305 #endif
306 	n = rt6_check_neigh(rt);
307 	if (n > 1)
308 		m |= 16;
309 	else if (!n && strict & RT6_SELECT_F_REACHABLE)
310 		return -1;
311 	return m;
312 }
313 
/*
 * Select the best route among the entries at the front of the list *head
 * that share the metric of the first entry.
 *
 * @head:   address of the list head (a fib6 node's leaf pointer); may be
 *          rewritten when the list is rotated (see below).
 * @oif:    requested outgoing/incoming interface index, 0 for "any".
 * @strict: RT6_SELECT_F_* flags passed on to rt6_score_route().
 *
 * Expired routes are skipped.  The highest-scoring candidate wins; every
 * candidate that is scored but does not end up as the winner is handed to
 * rt6_probe().  Returns the winner, or &ip6_null_entry when nothing
 * qualified.
 */
static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
				   int strict)
{
	struct rt6_info *match = NULL, *last = NULL;
	struct rt6_info *rt, *rt0 = *head;
	u32 metric;
	int mpri = -1;

	/*
	 * NOTE(review): *head has already been dereferenced above, so the
	 * NULL test on head in the trace arguments cannot protect anything.
	 */
	RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
		  __FUNCTION__, head, head ? *head : NULL, oif);

	/*
	 * Walk only the run of entries with the head's metric.  The
	 * (!last || rt != rt0) term stops the walk if it arrives back at
	 * rt0 after at least one non-expired entry has been seen.
	 */
	for (rt = rt0, metric = rt0->rt6i_metric;
	     rt && rt->rt6i_metric == metric && (!last || rt != rt0);
	     rt = rt->u.next) {
		int m;

		if (rt6_check_expired(rt))
			continue;

		/* Remember the last live entry of the run for the rotation. */
		last = rt;

		m = rt6_score_route(rt, oif, strict);
		if (m < 0)
			continue;

		if (m > mpri) {
			/* Previous best is displaced: probe it (NULL is OK). */
			rt6_probe(match);
			match = rt;
			mpri = m;
		} else {
			rt6_probe(rt);
		}
	}

	if (!match &&
	    (strict & RT6_SELECT_F_REACHABLE) &&
	    last && last != rt0) {
		/* no entries matched; do round-robin */
		/*
		 * Move rt0 from the front of the run to just after 'last' so
		 * the next lookup starts with a different router.
		 * NOTE(review): the callers visible in this file hold rt6_lock
		 * only for reading here; the list update is serialised solely
		 * by this function-local spinlock - confirm that is sufficient
		 * against concurrent list walkers.
		 */
		static spinlock_t lock = SPIN_LOCK_UNLOCKED;
		spin_lock(&lock);
		*head = rt0->u.next;
		rt0->u.next = last->u.next;
		last->u.next = rt0;
		spin_unlock(&lock);
	}

	RT6_TRACE("%s() => %p, score=%d\n",
		  __FUNCTION__, match, mpri);

	return (match ? match : &ip6_null_entry);
}
365 
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process one Route Information option.
 *
 * @dev:    device the option was received on (its ifindex keys the route).
 * @opt:    start of the option, interpreted as struct route_info.
 * @len:    number of bytes available at @opt.
 * @gwaddr: gateway address to associate with the route.
 *
 * Depending on the advertised lifetime the matching route is deleted
 * (lifetime 0), created, or has its preference and expiry refreshed.
 *
 * Returns 0 on success and -EINVAL when the option is malformed or carries
 * the reserved preference value; in the latter case the option is ignored
 * without touching any route, as RFC 4191 section 2.3 requires.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	/* Reserved preference value: the option MUST be ignored. */
	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	/* The lifetime arrives in network byte order. */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* Lifetime 0 withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt, NULL, NULL, NULL);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		/* Drop the reference taken by the get/add helper. */
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
443 
444 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
445 			    int oif, int strict)
446 {
447 	struct fib6_node *fn;
448 	struct rt6_info *rt;
449 
450 	read_lock_bh(&rt6_lock);
451 	fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
452 	rt = rt6_device_match(fn->leaf, oif, strict);
453 	dst_hold(&rt->u.dst);
454 	rt->u.dst.__use++;
455 	read_unlock_bh(&rt6_lock);
456 
457 	rt->u.dst.lastuse = jiffies;
458 	if (rt->u.dst.error == 0)
459 		return rt;
460 	dst_release(&rt->u.dst);
461 	return NULL;
462 }
463 
464 /* ip6_ins_rt is called with FREE rt6_lock.
465    It takes new route entry, the addition fails by any reason the
466    route is freed. In any case, if caller does not hold it, it may
467    be destroyed.
468  */
469 
470 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
471 		void *_rtattr, struct netlink_skb_parms *req)
472 {
473 	int err;
474 
475 	write_lock_bh(&rt6_lock);
476 	err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
477 	write_unlock_bh(&rt6_lock);
478 
479 	return err;
480 }
481 
482 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
483 				      struct in6_addr *saddr)
484 {
485 	struct rt6_info *rt;
486 
487 	/*
488 	 *	Clone the route.
489 	 */
490 
491 	rt = ip6_rt_copy(ort);
492 
493 	if (rt) {
494 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
495 			if (rt->rt6i_dst.plen != 128 &&
496 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
497 				rt->rt6i_flags |= RTF_ANYCAST;
498 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
499 		}
500 
501 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
502 		rt->rt6i_dst.plen = 128;
503 		rt->rt6i_flags |= RTF_CACHE;
504 		rt->u.dst.flags |= DST_HOST;
505 
506 #ifdef CONFIG_IPV6_SUBTREES
507 		if (rt->rt6i_src.plen && saddr) {
508 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
509 			rt->rt6i_src.plen = 128;
510 		}
511 #endif
512 
513 		rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
514 
515 	}
516 
517 	return rt;
518 }
519 
520 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
521 {
522 	struct rt6_info *rt = ip6_rt_copy(ort);
523 	if (rt) {
524 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
525 		rt->rt6i_dst.plen = 128;
526 		rt->rt6i_flags |= RTF_CACHE;
527 		if (rt->rt6i_flags & RTF_REJECT)
528 			rt->u.dst.error = ort->u.dst.error;
529 		rt->u.dst.flags |= DST_HOST;
530 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
531 	}
532 	return rt;
533 }
534 
/*
 * BACKTRACK() - used by ip6_route_input()/ip6_route_output() right after
 * route selection.  When the selection produced ip6_null_entry, climb the
 * fib tree through fn->parent: jump to 'out' on reaching a root node, or to
 * 'restart' at the first ancestor that carries route info.
 *
 * Relies on the caller providing the locals 'rt' and 'fn' and the labels
 * 'out' and 'restart'.  Not wrapped in do { } while (0).
 */
#define BACKTRACK() \
if (rt == &ip6_null_entry) { \
       while ((fn = fn->parent) != NULL) { \
		if (fn->fn_flags & RTN_ROOT) { \
			goto out; \
		} \
		if (fn->fn_flags & RTN_RTINFO) \
			goto restart; \
	} \
}
545 
546 
/*
 * Route an incoming packet: select a route for the packet's destination
 * (source address and receiving interface taken into account) and attach
 * it, with a reference held, to skb->dst.  When nothing matches,
 * ip6_null_entry is attached.
 *
 * The lookup is first done requiring a reachable router
 * (RT6_SELECT_F_REACHABLE); if that ends at 'out' it is repeated once
 * without that requirement.  Routes that have no nexthop yet are
 * copied into a per-destination cache entry via rt6_alloc_cow() and
 * inserted into the table; if the insertion fails, the whole lookup is
 * retried up to 'attempts' times.
 */
void ip6_route_input(struct sk_buff *skb)
{
	struct fib6_node *fn;
	struct rt6_info *rt, *nrt;
	int strict;
	int attempts = 3;
	int err;
	int reachable = RT6_SELECT_F_REACHABLE;

	/* Multicast and link-local destinations must match the interface. */
	strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;

relookup:
	read_lock_bh(&rt6_lock);

restart_2:
	fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
			 &skb->nh.ipv6h->saddr);

restart:
	rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
	/* May jump to 'out' or back to 'restart' with fn moved to an ancestor. */
	BACKTRACK();
	if (rt == &ip6_null_entry ||
	    rt->rt6i_flags & RTF_CACHE)
		goto out;

	/* Pin the route before dropping the table lock. */
	dst_hold(&rt->u.dst);
	read_unlock_bh(&rt6_lock);

	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
		nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
	else {
#if CLONE_OFFLINK_ROUTE
		nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
#else
		/* Use the matched route as is; its reference is already held. */
		goto out2;
#endif
	}

	/* Switch from the template route to the copy (or the null entry). */
	dst_release(&rt->u.dst);
	rt = nrt ? : &ip6_null_entry;

	dst_hold(&rt->u.dst);
	if (nrt) {
		err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
		if (!err)
			goto out2;
	}

	if (--attempts <= 0)
		goto out2;

	/*
	 * Race condition! In the gap, when rt6_lock was
	 * released someone could insert this route.  Relookup.
	 */
	dst_release(&rt->u.dst);
	goto relookup;

out:
	/* Reached with rt6_lock still held for reading. */
	if (reachable) {
		/* Second pass: drop the reachability requirement. */
		reachable = 0;
		goto restart_2;
	}
	dst_hold(&rt->u.dst);
	read_unlock_bh(&rt6_lock);
out2:
	rt->u.dst.lastuse = jiffies;
	rt->u.dst.__use++;
	skb->dst = (struct dst_entry *) rt;
	return;
}
618 
/*
 * Output route lookup for flow 'fl'.
 *
 * @sk: not used by the visible body.
 * @fl: flow description; fl6_dst, fl6_src and oif are consulted.
 *
 * Returns a dst entry with a reference held; this is never NULL -
 * ip6_null_entry's dst is returned when no route matches.
 *
 * The control flow mirrors ip6_route_input(): a first pass requires a
 * reachable router, a second pass does not; routes without a nexthop are
 * copied through rt6_alloc_cow() and inserted, retrying the lookup up to
 * 'attempts' times when the insertion fails.
 */
struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
{
	struct fib6_node *fn;
	struct rt6_info *rt, *nrt;
	int strict;
	int attempts = 3;
	int err;
	int reachable = RT6_SELECT_F_REACHABLE;

	/* Multicast and link-local destinations must match the interface. */
	strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;

relookup:
	read_lock_bh(&rt6_lock);

restart_2:
	fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);

restart:
	rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
	/* May jump to 'out' or back to 'restart' with fn moved to an ancestor. */
	BACKTRACK();
	if (rt == &ip6_null_entry ||
	    rt->rt6i_flags & RTF_CACHE)
		goto out;

	/* Pin the route before dropping the table lock. */
	dst_hold(&rt->u.dst);
	read_unlock_bh(&rt6_lock);

	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
	else {
#if CLONE_OFFLINK_ROUTE
		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
#else
		/* Use the matched route as is; its reference is already held. */
		goto out2;
#endif
	}

	/* Switch from the template route to the copy (or the null entry). */
	dst_release(&rt->u.dst);
	rt = nrt ? : &ip6_null_entry;

	dst_hold(&rt->u.dst);
	if (nrt) {
		err = ip6_ins_rt(nrt, NULL, NULL, NULL);
		if (!err)
			goto out2;
	}

	if (--attempts <= 0)
		goto out2;

	/*
	 * Race condition! In the gap, when rt6_lock was
	 * released someone could insert this route.  Relookup.
	 */
	dst_release(&rt->u.dst);
	goto relookup;

out:
	/* Reached with rt6_lock still held for reading. */
	if (reachable) {
		/* Second pass: drop the reachability requirement. */
		reachable = 0;
		goto restart_2;
	}
	dst_hold(&rt->u.dst);
	read_unlock_bh(&rt6_lock);
out2:
	rt->u.dst.lastuse = jiffies;
	rt->u.dst.__use++;
	return &rt->u.dst;
}
688 
689 
690 /*
691  *	Destination cache support functions
692  */
693 
694 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
695 {
696 	struct rt6_info *rt;
697 
698 	rt = (struct rt6_info *) dst;
699 
700 	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
701 		return dst;
702 
703 	return NULL;
704 }
705 
706 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
707 {
708 	struct rt6_info *rt = (struct rt6_info *) dst;
709 
710 	if (rt) {
711 		if (rt->rt6i_flags & RTF_CACHE)
712 			ip6_del_rt(rt, NULL, NULL, NULL);
713 		else
714 			dst_release(dst);
715 	}
716 	return NULL;
717 }
718 
719 static void ip6_link_failure(struct sk_buff *skb)
720 {
721 	struct rt6_info *rt;
722 
723 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
724 
725 	rt = (struct rt6_info *) skb->dst;
726 	if (rt) {
727 		if (rt->rt6i_flags&RTF_CACHE) {
728 			dst_set_expires(&rt->u.dst, 0);
729 			rt->rt6i_flags |= RTF_EXPIRES;
730 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
731 			rt->rt6i_node->fn_sernum = -1;
732 	}
733 }
734 
735 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
736 {
737 	struct rt6_info *rt6 = (struct rt6_info*)dst;
738 
739 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
740 		rt6->rt6i_flags |= RTF_MODIFIED;
741 		if (mtu < IPV6_MIN_MTU) {
742 			mtu = IPV6_MIN_MTU;
743 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
744 		}
745 		dst->metrics[RTAX_MTU-1] = mtu;
746 	}
747 }
748 
/*
 * Singly linked list (through dst->next) of the dst entries created by
 * ndisc_dst_alloc(); ndisc_dst_gc() frees those whose refcount is zero.
 * Protected by rt6_lock.
 */
static struct dst_entry *ndisc_dst_gc_list;
/* Defined further down; needed by ndisc_dst_alloc(). */
static int ipv6_get_mtu(struct net_device *dev);
752 
753 static inline unsigned int ipv6_advmss(unsigned int mtu)
754 {
755 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
756 
757 	if (mtu < ip6_rt_min_advmss)
758 		mtu = ip6_rt_min_advmss;
759 
760 	/*
761 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
762 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
763 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
764 	 * rely only on pmtu discovery"
765 	 */
766 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
767 		mtu = IPV6_MAXPLEN;
768 	return mtu;
769 }
770 
771 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
772 				  struct neighbour *neigh,
773 				  struct in6_addr *addr,
774 				  int (*output)(struct sk_buff *))
775 {
776 	struct rt6_info *rt;
777 	struct inet6_dev *idev = in6_dev_get(dev);
778 
779 	if (unlikely(idev == NULL))
780 		return NULL;
781 
782 	rt = ip6_dst_alloc();
783 	if (unlikely(rt == NULL)) {
784 		in6_dev_put(idev);
785 		goto out;
786 	}
787 
788 	dev_hold(dev);
789 	if (neigh)
790 		neigh_hold(neigh);
791 	else
792 		neigh = ndisc_get_neigh(dev, addr);
793 
794 	rt->rt6i_dev	  = dev;
795 	rt->rt6i_idev     = idev;
796 	rt->rt6i_nexthop  = neigh;
797 	atomic_set(&rt->u.dst.__refcnt, 1);
798 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
799 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
800 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
801 	rt->u.dst.output  = output;
802 
803 #if 0	/* there's no chance to use these for ndisc */
804 	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
805 				? DST_HOST
806 				: 0;
807 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
808 	rt->rt6i_dst.plen = 128;
809 #endif
810 
811 	write_lock_bh(&rt6_lock);
812 	rt->u.dst.next = ndisc_dst_gc_list;
813 	ndisc_dst_gc_list = &rt->u.dst;
814 	write_unlock_bh(&rt6_lock);
815 
816 	fib6_force_start_gc();
817 
818 out:
819 	return (struct dst_entry *)rt;
820 }
821 
822 int ndisc_dst_gc(int *more)
823 {
824 	struct dst_entry *dst, *next, **pprev;
825 	int freed;
826 
827 	next = NULL;
828 	pprev = &ndisc_dst_gc_list;
829 	freed = 0;
830 	while ((dst = *pprev) != NULL) {
831 		if (!atomic_read(&dst->__refcnt)) {
832 			*pprev = dst->next;
833 			dst_free(dst);
834 			freed++;
835 		} else {
836 			pprev = &dst->next;
837 			(*more)++;
838 		}
839 	}
840 
841 	return freed;
842 }
843 
844 static int ip6_dst_gc(void)
845 {
846 	static unsigned expire = 30*HZ;
847 	static unsigned long last_gc;
848 	unsigned long now = jiffies;
849 
850 	if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
851 	    atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
852 		goto out;
853 
854 	expire++;
855 	fib6_run_gc(expire);
856 	last_gc = now;
857 	if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
858 		expire = ip6_rt_gc_timeout>>1;
859 
860 out:
861 	expire -= expire>>ip6_rt_gc_elasticity;
862 	return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
863 }
864 
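/* Clean the host part of a prefix. This is not necessary in the radix tree,
   but it results in cleaner routing tables.

   Remove it only when everything else is known to work!

   NOTE: this remark does not describe ipv6_get_mtu() below. In this file
   prefixes are copied with ipv6_addr_prefix() (see ip6_route_add()),
   which is presumably where the host part gets cleared.
 */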
870 
871 static int ipv6_get_mtu(struct net_device *dev)
872 {
873 	int mtu = IPV6_MIN_MTU;
874 	struct inet6_dev *idev;
875 
876 	idev = in6_dev_get(dev);
877 	if (idev) {
878 		mtu = idev->cnf.mtu6;
879 		in6_dev_put(idev);
880 	}
881 	return mtu;
882 }
883 
884 int ipv6_get_hoplimit(struct net_device *dev)
885 {
886 	int hoplimit = ipv6_devconf.hop_limit;
887 	struct inet6_dev *idev;
888 
889 	idev = in6_dev_get(dev);
890 	if (idev) {
891 		hoplimit = idev->cnf.hop_limit;
892 		in6_dev_put(idev);
893 	}
894 	return hoplimit;
895 }
896 
897 /*
898  *
899  */
900 
901 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
902 		void *_rtattr, struct netlink_skb_parms *req)
903 {
904 	int err;
905 	struct rtmsg *r;
906 	struct rtattr **rta;
907 	struct rt6_info *rt = NULL;
908 	struct net_device *dev = NULL;
909 	struct inet6_dev *idev = NULL;
910 	int addr_type;
911 
912 	rta = (struct rtattr **) _rtattr;
913 
914 	if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
915 		return -EINVAL;
916 #ifndef CONFIG_IPV6_SUBTREES
917 	if (rtmsg->rtmsg_src_len)
918 		return -EINVAL;
919 #endif
920 	if (rtmsg->rtmsg_ifindex) {
921 		err = -ENODEV;
922 		dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
923 		if (!dev)
924 			goto out;
925 		idev = in6_dev_get(dev);
926 		if (!idev)
927 			goto out;
928 	}
929 
930 	if (rtmsg->rtmsg_metric == 0)
931 		rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
932 
933 	rt = ip6_dst_alloc();
934 
935 	if (rt == NULL) {
936 		err = -ENOMEM;
937 		goto out;
938 	}
939 
940 	rt->u.dst.obsolete = -1;
941 	rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
942 	if (nlh && (r = NLMSG_DATA(nlh))) {
943 		rt->rt6i_protocol = r->rtm_protocol;
944 	} else {
945 		rt->rt6i_protocol = RTPROT_BOOT;
946 	}
947 
948 	addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
949 
950 	if (addr_type & IPV6_ADDR_MULTICAST)
951 		rt->u.dst.input = ip6_mc_input;
952 	else
953 		rt->u.dst.input = ip6_forward;
954 
955 	rt->u.dst.output = ip6_output;
956 
957 	ipv6_addr_prefix(&rt->rt6i_dst.addr,
958 			 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
959 	rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
960 	if (rt->rt6i_dst.plen == 128)
961 	       rt->u.dst.flags = DST_HOST;
962 
963 #ifdef CONFIG_IPV6_SUBTREES
964 	ipv6_addr_prefix(&rt->rt6i_src.addr,
965 			 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
966 	rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
967 #endif
968 
969 	rt->rt6i_metric = rtmsg->rtmsg_metric;
970 
971 	/* We cannot add true routes via loopback here,
972 	   they would result in kernel looping; promote them to reject routes
973 	 */
974 	if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
975 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
976 		/* hold loopback dev/idev if we haven't done so. */
977 		if (dev != &loopback_dev) {
978 			if (dev) {
979 				dev_put(dev);
980 				in6_dev_put(idev);
981 			}
982 			dev = &loopback_dev;
983 			dev_hold(dev);
984 			idev = in6_dev_get(dev);
985 			if (!idev) {
986 				err = -ENODEV;
987 				goto out;
988 			}
989 		}
990 		rt->u.dst.output = ip6_pkt_discard_out;
991 		rt->u.dst.input = ip6_pkt_discard;
992 		rt->u.dst.error = -ENETUNREACH;
993 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
994 		goto install_route;
995 	}
996 
997 	if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
998 		struct in6_addr *gw_addr;
999 		int gwa_type;
1000 
1001 		gw_addr = &rtmsg->rtmsg_gateway;
1002 		ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
1003 		gwa_type = ipv6_addr_type(gw_addr);
1004 
1005 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1006 			struct rt6_info *grt;
1007 
1008 			/* IPv6 strictly inhibits using not link-local
1009 			   addresses as nexthop address.
1010 			   Otherwise, router will not able to send redirects.
1011 			   It is very good, but in some (rare!) circumstances
1012 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1013 			   some exceptions. --ANK
1014 			 */
1015 			err = -EINVAL;
1016 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1017 				goto out;
1018 
1019 			grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1020 
1021 			err = -EHOSTUNREACH;
1022 			if (grt == NULL)
1023 				goto out;
1024 			if (dev) {
1025 				if (dev != grt->rt6i_dev) {
1026 					dst_release(&grt->u.dst);
1027 					goto out;
1028 				}
1029 			} else {
1030 				dev = grt->rt6i_dev;
1031 				idev = grt->rt6i_idev;
1032 				dev_hold(dev);
1033 				in6_dev_hold(grt->rt6i_idev);
1034 			}
1035 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1036 				err = 0;
1037 			dst_release(&grt->u.dst);
1038 
1039 			if (err)
1040 				goto out;
1041 		}
1042 		err = -EINVAL;
1043 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1044 			goto out;
1045 	}
1046 
1047 	err = -ENODEV;
1048 	if (dev == NULL)
1049 		goto out;
1050 
1051 	if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1052 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1053 		if (IS_ERR(rt->rt6i_nexthop)) {
1054 			err = PTR_ERR(rt->rt6i_nexthop);
1055 			rt->rt6i_nexthop = NULL;
1056 			goto out;
1057 		}
1058 	}
1059 
1060 	rt->rt6i_flags = rtmsg->rtmsg_flags;
1061 
1062 install_route:
1063 	if (rta && rta[RTA_METRICS-1]) {
1064 		int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1065 		struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1066 
1067 		while (RTA_OK(attr, attrlen)) {
1068 			unsigned flavor = attr->rta_type;
1069 			if (flavor) {
1070 				if (flavor > RTAX_MAX) {
1071 					err = -EINVAL;
1072 					goto out;
1073 				}
1074 				rt->u.dst.metrics[flavor-1] =
1075 					*(u32 *)RTA_DATA(attr);
1076 			}
1077 			attr = RTA_NEXT(attr, attrlen);
1078 		}
1079 	}
1080 
1081 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1082 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1083 	if (!rt->u.dst.metrics[RTAX_MTU-1])
1084 		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1085 	if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1086 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1087 	rt->u.dst.dev = dev;
1088 	rt->rt6i_idev = idev;
1089 	return ip6_ins_rt(rt, nlh, _rtattr, req);
1090 
1091 out:
1092 	if (dev)
1093 		dev_put(dev);
1094 	if (idev)
1095 		in6_dev_put(idev);
1096 	if (rt)
1097 		dst_free((struct dst_entry *) rt);
1098 	return err;
1099 }
1100 
1101 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1102 {
1103 	int err;
1104 
1105 	write_lock_bh(&rt6_lock);
1106 
1107 	err = fib6_del(rt, nlh, _rtattr, req);
1108 	dst_release(&rt->u.dst);
1109 
1110 	write_unlock_bh(&rt6_lock);
1111 
1112 	return err;
1113 }
1114 
1115 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1116 {
1117 	struct fib6_node *fn;
1118 	struct rt6_info *rt;
1119 	int err = -ESRCH;
1120 
1121 	read_lock_bh(&rt6_lock);
1122 
1123 	fn = fib6_locate(&ip6_routing_table,
1124 			 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1125 			 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1126 
1127 	if (fn) {
1128 		for (rt = fn->leaf; rt; rt = rt->u.next) {
1129 			if (rtmsg->rtmsg_ifindex &&
1130 			    (rt->rt6i_dev == NULL ||
1131 			     rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1132 				continue;
1133 			if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1134 			    !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1135 				continue;
1136 			if (rtmsg->rtmsg_metric &&
1137 			    rtmsg->rtmsg_metric != rt->rt6i_metric)
1138 				continue;
1139 			dst_hold(&rt->u.dst);
1140 			read_unlock_bh(&rt6_lock);
1141 
1142 			return ip6_del_rt(rt, nlh, _rtattr, req);
1143 		}
1144 	}
1145 	read_unlock_bh(&rt6_lock);
1146 
1147 	return err;
1148 }
1149 
1150 /*
1151  *	Handle redirects
1152  */
1153 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1154 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1155 {
1156 	struct rt6_info *rt, *nrt = NULL;
1157 	int strict;
1158 	struct fib6_node *fn;
1159 
1160 	/*
1161 	 * Get the "current" route for this destination and
1162 	 * check if the redirect has come from approriate router.
1163 	 *
1164 	 * RFC 2461 specifies that redirects should only be
1165 	 * accepted if they come from the nexthop to the target.
1166 	 * Due to the way the routes are chosen, this notion
1167 	 * is a bit fuzzy and one might need to check all possible
1168 	 * routes.
1169 	 */
1170 	strict = ipv6_addr_type(dest) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL);
1171 
1172 	read_lock_bh(&rt6_lock);
1173 	fn = fib6_lookup(&ip6_routing_table, dest, NULL);
1174 restart:
1175 	for (rt = fn->leaf; rt; rt = rt->u.next) {
1176 		/*
1177 		 * Current route is on-link; redirect is always invalid.
1178 		 *
1179 		 * Seems, previous statement is not true. It could
1180 		 * be node, which looks for us as on-link (f.e. proxy ndisc)
1181 		 * But then router serving it might decide, that we should
1182 		 * know truth 8)8) --ANK (980726).
1183 		 */
1184 		if (rt6_check_expired(rt))
1185 			continue;
1186 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1187 			continue;
1188 		if (neigh->dev != rt->rt6i_dev)
1189 			continue;
1190 		if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1191 			continue;
1192 		break;
1193 	}
1194 	if (rt)
1195 		dst_hold(&rt->u.dst);
1196 	else if (strict) {
1197 		while ((fn = fn->parent) != NULL) {
1198 			if (fn->fn_flags & RTN_ROOT)
1199 				break;
1200 			if (fn->fn_flags & RTN_RTINFO)
1201 				goto restart;
1202 		}
1203 	}
1204 	read_unlock_bh(&rt6_lock);
1205 
1206 	if (!rt) {
1207 		if (net_ratelimit())
1208 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1209 			       "for redirect target\n");
1210 		return;
1211 	}
1212 
1213 	/*
1214 	 *	We have finally decided to accept it.
1215 	 */
1216 
1217 	neigh_update(neigh, lladdr, NUD_STALE,
1218 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1219 		     NEIGH_UPDATE_F_OVERRIDE|
1220 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1221 				     NEIGH_UPDATE_F_ISROUTER))
1222 		     );
1223 
1224 	/*
1225 	 * Redirect received -> path was valid.
1226 	 * Look, redirects are sent only in response to data packets,
1227 	 * so that this nexthop apparently is reachable. --ANK
1228 	 */
1229 	dst_confirm(&rt->u.dst);
1230 
1231 	/* Duplicate redirect: silently ignore. */
1232 	if (neigh == rt->u.dst.neighbour)
1233 		goto out;
1234 
1235 	nrt = ip6_rt_copy(rt);
1236 	if (nrt == NULL)
1237 		goto out;
1238 
1239 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1240 	if (on_link)
1241 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1242 
1243 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1244 	nrt->rt6i_dst.plen = 128;
1245 	nrt->u.dst.flags |= DST_HOST;
1246 
1247 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1248 	nrt->rt6i_nexthop = neigh_clone(neigh);
1249 	/* Reset pmtu, it may be better */
1250 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1251 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1252 
1253 	if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1254 		goto out;
1255 
1256 	if (rt->rt6i_flags&RTF_CACHE) {
1257 		ip6_del_rt(rt, NULL, NULL, NULL);
1258 		return;
1259 	}
1260 
1261 out:
1262         dst_release(&rt->u.dst);
1263 	return;
1264 }
1265 
1266 /*
1267  *	Handle ICMP "packet too big" messages
1268  *	i.e. Path MTU discovery
1269  */
1270 
1271 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1272 			struct net_device *dev, u32 pmtu)
1273 {
1274 	struct rt6_info *rt, *nrt;
1275 	int allfrag = 0;
1276 
1277 	rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1278 	if (rt == NULL)
1279 		return;
1280 
1281 	if (pmtu >= dst_mtu(&rt->u.dst))
1282 		goto out;
1283 
1284 	if (pmtu < IPV6_MIN_MTU) {
1285 		/*
1286 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1287 		 * MTU (1280) and a fragment header should always be included
1288 		 * after a node receiving Too Big message reporting PMTU is
1289 		 * less than the IPv6 Minimum Link MTU.
1290 		 */
1291 		pmtu = IPV6_MIN_MTU;
1292 		allfrag = 1;
1293 	}
1294 
1295 	/* New mtu received -> path was valid.
1296 	   They are sent only in response to data packets,
1297 	   so that this nexthop apparently is reachable. --ANK
1298 	 */
1299 	dst_confirm(&rt->u.dst);
1300 
1301 	/* Host route. If it is static, it would be better
1302 	   not to override it, but add new one, so that
1303 	   when cache entry will expire old pmtu
1304 	   would return automatically.
1305 	 */
1306 	if (rt->rt6i_flags & RTF_CACHE) {
1307 		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1308 		if (allfrag)
1309 			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1310 		dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1311 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1312 		goto out;
1313 	}
1314 
1315 	/* Network route.
1316 	   Two cases are possible:
1317 	   1. It is connected route. Action: COW
1318 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1319 	 */
1320 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1321 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1322 	else
1323 		nrt = rt6_alloc_clone(rt, daddr);
1324 
1325 	if (nrt) {
1326 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1327 		if (allfrag)
1328 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1329 
1330 		/* According to RFC 1981, detecting PMTU increase shouldn't be
1331 		 * happened within 5 mins, the recommended timer is 10 mins.
1332 		 * Here this route expiration time is set to ip6_rt_mtu_expires
1333 		 * which is 10 mins. After 10 mins the decreased pmtu is expired
1334 		 * and detecting PMTU increase will be automatically happened.
1335 		 */
1336 		dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1337 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1338 
1339 		ip6_ins_rt(nrt, NULL, NULL, NULL);
1340 	}
1341 out:
1342 	dst_release(&rt->u.dst);
1343 }
1344 
1345 /*
1346  *	Misc support functions
1347  */
1348 
1349 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1350 {
1351 	struct rt6_info *rt = ip6_dst_alloc();
1352 
1353 	if (rt) {
1354 		rt->u.dst.input = ort->u.dst.input;
1355 		rt->u.dst.output = ort->u.dst.output;
1356 
1357 		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1358 		rt->u.dst.dev = ort->u.dst.dev;
1359 		if (rt->u.dst.dev)
1360 			dev_hold(rt->u.dst.dev);
1361 		rt->rt6i_idev = ort->rt6i_idev;
1362 		if (rt->rt6i_idev)
1363 			in6_dev_hold(rt->rt6i_idev);
1364 		rt->u.dst.lastuse = jiffies;
1365 		rt->rt6i_expires = 0;
1366 
1367 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1368 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1369 		rt->rt6i_metric = 0;
1370 
1371 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1372 #ifdef CONFIG_IPV6_SUBTREES
1373 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1374 #endif
1375 	}
1376 	return rt;
1377 }
1378 
1379 #ifdef CONFIG_IPV6_ROUTE_INFO
1380 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1381 					   struct in6_addr *gwaddr, int ifindex)
1382 {
1383 	struct fib6_node *fn;
1384 	struct rt6_info *rt = NULL;
1385 
1386 	write_lock_bh(&rt6_lock);
1387 	fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1388 	if (!fn)
1389 		goto out;
1390 
1391 	for (rt = fn->leaf; rt; rt = rt->u.next) {
1392 		if (rt->rt6i_dev->ifindex != ifindex)
1393 			continue;
1394 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1395 			continue;
1396 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1397 			continue;
1398 		dst_hold(&rt->u.dst);
1399 		break;
1400 	}
1401 out:
1402 	write_unlock_bh(&rt6_lock);
1403 	return rt;
1404 }
1405 
1406 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1407 					   struct in6_addr *gwaddr, int ifindex,
1408 					   unsigned pref)
1409 {
1410 	struct in6_rtmsg rtmsg;
1411 
1412 	memset(&rtmsg, 0, sizeof(rtmsg));
1413 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1414 	ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1415 	rtmsg.rtmsg_dst_len = prefixlen;
1416 	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1417 	rtmsg.rtmsg_metric = 1024;
1418 	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1419 	/* We should treat it as a default route if prefix length is 0. */
1420 	if (!prefixlen)
1421 		rtmsg.rtmsg_flags |= RTF_DEFAULT;
1422 	rtmsg.rtmsg_ifindex = ifindex;
1423 
1424 	ip6_route_add(&rtmsg, NULL, NULL, NULL);
1425 
1426 	return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1427 }
1428 #endif
1429 
1430 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1431 {
1432 	struct rt6_info *rt;
1433 	struct fib6_node *fn;
1434 
1435 	fn = &ip6_routing_table;
1436 
1437 	write_lock_bh(&rt6_lock);
1438 	for (rt = fn->leaf; rt; rt=rt->u.next) {
1439 		if (dev == rt->rt6i_dev &&
1440 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1441 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1442 			break;
1443 	}
1444 	if (rt)
1445 		dst_hold(&rt->u.dst);
1446 	write_unlock_bh(&rt6_lock);
1447 	return rt;
1448 }
1449 
1450 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1451 				     struct net_device *dev,
1452 				     unsigned int pref)
1453 {
1454 	struct in6_rtmsg rtmsg;
1455 
1456 	memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1457 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1458 	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1459 	rtmsg.rtmsg_metric = 1024;
1460 	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1461 			    RTF_PREF(pref);
1462 
1463 	rtmsg.rtmsg_ifindex = dev->ifindex;
1464 
1465 	ip6_route_add(&rtmsg, NULL, NULL, NULL);
1466 	return rt6_get_dflt_router(gwaddr, dev);
1467 }
1468 
1469 void rt6_purge_dflt_routers(void)
1470 {
1471 	struct rt6_info *rt;
1472 
1473 restart:
1474 	read_lock_bh(&rt6_lock);
1475 	for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1476 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1477 			dst_hold(&rt->u.dst);
1478 
1479 			read_unlock_bh(&rt6_lock);
1480 
1481 			ip6_del_rt(rt, NULL, NULL, NULL);
1482 
1483 			goto restart;
1484 		}
1485 	}
1486 	read_unlock_bh(&rt6_lock);
1487 }
1488 
1489 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1490 {
1491 	struct in6_rtmsg rtmsg;
1492 	int err;
1493 
1494 	switch(cmd) {
1495 	case SIOCADDRT:		/* Add a route */
1496 	case SIOCDELRT:		/* Delete a route */
1497 		if (!capable(CAP_NET_ADMIN))
1498 			return -EPERM;
1499 		err = copy_from_user(&rtmsg, arg,
1500 				     sizeof(struct in6_rtmsg));
1501 		if (err)
1502 			return -EFAULT;
1503 
1504 		rtnl_lock();
1505 		switch (cmd) {
1506 		case SIOCADDRT:
1507 			err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1508 			break;
1509 		case SIOCDELRT:
1510 			err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1511 			break;
1512 		default:
1513 			err = -EINVAL;
1514 		}
1515 		rtnl_unlock();
1516 
1517 		return err;
1518 	};
1519 
1520 	return -EINVAL;
1521 }
1522 
1523 /*
1524  *	Drop the packet on the floor
1525  */
1526 
1527 static int ip6_pkt_discard(struct sk_buff *skb)
1528 {
1529 	IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1530 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1531 	kfree_skb(skb);
1532 	return 0;
1533 }
1534 
1535 static int ip6_pkt_discard_out(struct sk_buff *skb)
1536 {
1537 	skb->dev = skb->dst->dev;
1538 	return ip6_pkt_discard(skb);
1539 }
1540 
1541 /*
1542  *	Allocate a dst for local (unicast / anycast) address.
1543  */
1544 
1545 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1546 				    const struct in6_addr *addr,
1547 				    int anycast)
1548 {
1549 	struct rt6_info *rt = ip6_dst_alloc();
1550 
1551 	if (rt == NULL)
1552 		return ERR_PTR(-ENOMEM);
1553 
1554 	dev_hold(&loopback_dev);
1555 	in6_dev_hold(idev);
1556 
1557 	rt->u.dst.flags = DST_HOST;
1558 	rt->u.dst.input = ip6_input;
1559 	rt->u.dst.output = ip6_output;
1560 	rt->rt6i_dev = &loopback_dev;
1561 	rt->rt6i_idev = idev;
1562 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1563 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1564 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1565 	rt->u.dst.obsolete = -1;
1566 
1567 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1568 	if (anycast)
1569 		rt->rt6i_flags |= RTF_ANYCAST;
1570 	else
1571 		rt->rt6i_flags |= RTF_LOCAL;
1572 	rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1573 	if (rt->rt6i_nexthop == NULL) {
1574 		dst_free((struct dst_entry *) rt);
1575 		return ERR_PTR(-ENOMEM);
1576 	}
1577 
1578 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1579 	rt->rt6i_dst.plen = 128;
1580 
1581 	atomic_set(&rt->u.dst.__refcnt, 1);
1582 
1583 	return rt;
1584 }
1585 
1586 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1587 {
1588 	if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1589 	    rt != &ip6_null_entry) {
1590 		RT6_TRACE("deleted by ifdown %p\n", rt);
1591 		return -1;
1592 	}
1593 	return 0;
1594 }
1595 
1596 void rt6_ifdown(struct net_device *dev)
1597 {
1598 	write_lock_bh(&rt6_lock);
1599 	fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1600 	write_unlock_bh(&rt6_lock);
1601 }
1602 
/* Argument bundle handed from rt6_mtu_change() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;		/* device whose MTU changed */
	unsigned mtu;			/* the new MTU */
};
1608 
1609 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1610 {
1611 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1612 	struct inet6_dev *idev;
1613 
1614 	/* In IPv6 pmtu discovery is not optional,
1615 	   so that RTAX_MTU lock cannot disable it.
1616 	   We still use this lock to block changes
1617 	   caused by addrconf/ndisc.
1618 	*/
1619 
1620 	idev = __in6_dev_get(arg->dev);
1621 	if (idev == NULL)
1622 		return 0;
1623 
1624 	/* For administrative MTU increase, there is no way to discover
1625 	   IPv6 PMTU increase, so PMTU increase should be updated here.
1626 	   Since RFC 1981 doesn't include administrative MTU increase
1627 	   update PMTU increase is a MUST. (i.e. jumbo frame)
1628 	 */
1629 	/*
1630 	   If new MTU is less than route PMTU, this new MTU will be the
1631 	   lowest MTU in the path, update the route PMTU to reflect PMTU
1632 	   decreases; if new MTU is greater than route PMTU, and the
1633 	   old MTU is the lowest MTU in the path, update the route PMTU
1634 	   to reflect the increase. In this case if the other nodes' MTU
1635 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
1636 	   PMTU discouvery.
1637 	 */
1638 	if (rt->rt6i_dev == arg->dev &&
1639 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1640             (dst_mtu(&rt->u.dst) > arg->mtu ||
1641              (dst_mtu(&rt->u.dst) < arg->mtu &&
1642 	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1643 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1644 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1645 	return 0;
1646 }
1647 
1648 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1649 {
1650 	struct rt6_mtu_change_arg arg;
1651 
1652 	arg.dev = dev;
1653 	arg.mtu = mtu;
1654 	read_lock_bh(&rt6_lock);
1655 	fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1656 	read_unlock_bh(&rt6_lock);
1657 }
1658 
1659 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1660 			      struct in6_rtmsg *rtmsg)
1661 {
1662 	memset(rtmsg, 0, sizeof(*rtmsg));
1663 
1664 	rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1665 	rtmsg->rtmsg_src_len = r->rtm_src_len;
1666 	rtmsg->rtmsg_flags = RTF_UP;
1667 	if (r->rtm_type == RTN_UNREACHABLE)
1668 		rtmsg->rtmsg_flags |= RTF_REJECT;
1669 
1670 	if (rta[RTA_GATEWAY-1]) {
1671 		if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1672 			return -EINVAL;
1673 		memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1674 		rtmsg->rtmsg_flags |= RTF_GATEWAY;
1675 	}
1676 	if (rta[RTA_DST-1]) {
1677 		if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1678 			return -EINVAL;
1679 		memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1680 	}
1681 	if (rta[RTA_SRC-1]) {
1682 		if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1683 			return -EINVAL;
1684 		memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1685 	}
1686 	if (rta[RTA_OIF-1]) {
1687 		if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1688 			return -EINVAL;
1689 		memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1690 	}
1691 	if (rta[RTA_PRIORITY-1]) {
1692 		if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1693 			return -EINVAL;
1694 		memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1695 	}
1696 	return 0;
1697 }
1698 
1699 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1700 {
1701 	struct rtmsg *r = NLMSG_DATA(nlh);
1702 	struct in6_rtmsg rtmsg;
1703 
1704 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1705 		return -EINVAL;
1706 	return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1707 }
1708 
1709 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1710 {
1711 	struct rtmsg *r = NLMSG_DATA(nlh);
1712 	struct in6_rtmsg rtmsg;
1713 
1714 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1715 		return -EINVAL;
1716 	return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1717 }
1718 
/* Per-call context that inet6_dump_fib() attaches to the tree walker. */
struct rt6_rtnl_dump_arg
{
	struct sk_buff *skb;		/* buffer the dump is written into */
	struct netlink_callback *cb;	/* netlink dump callback state */
};
1724 
1725 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1726 			 struct in6_addr *dst, struct in6_addr *src,
1727 			 int iif, int type, u32 pid, u32 seq,
1728 			 int prefix, unsigned int flags)
1729 {
1730 	struct rtmsg *rtm;
1731 	struct nlmsghdr  *nlh;
1732 	unsigned char	 *b = skb->tail;
1733 	struct rta_cacheinfo ci;
1734 
1735 	if (prefix) {	/* user wants prefix routes only */
1736 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1737 			/* success since this is not a prefix route */
1738 			return 1;
1739 		}
1740 	}
1741 
1742 	nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1743 	rtm = NLMSG_DATA(nlh);
1744 	rtm->rtm_family = AF_INET6;
1745 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
1746 	rtm->rtm_src_len = rt->rt6i_src.plen;
1747 	rtm->rtm_tos = 0;
1748 	rtm->rtm_table = RT_TABLE_MAIN;
1749 	if (rt->rt6i_flags&RTF_REJECT)
1750 		rtm->rtm_type = RTN_UNREACHABLE;
1751 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1752 		rtm->rtm_type = RTN_LOCAL;
1753 	else
1754 		rtm->rtm_type = RTN_UNICAST;
1755 	rtm->rtm_flags = 0;
1756 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1757 	rtm->rtm_protocol = rt->rt6i_protocol;
1758 	if (rt->rt6i_flags&RTF_DYNAMIC)
1759 		rtm->rtm_protocol = RTPROT_REDIRECT;
1760 	else if (rt->rt6i_flags & RTF_ADDRCONF)
1761 		rtm->rtm_protocol = RTPROT_KERNEL;
1762 	else if (rt->rt6i_flags&RTF_DEFAULT)
1763 		rtm->rtm_protocol = RTPROT_RA;
1764 
1765 	if (rt->rt6i_flags&RTF_CACHE)
1766 		rtm->rtm_flags |= RTM_F_CLONED;
1767 
1768 	if (dst) {
1769 		RTA_PUT(skb, RTA_DST, 16, dst);
1770 	        rtm->rtm_dst_len = 128;
1771 	} else if (rtm->rtm_dst_len)
1772 		RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1773 #ifdef CONFIG_IPV6_SUBTREES
1774 	if (src) {
1775 		RTA_PUT(skb, RTA_SRC, 16, src);
1776 	        rtm->rtm_src_len = 128;
1777 	} else if (rtm->rtm_src_len)
1778 		RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1779 #endif
1780 	if (iif)
1781 		RTA_PUT(skb, RTA_IIF, 4, &iif);
1782 	else if (dst) {
1783 		struct in6_addr saddr_buf;
1784 		if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1785 			RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1786 	}
1787 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1788 		goto rtattr_failure;
1789 	if (rt->u.dst.neighbour)
1790 		RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1791 	if (rt->u.dst.dev)
1792 		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1793 	RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1794 	ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1795 	if (rt->rt6i_expires)
1796 		ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1797 	else
1798 		ci.rta_expires = 0;
1799 	ci.rta_used = rt->u.dst.__use;
1800 	ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1801 	ci.rta_error = rt->u.dst.error;
1802 	ci.rta_id = 0;
1803 	ci.rta_ts = 0;
1804 	ci.rta_tsage = 0;
1805 	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1806 	nlh->nlmsg_len = skb->tail - b;
1807 	return skb->len;
1808 
1809 nlmsg_failure:
1810 rtattr_failure:
1811 	skb_trim(skb, b - skb->data);
1812 	return -1;
1813 }
1814 
1815 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1816 {
1817 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1818 	int prefix;
1819 
1820 	if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1821 		struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1822 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1823 	} else
1824 		prefix = 0;
1825 
1826 	return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1827 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1828 		     prefix, NLM_F_MULTI);
1829 }
1830 
1831 static int fib6_dump_node(struct fib6_walker_t *w)
1832 {
1833 	int res;
1834 	struct rt6_info *rt;
1835 
1836 	for (rt = w->leaf; rt; rt = rt->u.next) {
1837 		res = rt6_dump_route(rt, w->args);
1838 		if (res < 0) {
1839 			/* Frame is full, suspend walking */
1840 			w->leaf = rt;
1841 			return 1;
1842 		}
1843 		BUG_TRAP(res!=0);
1844 	}
1845 	w->leaf = NULL;
1846 	return 0;
1847 }
1848 
1849 static void fib6_dump_end(struct netlink_callback *cb)
1850 {
1851 	struct fib6_walker_t *w = (void*)cb->args[0];
1852 
1853 	if (w) {
1854 		cb->args[0] = 0;
1855 		fib6_walker_unlink(w);
1856 		kfree(w);
1857 	}
1858 	cb->done = (void*)cb->args[1];
1859 	cb->args[1] = 0;
1860 }
1861 
1862 static int fib6_dump_done(struct netlink_callback *cb)
1863 {
1864 	fib6_dump_end(cb);
1865 	return cb->done ? cb->done(cb) : 0;
1866 }
1867 
1868 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1869 {
1870 	struct rt6_rtnl_dump_arg arg;
1871 	struct fib6_walker_t *w;
1872 	int res;
1873 
1874 	arg.skb = skb;
1875 	arg.cb = cb;
1876 
1877 	w = (void*)cb->args[0];
1878 	if (w == NULL) {
1879 		/* New dump:
1880 		 *
1881 		 * 1. hook callback destructor.
1882 		 */
1883 		cb->args[1] = (long)cb->done;
1884 		cb->done = fib6_dump_done;
1885 
1886 		/*
1887 		 * 2. allocate and initialize walker.
1888 		 */
1889 		w = kzalloc(sizeof(*w), GFP_ATOMIC);
1890 		if (w == NULL)
1891 			return -ENOMEM;
1892 		RT6_TRACE("dump<%p", w);
1893 		w->root = &ip6_routing_table;
1894 		w->func = fib6_dump_node;
1895 		w->args = &arg;
1896 		cb->args[0] = (long)w;
1897 		read_lock_bh(&rt6_lock);
1898 		res = fib6_walk(w);
1899 		read_unlock_bh(&rt6_lock);
1900 	} else {
1901 		w->args = &arg;
1902 		read_lock_bh(&rt6_lock);
1903 		res = fib6_walk_continue(w);
1904 		read_unlock_bh(&rt6_lock);
1905 	}
1906 #if RT6_DEBUG >= 3
1907 	if (res <= 0 && skb->len == 0)
1908 		RT6_TRACE("%p>dump end\n", w);
1909 #endif
1910 	res = res < 0 ? res : skb->len;
1911 	/* res < 0 is an error. (really, impossible)
1912 	   res == 0 means that dump is complete, but skb still can contain data.
1913 	   res > 0 dump is not complete, but frame is full.
1914 	 */
1915 	/* Destroy walker, if dump of this table is complete. */
1916 	if (res <= 0)
1917 		fib6_dump_end(cb);
1918 	return res;
1919 }
1920 
1921 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1922 {
1923 	struct rtattr **rta = arg;
1924 	int iif = 0;
1925 	int err = -ENOBUFS;
1926 	struct sk_buff *skb;
1927 	struct flowi fl;
1928 	struct rt6_info *rt;
1929 
1930 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1931 	if (skb == NULL)
1932 		goto out;
1933 
1934 	/* Reserve room for dummy headers, this skb can pass
1935 	   through good chunk of routing engine.
1936 	 */
1937 	skb->mac.raw = skb->data;
1938 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1939 
1940 	memset(&fl, 0, sizeof(fl));
1941 	if (rta[RTA_SRC-1])
1942 		ipv6_addr_copy(&fl.fl6_src,
1943 			       (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1944 	if (rta[RTA_DST-1])
1945 		ipv6_addr_copy(&fl.fl6_dst,
1946 			       (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1947 
1948 	if (rta[RTA_IIF-1])
1949 		memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1950 
1951 	if (iif) {
1952 		struct net_device *dev;
1953 		dev = __dev_get_by_index(iif);
1954 		if (!dev) {
1955 			err = -ENODEV;
1956 			goto out_free;
1957 		}
1958 	}
1959 
1960 	fl.oif = 0;
1961 	if (rta[RTA_OIF-1])
1962 		memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1963 
1964 	rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1965 
1966 	skb->dst = &rt->u.dst;
1967 
1968 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1969 	err = rt6_fill_node(skb, rt,
1970 			    &fl.fl6_dst, &fl.fl6_src,
1971 			    iif,
1972 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1973 			    nlh->nlmsg_seq, 0, 0);
1974 	if (err < 0) {
1975 		err = -EMSGSIZE;
1976 		goto out_free;
1977 	}
1978 
1979 	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1980 	if (err > 0)
1981 		err = 0;
1982 out:
1983 	return err;
1984 out_free:
1985 	kfree_skb(skb);
1986 	goto out;
1987 }
1988 
1989 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
1990 			struct netlink_skb_parms *req)
1991 {
1992 	struct sk_buff *skb;
1993 	int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
1994 	u32 pid = current->pid;
1995 	u32 seq = 0;
1996 
1997 	if (req)
1998 		pid = req->pid;
1999 	if (nlh)
2000 		seq = nlh->nlmsg_seq;
2001 
2002 	skb = alloc_skb(size, gfp_any());
2003 	if (!skb) {
2004 		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
2005 		return;
2006 	}
2007 	if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
2008 		kfree_skb(skb);
2009 		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
2010 		return;
2011 	}
2012 	NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2013 	netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2014 }
2015 
2016 /*
2017  *	/proc
2018  */
2019 
2020 #ifdef CONFIG_PROC_FS
2021 
/* Record size used by the /proc output code to convert a byte offset into a record index. */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg
{
	char *buffer;	/* output buffer */
	int offset;	/* byte offset requested by the reader */
	int length;	/* number of bytes requested */
	int skip;	/* records skipped so far */
	int len;	/* bytes written to buffer so far */
};
2032 
2033 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2034 {
2035 	struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2036 	int i;
2037 
2038 	if (arg->skip < arg->offset / RT6_INFO_LEN) {
2039 		arg->skip++;
2040 		return 0;
2041 	}
2042 
2043 	if (arg->len >= arg->length)
2044 		return 0;
2045 
2046 	for (i=0; i<16; i++) {
2047 		sprintf(arg->buffer + arg->len, "%02x",
2048 			rt->rt6i_dst.addr.s6_addr[i]);
2049 		arg->len += 2;
2050 	}
2051 	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2052 			    rt->rt6i_dst.plen);
2053 
2054 #ifdef CONFIG_IPV6_SUBTREES
2055 	for (i=0; i<16; i++) {
2056 		sprintf(arg->buffer + arg->len, "%02x",
2057 			rt->rt6i_src.addr.s6_addr[i]);
2058 		arg->len += 2;
2059 	}
2060 	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2061 			    rt->rt6i_src.plen);
2062 #else
2063 	sprintf(arg->buffer + arg->len,
2064 		"00000000000000000000000000000000 00 ");
2065 	arg->len += 36;
2066 #endif
2067 
2068 	if (rt->rt6i_nexthop) {
2069 		for (i=0; i<16; i++) {
2070 			sprintf(arg->buffer + arg->len, "%02x",
2071 				rt->rt6i_nexthop->primary_key[i]);
2072 			arg->len += 2;
2073 		}
2074 	} else {
2075 		sprintf(arg->buffer + arg->len,
2076 			"00000000000000000000000000000000");
2077 		arg->len += 32;
2078 	}
2079 	arg->len += sprintf(arg->buffer + arg->len,
2080 			    " %08x %08x %08x %08x %8s\n",
2081 			    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2082 			    rt->u.dst.__use, rt->rt6i_flags,
2083 			    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2084 	return 0;
2085 }
2086 
2087 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2088 {
2089 	struct rt6_proc_arg arg;
2090 	arg.buffer = buffer;
2091 	arg.offset = offset;
2092 	arg.length = length;
2093 	arg.skip = 0;
2094 	arg.len = 0;
2095 
2096 	read_lock_bh(&rt6_lock);
2097 	fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2098 	read_unlock_bh(&rt6_lock);
2099 
2100 	*start = buffer;
2101 	if (offset)
2102 		*start += offset % RT6_INFO_LEN;
2103 
2104 	arg.len -= offset % RT6_INFO_LEN;
2105 
2106 	if (arg.len > length)
2107 		arg.len = length;
2108 	if (arg.len < 0)
2109 		arg.len = 0;
2110 
2111 	return arg.len;
2112 }
2113 
2114 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2115 {
2116 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2117 		      rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2118 		      rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2119 		      rt6_stats.fib_rt_cache,
2120 		      atomic_read(&ip6_dst_ops.entries),
2121 		      rt6_stats.fib_discarded_routes);
2122 
2123 	return 0;
2124 }
2125 
2126 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2127 {
2128 	return single_open(file, rt6_stats_seq_show, NULL);
2129 }
2130 
2131 static struct file_operations rt6_stats_seq_fops = {
2132 	.owner	 = THIS_MODULE,
2133 	.open	 = rt6_stats_seq_open,
2134 	.read	 = seq_read,
2135 	.llseek	 = seq_lseek,
2136 	.release = single_release,
2137 };
2138 #endif	/* CONFIG_PROC_FS */
2139 
2140 #ifdef CONFIG_SYSCTL
2141 
2142 static int flush_delay;
2143 
2144 static
2145 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2146 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2147 {
2148 	if (write) {
2149 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2150 		fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2151 		return 0;
2152 	} else
2153 		return -EINVAL;
2154 }
2155 
2156 ctl_table ipv6_route_table[] = {
2157         {
2158 		.ctl_name	=	NET_IPV6_ROUTE_FLUSH,
2159 		.procname	=	"flush",
2160          	.data		=	&flush_delay,
2161 		.maxlen		=	sizeof(int),
2162 		.mode		=	0200,
2163          	.proc_handler	=	&ipv6_sysctl_rtcache_flush
2164 	},
2165 	{
2166 		.ctl_name	=	NET_IPV6_ROUTE_GC_THRESH,
2167 		.procname	=	"gc_thresh",
2168          	.data		=	&ip6_dst_ops.gc_thresh,
2169 		.maxlen		=	sizeof(int),
2170 		.mode		=	0644,
2171          	.proc_handler	=	&proc_dointvec,
2172 	},
2173 	{
2174 		.ctl_name	=	NET_IPV6_ROUTE_MAX_SIZE,
2175 		.procname	=	"max_size",
2176          	.data		=	&ip6_rt_max_size,
2177 		.maxlen		=	sizeof(int),
2178 		.mode		=	0644,
2179          	.proc_handler	=	&proc_dointvec,
2180 	},
2181 	{
2182 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2183 		.procname	=	"gc_min_interval",
2184          	.data		=	&ip6_rt_gc_min_interval,
2185 		.maxlen		=	sizeof(int),
2186 		.mode		=	0644,
2187          	.proc_handler	=	&proc_dointvec_jiffies,
2188 		.strategy	=	&sysctl_jiffies,
2189 	},
2190 	{
2191 		.ctl_name	=	NET_IPV6_ROUTE_GC_TIMEOUT,
2192 		.procname	=	"gc_timeout",
2193          	.data		=	&ip6_rt_gc_timeout,
2194 		.maxlen		=	sizeof(int),
2195 		.mode		=	0644,
2196          	.proc_handler	=	&proc_dointvec_jiffies,
2197 		.strategy	=	&sysctl_jiffies,
2198 	},
2199 	{
2200 		.ctl_name	=	NET_IPV6_ROUTE_GC_INTERVAL,
2201 		.procname	=	"gc_interval",
2202          	.data		=	&ip6_rt_gc_interval,
2203 		.maxlen		=	sizeof(int),
2204 		.mode		=	0644,
2205          	.proc_handler	=	&proc_dointvec_jiffies,
2206 		.strategy	=	&sysctl_jiffies,
2207 	},
2208 	{
2209 		.ctl_name	=	NET_IPV6_ROUTE_GC_ELASTICITY,
2210 		.procname	=	"gc_elasticity",
2211          	.data		=	&ip6_rt_gc_elasticity,
2212 		.maxlen		=	sizeof(int),
2213 		.mode		=	0644,
2214          	.proc_handler	=	&proc_dointvec_jiffies,
2215 		.strategy	=	&sysctl_jiffies,
2216 	},
2217 	{
2218 		.ctl_name	=	NET_IPV6_ROUTE_MTU_EXPIRES,
2219 		.procname	=	"mtu_expires",
2220          	.data		=	&ip6_rt_mtu_expires,
2221 		.maxlen		=	sizeof(int),
2222 		.mode		=	0644,
2223          	.proc_handler	=	&proc_dointvec_jiffies,
2224 		.strategy	=	&sysctl_jiffies,
2225 	},
2226 	{
2227 		.ctl_name	=	NET_IPV6_ROUTE_MIN_ADVMSS,
2228 		.procname	=	"min_adv_mss",
2229          	.data		=	&ip6_rt_min_advmss,
2230 		.maxlen		=	sizeof(int),
2231 		.mode		=	0644,
2232          	.proc_handler	=	&proc_dointvec_jiffies,
2233 		.strategy	=	&sysctl_jiffies,
2234 	},
2235 	{
2236 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2237 		.procname	=	"gc_min_interval_ms",
2238          	.data		=	&ip6_rt_gc_min_interval,
2239 		.maxlen		=	sizeof(int),
2240 		.mode		=	0644,
2241          	.proc_handler	=	&proc_dointvec_ms_jiffies,
2242 		.strategy	=	&sysctl_ms_jiffies,
2243 	},
2244 	{ .ctl_name = 0 }
2245 };
2246 
2247 #endif
2248 
2249 void __init ip6_route_init(void)
2250 {
2251 	struct proc_dir_entry *p;
2252 
2253 	ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2254 						     sizeof(struct rt6_info),
2255 						     0, SLAB_HWCACHE_ALIGN,
2256 						     NULL, NULL);
2257 	if (!ip6_dst_ops.kmem_cachep)
2258 		panic("cannot create ip6_dst_cache");
2259 
2260 	fib6_init();
2261 #ifdef 	CONFIG_PROC_FS
2262 	p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2263 	if (p)
2264 		p->owner = THIS_MODULE;
2265 
2266 	proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2267 #endif
2268 #ifdef CONFIG_XFRM
2269 	xfrm6_init();
2270 #endif
2271 }
2272 
2273 void ip6_route_cleanup(void)
2274 {
2275 #ifdef CONFIG_PROC_FS
2276 	proc_net_remove("ipv6_route");
2277 	proc_net_remove("rt6_stats");
2278 #endif
2279 #ifdef CONFIG_XFRM
2280 	xfrm6_fini();
2281 #endif
2282 	rt6_ifdown(NULL);
2283 	fib6_gc_cleanup();
2284 	kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2285 }
2286