xref: /linux/net/ipv6/route.c (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66 
67 #include <asm/uaccess.h>
68 
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72 
/*
 * Result of scoring a route's next hop.  Negative values are failures and
 * are passed through unchanged by rt6_score_route(); find_match() decides
 * what each one means for route selection.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD  = -3,	/* find_match() skips the route */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour is in NUD_FAILED state */
	RT6_NUD_FAIL_DO_RR = -1,	/* scored as 0 and requests round-robin */
	RT6_NUD_SUCCEED    = 1,
};
79 
/*
 * Forward declarations.  Several of these (ip6_dst_check, ip6_mtu,
 * ip6_dst_destroy, ...) are referenced by the dst_ops tables and by the
 * reject-route templates further down, ahead of their definitions.
 */
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void		ip6_dst_destroy(struct dst_entry *);
86 static void		ip6_dst_ifdown(struct dst_entry *,
87 				       struct net_device *dev, int how);
88 static int		 ip6_dst_gc(struct dst_ops *ops);
89 
/* Input/output handlers installed on the null and prohibit route templates. */
90 static int		ip6_pkt_discard(struct sk_buff *skb);
91 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int		ip6_pkt_prohibit(struct sk_buff *skb);
93 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void		ip6_link_failure(struct sk_buff *skb);
95 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96 					   struct sk_buff *skb, u32 mtu);
97 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98 					struct sk_buff *skb);
99 static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101 
/* Route Information option helpers; used by rt6_route_rcv(). */
102 #ifdef CONFIG_IPV6_ROUTE_INFO
103 static struct rt6_info *rt6_add_route_info(struct net *net,
104 					   const struct in6_addr *prefix, int prefixlen,
105 					   const struct in6_addr *gwaddr, int ifindex,
106 					   unsigned int pref);
107 static struct rt6_info *rt6_get_route_info(struct net *net,
108 					   const struct in6_addr *prefix, int prefixlen,
109 					   const struct in6_addr *gwaddr, int ifindex);
110 #endif
111 
/*
 * Per-CPU list of rt6_info entries, linked through rt6_info.rt6i_uncached.
 * Entries are added by rt6_uncached_list_add() and walked by
 * rt6_uncached_list_flush_dev(); every access to @head is made under @lock.
 */
112 struct uncached_list {
113 	spinlock_t		lock;
114 	struct list_head	head;
115 };
116 
/* One list instance per CPU. */
117 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
118 
119 static void rt6_uncached_list_add(struct rt6_info *rt)
120 {
121 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
122 
123 	rt->dst.flags |= DST_NOCACHE;
124 	rt->rt6i_uncached_list = ul;
125 
126 	spin_lock_bh(&ul->lock);
127 	list_add_tail(&rt->rt6i_uncached, &ul->head);
128 	spin_unlock_bh(&ul->lock);
129 }
130 
131 static void rt6_uncached_list_del(struct rt6_info *rt)
132 {
133 	if (!list_empty(&rt->rt6i_uncached)) {
134 		struct uncached_list *ul = rt->rt6i_uncached_list;
135 
136 		spin_lock_bh(&ul->lock);
137 		list_del(&rt->rt6i_uncached);
138 		spin_unlock_bh(&ul->lock);
139 	}
140 }
141 
142 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
143 {
144 	struct net_device *loopback_dev = net->loopback_dev;
145 	int cpu;
146 
147 	if (dev == loopback_dev)
148 		return;
149 
150 	for_each_possible_cpu(cpu) {
151 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
152 		struct rt6_info *rt;
153 
154 		spin_lock_bh(&ul->lock);
155 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
156 			struct inet6_dev *rt_idev = rt->rt6i_idev;
157 			struct net_device *rt_dev = rt->dst.dev;
158 
159 			if (rt_idev->dev == dev) {
160 				rt->rt6i_idev = in6_dev_get(loopback_dev);
161 				in6_dev_put(rt_idev);
162 			}
163 
164 			if (rt_dev == dev) {
165 				rt->dst.dev = loopback_dev;
166 				dev_hold(rt->dst.dev);
167 				dev_put(rt_dev);
168 			}
169 		}
170 		spin_unlock_bh(&ul->lock);
171 	}
172 }
173 
174 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
175 {
176 	return dst_metrics_write_ptr(rt->dst.from);
177 }
178 
179 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
180 {
181 	struct rt6_info *rt = (struct rt6_info *)dst;
182 
183 	if (rt->rt6i_flags & RTF_PCPU)
184 		return rt6_pcpu_cow_metrics(rt);
185 	else if (rt->rt6i_flags & RTF_CACHE)
186 		return NULL;
187 	else
188 		return dst_cow_metrics_generic(dst, old);
189 }
190 
191 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
192 					     struct sk_buff *skb,
193 					     const void *daddr)
194 {
195 	struct in6_addr *p = &rt->rt6i_gateway;
196 
197 	if (!ipv6_addr_any(p))
198 		return (const void *) p;
199 	else if (skb)
200 		return &ipv6_hdr(skb)->daddr;
201 	return daddr;
202 }
203 
204 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
205 					  struct sk_buff *skb,
206 					  const void *daddr)
207 {
208 	struct rt6_info *rt = (struct rt6_info *) dst;
209 	struct neighbour *n;
210 
211 	daddr = choose_neigh_daddr(rt, skb, daddr);
212 	n = __ipv6_neigh_lookup(dst->dev, daddr);
213 	if (n)
214 		return n;
215 	return neigh_create(&nd_tbl, daddr, dst->dev);
216 }
217 
218 static struct dst_ops ip6_dst_ops_template = {
219 	.family			=	AF_INET6,
220 	.gc			=	ip6_dst_gc,
221 	.gc_thresh		=	1024,
222 	.check			=	ip6_dst_check,
223 	.default_advmss		=	ip6_default_advmss,
224 	.mtu			=	ip6_mtu,
225 	.cow_metrics		=	ipv6_cow_metrics,
226 	.destroy		=	ip6_dst_destroy,
227 	.ifdown			=	ip6_dst_ifdown,
228 	.negative_advice	=	ip6_negative_advice,
229 	.link_failure		=	ip6_link_failure,
230 	.update_pmtu		=	ip6_rt_update_pmtu,
231 	.redirect		=	rt6_do_redirect,
232 	.local_out		=	__ip6_local_out,
233 	.neigh_lookup		=	ip6_neigh_lookup,
234 };
235 
236 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
237 {
238 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
239 
240 	return mtu ? : dst->dev->mtu;
241 }
242 
243 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
244 					 struct sk_buff *skb, u32 mtu)
245 {
246 }
247 
/* Redirects are ignored on blackhole dsts: intentionally a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
				      struct sock *sk,
				      struct sk_buff *skb)
{
}
252 
253 static struct dst_ops ip6_dst_blackhole_ops = {
254 	.family			=	AF_INET6,
255 	.destroy		=	ip6_dst_destroy,
256 	.check			=	ip6_dst_check,
257 	.mtu			=	ip6_blackhole_mtu,
258 	.default_advmss		=	ip6_default_advmss,
259 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
260 	.redirect		=	ip6_rt_blackhole_redirect,
261 	.cow_metrics		=	dst_cow_metrics_generic,
262 	.neigh_lookup		=	ip6_neigh_lookup,
263 };
264 
265 static const u32 ip6_template_metrics[RTAX_MAX] = {
266 	[RTAX_HOPLIMIT - 1] = 0,
267 };
268 
269 static const struct rt6_info ip6_null_entry_template = {
270 	.dst = {
271 		.__refcnt	= ATOMIC_INIT(1),
272 		.__use		= 1,
273 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
274 		.error		= -ENETUNREACH,
275 		.input		= ip6_pkt_discard,
276 		.output		= ip6_pkt_discard_out,
277 	},
278 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
279 	.rt6i_protocol  = RTPROT_KERNEL,
280 	.rt6i_metric	= ~(u32) 0,
281 	.rt6i_ref	= ATOMIC_INIT(1),
282 };
283 
284 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
285 
286 static const struct rt6_info ip6_prohibit_entry_template = {
287 	.dst = {
288 		.__refcnt	= ATOMIC_INIT(1),
289 		.__use		= 1,
290 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
291 		.error		= -EACCES,
292 		.input		= ip6_pkt_prohibit,
293 		.output		= ip6_pkt_prohibit_out,
294 	},
295 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
296 	.rt6i_protocol  = RTPROT_KERNEL,
297 	.rt6i_metric	= ~(u32) 0,
298 	.rt6i_ref	= ATOMIC_INIT(1),
299 };
300 
301 static const struct rt6_info ip6_blk_hole_entry_template = {
302 	.dst = {
303 		.__refcnt	= ATOMIC_INIT(1),
304 		.__use		= 1,
305 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
306 		.error		= -EINVAL,
307 		.input		= dst_discard,
308 		.output		= dst_discard_out,
309 	},
310 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
311 	.rt6i_protocol  = RTPROT_KERNEL,
312 	.rt6i_metric	= ~(u32) 0,
313 	.rt6i_ref	= ATOMIC_INIT(1),
314 };
315 
316 #endif
317 
318 static void rt6_info_init(struct rt6_info *rt)
319 {
320 	struct dst_entry *dst = &rt->dst;
321 
322 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
323 	INIT_LIST_HEAD(&rt->rt6i_siblings);
324 	INIT_LIST_HEAD(&rt->rt6i_uncached);
325 }
326 
327 /* allocate dst with ip6_dst_ops */
328 static struct rt6_info *__ip6_dst_alloc(struct net *net,
329 					struct net_device *dev,
330 					int flags)
331 {
332 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
333 					0, DST_OBSOLETE_FORCE_CHK, flags);
334 
335 	if (rt)
336 		rt6_info_init(rt);
337 
338 	return rt;
339 }
340 
341 struct rt6_info *ip6_dst_alloc(struct net *net,
342 			       struct net_device *dev,
343 			       int flags)
344 {
345 	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
346 
347 	if (rt) {
348 		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
349 		if (rt->rt6i_pcpu) {
350 			int cpu;
351 
352 			for_each_possible_cpu(cpu) {
353 				struct rt6_info **p;
354 
355 				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
356 				/* no one shares rt */
357 				*p =  NULL;
358 			}
359 		} else {
360 			dst_destroy((struct dst_entry *)rt);
361 			return NULL;
362 		}
363 	}
364 
365 	return rt;
366 }
367 EXPORT_SYMBOL(ip6_dst_alloc);
368 
/*
 * dst_ops->destroy hook: release everything an rt6_info owns.
 * Frees the metrics and the per-cpu route array, unlinks the route from
 * its uncached list, drops the inet6_dev reference and finally releases
 * the reference held on the dst this route was derived from (dst->from).
 */
369 static void ip6_dst_destroy(struct dst_entry *dst)
370 {
371 	struct rt6_info *rt = (struct rt6_info *)dst;
372 	struct dst_entry *from = dst->from;
373 	struct inet6_dev *idev;
374 
375 	dst_destroy_metrics_generic(dst);
376 	free_percpu(rt->rt6i_pcpu);
377 	rt6_uncached_list_del(rt);
378 
379 	idev = rt->rt6i_idev;
380 	if (idev) {
	/* clear the pointer before dropping the reference */
381 		rt->rt6i_idev = NULL;
382 		in6_dev_put(idev);
383 	}
384 
	/* 'from' was sampled on entry; clear the field, then drop the ref */
385 	dst->from = NULL;
386 	dst_release(from);
387 }
388 
389 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
390 			   int how)
391 {
392 	struct rt6_info *rt = (struct rt6_info *)dst;
393 	struct inet6_dev *idev = rt->rt6i_idev;
394 	struct net_device *loopback_dev =
395 		dev_net(dev)->loopback_dev;
396 
397 	if (dev != loopback_dev) {
398 		if (idev && idev->dev == dev) {
399 			struct inet6_dev *loopback_idev =
400 				in6_dev_get(loopback_dev);
401 			if (loopback_idev) {
402 				rt->rt6i_idev = loopback_idev;
403 				in6_dev_put(idev);
404 			}
405 		}
406 	}
407 }
408 
409 static bool __rt6_check_expired(const struct rt6_info *rt)
410 {
411 	if (rt->rt6i_flags & RTF_EXPIRES)
412 		return time_after(jiffies, rt->dst.expires);
413 	else
414 		return false;
415 }
416 
417 static bool rt6_check_expired(const struct rt6_info *rt)
418 {
419 	if (rt->rt6i_flags & RTF_EXPIRES) {
420 		if (time_after(jiffies, rt->dst.expires))
421 			return true;
422 	} else if (rt->dst.from) {
423 		return rt6_check_expired((struct rt6_info *) rt->dst.from);
424 	}
425 	return false;
426 }
427 
/* Multipath route selection:
 *   Map a flow onto one of @candidate_count next hops by hashing the flow
 *   (get_hash_from_flowi6()) and reducing it modulo the candidate count.
 * Adapted from fib_info_hashfn()
 */
static int rt6_info_hash_nhsfn(unsigned int candidate_count,
			       const struct flowi6 *fl6)
{
	unsigned int flow_hash = get_hash_from_flowi6(fl6);

	return flow_hash % candidate_count;
}
437 
438 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
439 					     struct flowi6 *fl6, int oif,
440 					     int strict)
441 {
442 	struct rt6_info *sibling, *next_sibling;
443 	int route_choosen;
444 
445 	route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
446 	/* Don't change the route, if route_choosen == 0
447 	 * (siblings does not include ourself)
448 	 */
449 	if (route_choosen)
450 		list_for_each_entry_safe(sibling, next_sibling,
451 				&match->rt6i_siblings, rt6i_siblings) {
452 			route_choosen--;
453 			if (route_choosen == 0) {
454 				if (rt6_score_route(sibling, oif, strict) < 0)
455 					break;
456 				match = sibling;
457 				break;
458 			}
459 		}
460 	return match;
461 }
462 
463 /*
464  *	Route lookup. Any table->tb6_lock is implied.
465  */
466 
/*
 * Walk the rt6_next chain starting at @rt and pick the entry that fits the
 * requested interface or source address.
 *
 *  - @oif == 0 and @saddr unspecified: return @rt unchanged.
 *  - @oif != 0: return the first route whose device has that ifindex.
 *    Routes on a loopback device are remembered in 'local' as a fallback
 *    (subject to the RT6_LOOKUP_F_IFACE and idev checks below).
 *  - @oif == 0: return the first route for which ipv6_chk_addr() accepts
 *    @saddr on the route's device.
 *
 * If the walk finds nothing: with @oif set, 'local' is returned if one was
 * recorded, else ip6_null_entry when RT6_LOOKUP_F_IFACE is set; in every
 * remaining case the original @rt is returned.
 */
467 static inline struct rt6_info *rt6_device_match(struct net *net,
468 						    struct rt6_info *rt,
469 						    const struct in6_addr *saddr,
470 						    int oif,
471 						    int flags)
472 {
473 	struct rt6_info *local = NULL;
474 	struct rt6_info *sprt;
475 
476 	if (!oif && ipv6_addr_any(saddr))
477 		goto out;
478 
479 	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
480 		struct net_device *dev = sprt->dst.dev;
481 
482 		if (oif) {
483 			if (dev->ifindex == oif)
484 				return sprt;
485 			if (dev->flags & IFF_LOOPBACK) {
			/* loopback route whose idev is not the requested interface */
486 				if (!sprt->rt6i_idev ||
487 				    sprt->rt6i_idev->dev->ifindex != oif) {
488 					if (flags & RT6_LOOKUP_F_IFACE)
489 						continue;
					/* keep an earlier 'local' whose idev does match oif */
490 					if (local &&
491 					    local->rt6i_idev->dev->ifindex == oif)
492 						continue;
493 				}
494 				local = sprt;
495 			}
496 		} else {
497 			if (ipv6_chk_addr(net, saddr, dev,
498 					  flags & RT6_LOOKUP_F_IFACE))
499 				return sprt;
500 		}
501 	}
502 
503 	if (oif) {
504 		if (local)
505 			return local;
506 
507 		if (flags & RT6_LOOKUP_F_IFACE)
508 			return net->ipv6.ip6_null_entry;
509 	}
510 out:
511 	return rt;
512 }
513 
514 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Deferred router reachability probe.  Allocated and queued by rt6_probe(),
 * consumed and freed by rt6_probe_deferred().
 */
515 struct __rt6_probe_work {
516 	struct work_struct work;
517 	struct in6_addr target;	/* gateway address to solicit */
518 	struct net_device *dev;	/* held with dev_hold() until the work has run */
519 };
520 
521 static void rt6_probe_deferred(struct work_struct *w)
522 {
523 	struct in6_addr mcaddr;
524 	struct __rt6_probe_work *work =
525 		container_of(w, struct __rt6_probe_work, work);
526 
527 	addrconf_addr_solict_mult(&work->target, &mcaddr);
528 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
529 	dev_put(work->dev);
530 	kfree(work);
531 }
532 
/*
 * Queue a reachability probe (neighbour solicitation) for the gateway of
 * @rt.  Does nothing for a NULL route or one without RTF_GATEWAY.
 *
 * If a neighbour entry exists, a probe is queued only when the entry is not
 * NUD_VALID and was last updated more than rtr_probe_interval ago; if no
 * entry exists, a probe is always attempted.  The solicitation itself is
 * sent later from rt6_probe_deferred(); allocation failure silently skips
 * the probe.
 */
533 static void rt6_probe(struct rt6_info *rt)
534 {
535 	struct __rt6_probe_work *work;
536 	struct neighbour *neigh;
537 	/*
538 	 * Okay, this does not seem to be appropriate
539 	 * for now, however, we need to check if it
540 	 * is really so; aka Router Reachability Probing.
541 	 *
542 	 * Router Reachability Probe MUST be rate-limited
543 	 * to no more than one per minute.
544 	 */
545 	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
546 		return;
547 	rcu_read_lock_bh();
548 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
549 	if (neigh) {
		/* unlocked fast path: a valid neighbour needs no probe */
550 		if (neigh->nud_state & NUD_VALID)
551 			goto out;
552 
553 		work = NULL;
554 		write_lock(&neigh->lock);
		/* re-check the state under the lock and apply the rate limit */
555 		if (!(neigh->nud_state & NUD_VALID) &&
556 		    time_after(jiffies,
557 			       neigh->updated +
558 			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
559 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
560 			if (work)
561 				__neigh_set_probe_once(neigh);
562 		}
563 		write_unlock(&neigh->lock);
564 	} else {
565 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
566 	}
567 
568 	if (work) {
569 		INIT_WORK(&work->work, rt6_probe_deferred);
570 		work->target = rt->rt6i_gateway;
		/* reference is dropped by rt6_probe_deferred() */
571 		dev_hold(rt->dst.dev);
572 		work->dev = rt->dst.dev;
573 		schedule_work(&work->work);
574 	}
575 
576 out:
577 	rcu_read_unlock_bh();
578 }
579 #else
/* Stub used when router reachability probing is compiled out: no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
583 #endif
584 
585 /*
586  * Default Router Selection (RFC 2461 6.3.6)
587  */
588 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
589 {
590 	struct net_device *dev = rt->dst.dev;
591 	if (!oif || dev->ifindex == oif)
592 		return 2;
593 	if ((dev->flags & IFF_LOOPBACK) &&
594 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
595 		return 1;
596 	return 0;
597 }
598 
/*
 * Judge the reachability of the next hop of @rt.
 *
 * Routes flagged RTF_NONEXTHOP or lacking RTF_GATEWAY always succeed.
 * Otherwise the gateway's neighbour entry is consulted:
 *   - NUD_VALID                       -> RT6_NUD_SUCCEED
 *   - with CONFIG_IPV6_ROUTER_PREF:
 *       not NUD_FAILED                -> RT6_NUD_SUCCEED
 *       NUD_FAILED                    -> RT6_NUD_FAIL_PROBE
 *   - without it, any non-valid state keeps the initial RT6_NUD_FAIL_HARD
 *   - no neighbour entry at all       -> RT6_NUD_SUCCEED with router
 *                                        preference support, else
 *                                        RT6_NUD_FAIL_DO_RR
 */
599 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
600 {
601 	struct neighbour *neigh;
602 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
603 
604 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
605 	    !(rt->rt6i_flags & RTF_GATEWAY))
606 		return RT6_NUD_SUCCEED;
607 
608 	rcu_read_lock_bh();
609 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
610 	if (neigh) {
611 		read_lock(&neigh->lock);
612 		if (neigh->nud_state & NUD_VALID)
613 			ret = RT6_NUD_SUCCEED;
614 #ifdef CONFIG_IPV6_ROUTER_PREF
615 		else if (!(neigh->nud_state & NUD_FAILED))
616 			ret = RT6_NUD_SUCCEED;
617 		else
618 			ret = RT6_NUD_FAIL_PROBE;
619 #endif
620 		read_unlock(&neigh->lock);
621 	} else {
622 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
623 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
624 	}
625 	rcu_read_unlock_bh();
626 
627 	return ret;
628 }
629 
630 static int rt6_score_route(struct rt6_info *rt, int oif,
631 			   int strict)
632 {
633 	int m;
634 
635 	m = rt6_check_dev(rt, oif);
636 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
637 		return RT6_NUD_FAIL_HARD;
638 #ifdef CONFIG_IPV6_ROUTER_PREF
639 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
640 #endif
641 	if (strict & RT6_LOOKUP_F_REACHABLE) {
642 		int n = rt6_check_neigh(rt);
643 		if (n < 0)
644 			return n;
645 	}
646 	return m;
647 }
648 
649 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
650 				   int *mpri, struct rt6_info *match,
651 				   bool *do_rr)
652 {
653 	int m;
654 	bool match_do_rr = false;
655 	struct inet6_dev *idev = rt->rt6i_idev;
656 	struct net_device *dev = rt->dst.dev;
657 
658 	if (dev && !netif_carrier_ok(dev) &&
659 	    idev->cnf.ignore_routes_with_linkdown)
660 		goto out;
661 
662 	if (rt6_check_expired(rt))
663 		goto out;
664 
665 	m = rt6_score_route(rt, oif, strict);
666 	if (m == RT6_NUD_FAIL_DO_RR) {
667 		match_do_rr = true;
668 		m = 0; /* lowest valid score */
669 	} else if (m == RT6_NUD_FAIL_HARD) {
670 		goto out;
671 	}
672 
673 	if (strict & RT6_LOOKUP_F_REACHABLE)
674 		rt6_probe(rt);
675 
676 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
677 	if (m > *mpri) {
678 		*do_rr = match_do_rr;
679 		*mpri = m;
680 		match = rt;
681 	}
682 out:
683 	return match;
684 }
685 
/*
 * Find the best route among the leaves of @fn, starting the scan at the
 * round-robin head @rr_head.
 *
 * Three passes, each feeding find_match():
 *   1. from @rr_head forward while the metric equals @metric;
 *   2. from fn->leaf up to (not including) @rr_head, same metric only;
 *   3. only if nothing matched: every route from 'cont' onward, where
 *      'cont' is the first route seen with a different metric.
 *
 * Returns the best match or NULL; *do_rr is updated by find_match().
 */
686 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
687 				     struct rt6_info *rr_head,
688 				     u32 metric, int oif, int strict,
689 				     bool *do_rr)
690 {
691 	struct rt6_info *rt, *match, *cont;
692 	int mpri = -1;
693 
694 	match = NULL;
695 	cont = NULL;
696 	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
697 		if (rt->rt6i_metric != metric) {
698 			cont = rt;
699 			break;
700 		}
701 
702 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
703 	}
704 
705 	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
706 		if (rt->rt6i_metric != metric) {
707 			cont = rt;
708 			break;
709 		}
710 
711 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
712 	}
713 
714 	if (match || !cont)
715 		return match;
716 
	/* nothing usable at the preferred metric: try the remaining routes */
717 	for (rt = cont; rt; rt = rt->dst.rt6_next)
718 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
719 
720 	return match;
721 }
722 
/*
 * Select the route to use from node @fn.
 *
 * The scan starts at fn->rr_ptr (initialised to fn->leaf on first use).
 * When find_rr_leaf() reports that round-robin is wanted, fn->rr_ptr is
 * advanced to the next route of the same metric, wrapping to fn->leaf.
 * Returns the selected route, or the namespace's ip6_null_entry if no
 * route matched.
 */
723 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
724 {
725 	struct rt6_info *match, *rt0;
726 	struct net *net;
727 	bool do_rr = false;
728 
729 	rt0 = fn->rr_ptr;
730 	if (!rt0)
731 		fn->rr_ptr = rt0 = fn->leaf;
732 
733 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
734 			     &do_rr);
735 
736 	if (do_rr) {
737 		struct rt6_info *next = rt0->dst.rt6_next;
738 
739 		/* no entries matched; do round-robin */
740 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
741 			next = fn->leaf;
742 
		/* only write rr_ptr when it actually changes */
743 		if (next != rt0)
744 			fn->rr_ptr = next;
745 	}
746 
747 	net = dev_net(rt0->dst.dev);
748 	return match ? match : net->ipv6.ip6_null_entry;
749 }
750 
751 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
752 {
753 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
754 }
755 
756 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process one Route Information option.
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address for the route
 *
 * Validates the option, then looks up the matching route (the default
 * router entry when prefix_len is 0) and deletes it, creates it, or updates
 * its preference and lifetime.
 *
 * Returns 0 on success (including when route creation yields nothing) or
 * -EINVAL for a malformed option.
 */
757 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
758 		  const struct in6_addr *gwaddr)
759 {
760 	struct net *net = dev_net(dev);
761 	struct route_info *rinfo = (struct route_info *) opt;
762 	struct in6_addr prefix_buf, *prefix;
763 	unsigned int pref;
764 	unsigned long lifetime;
765 	struct rt6_info *rt;
766 
767 	if (len < sizeof(struct route_info)) {
768 		return -EINVAL;
769 	}
770 
771 	/* Sanity check for prefix_len and length */
772 	if (rinfo->length > 3) {
773 		return -EINVAL;
774 	} else if (rinfo->prefix_len > 128) {
775 		return -EINVAL;
776 	} else if (rinfo->prefix_len > 64) {
777 		if (rinfo->length < 2) {
778 			return -EINVAL;
779 		}
780 	} else if (rinfo->prefix_len > 0) {
781 		if (rinfo->length < 1) {
782 			return -EINVAL;
783 		}
784 	}
785 
786 	pref = rinfo->route_pref;
787 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
788 		return -EINVAL;
789 
790 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
791 
	/* length 3 means the option holds a full address; use it in place */
792 	if (rinfo->length == 3)
793 		prefix = (struct in6_addr *)rinfo->prefix;
794 	else {
795 		/* this function is safe */
796 		ipv6_addr_prefix(&prefix_buf,
797 				 (struct in6_addr *)rinfo->prefix,
798 				 rinfo->prefix_len);
799 		prefix = &prefix_buf;
800 	}
801 
802 	if (rinfo->prefix_len == 0)
803 		rt = rt6_get_dflt_router(gwaddr, dev);
804 	else
805 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
806 					gwaddr, dev->ifindex);
807 
	/* a zero lifetime removes an existing route */
808 	if (rt && !lifetime) {
809 		ip6_del_rt(rt);
810 		rt = NULL;
811 	}
812 
813 	if (!rt && lifetime)
814 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
815 					pref);
816 	else if (rt)
		/* existing route: mark as route-info and replace its preference */
817 		rt->rt6i_flags = RTF_ROUTEINFO |
818 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
819 
820 	if (rt) {
821 		if (!addrconf_finite_timeout(lifetime))
822 			rt6_clean_expires(rt);
823 		else
824 			rt6_set_expires(rt, jiffies + HZ * lifetime);
825 
826 		ip6_rt_put(rt);
827 	}
828 	return 0;
829 }
830 #endif
831 
832 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
833 					struct in6_addr *saddr)
834 {
835 	struct fib6_node *pn;
836 	while (1) {
837 		if (fn->fn_flags & RTN_TL_ROOT)
838 			return NULL;
839 		pn = fn->parent;
840 		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
841 			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
842 		else
843 			fn = pn;
844 		if (fn->fn_flags & RTN_RTINFO)
845 			return fn;
846 	}
847 }
848 
/*
 * Table lookup callback used by ip6_route_lookup() and rt6_lookup().
 *
 * Under the table read lock: find the node for the flow, pick a route with
 * rt6_device_match(), apply multipath selection when the route has siblings
 * and no output interface was requested, and backtrack towards the root
 * while the result is ip6_null_entry.  The returned route has been passed
 * through dst_use() before the lock is dropped.
 */
849 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
850 					     struct fib6_table *table,
851 					     struct flowi6 *fl6, int flags)
852 {
853 	struct fib6_node *fn;
854 	struct rt6_info *rt;
855 
856 	read_lock_bh(&table->tb6_lock);
857 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
858 restart:
859 	rt = fn->leaf;
860 	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
861 	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
862 		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
863 	if (rt == net->ipv6.ip6_null_entry) {
		/* no usable route at this node: retry at a less specific one */
864 		fn = fib6_backtrack(fn, &fl6->saddr);
865 		if (fn)
866 			goto restart;
867 	}
868 	dst_use(&rt->dst, jiffies);
869 	read_unlock_bh(&table->tb6_lock);
870 
871 	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
872 
873 	return rt;
874 
875 }
876 
877 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
878 				    int flags)
879 {
880 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
881 }
882 EXPORT_SYMBOL_GPL(ip6_route_lookup);
883 
884 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
885 			    const struct in6_addr *saddr, int oif, int strict)
886 {
887 	struct flowi6 fl6 = {
888 		.flowi6_oif = oif,
889 		.daddr = *daddr,
890 	};
891 	struct dst_entry *dst;
892 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
893 
894 	if (saddr) {
895 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
896 		flags |= RT6_LOOKUP_F_HAS_SADDR;
897 	}
898 
899 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
900 	if (dst->error == 0)
901 		return (struct rt6_info *) dst;
902 
903 	dst_release(dst);
904 
905 	return NULL;
906 }
907 EXPORT_SYMBOL(rt6_lookup);
908 
909 /* ip6_ins_rt is called with table->tb6_lock NOT held.
910    It takes a new route entry; if the addition fails for any reason, the
911    route is freed. In any case, if the caller does not hold a reference
912    to it, it may be destroyed.
913  */
914 
915 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
916 			struct mx6_config *mxc)
917 {
918 	int err;
919 	struct fib6_table *table;
920 
921 	table = rt->rt6i_table;
922 	write_lock_bh(&table->tb6_lock);
923 	err = fib6_add(&table->tb6_root, rt, info, mxc);
924 	write_unlock_bh(&table->tb6_lock);
925 
926 	return err;
927 }
928 
929 int ip6_ins_rt(struct rt6_info *rt)
930 {
931 	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
932 	struct mx6_config mxc = { .mx = NULL, };
933 
934 	return __ip6_ins_rt(rt, &info, &mxc);
935 }
936 
/*
 * Allocate an RTF_CACHE clone of @ort for the single destination @daddr.
 *
 * If @ort is itself a cache or per-cpu copy, the route it was derived from
 * (ort->dst.from) is cloned instead.  The clone is a /128 host route with
 * metric 0.  For routes that are neither gateway nor nonexthop routes, the
 * clone is marked RTF_ANYCAST when @daddr equals the (non-/128) prefix
 * address, and with CONFIG_IPV6_SUBTREES its source is narrowed to @saddr.
 *
 * Returns the clone, or NULL on allocation failure.
 */
937 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
938 					   const struct in6_addr *daddr,
939 					   const struct in6_addr *saddr)
940 {
941 	struct rt6_info *rt;
942 
943 	/*
944 	 *	Clone the route.
945 	 */
946 
947 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
948 		ort = (struct rt6_info *)ort->dst.from;
949 
	/* uses __ip6_dst_alloc(): the clone gets no per-cpu route array */
950 	rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
951 
952 	if (!rt)
953 		return NULL;
954 
955 	ip6_rt_copy_init(rt, ort);
956 	rt->rt6i_flags |= RTF_CACHE;
957 	rt->rt6i_metric = 0;
958 	rt->dst.flags |= DST_HOST;
959 	rt->rt6i_dst.addr = *daddr;
960 	rt->rt6i_dst.plen = 128;
961 
962 	if (!rt6_is_gw_or_nonexthop(ort)) {
963 		if (ort->rt6i_dst.plen != 128 &&
964 		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
965 			rt->rt6i_flags |= RTF_ANYCAST;
966 #ifdef CONFIG_IPV6_SUBTREES
967 		if (rt->rt6i_src.plen && saddr) {
968 			rt->rt6i_src.addr = *saddr;
969 			rt->rt6i_src.plen = 128;
970 		}
971 #endif
972 	}
973 
974 	return rt;
975 }
976 
977 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
978 {
979 	struct rt6_info *pcpu_rt;
980 
981 	pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
982 				  rt->dst.dev, rt->dst.flags);
983 
984 	if (!pcpu_rt)
985 		return NULL;
986 	ip6_rt_copy_init(pcpu_rt, rt);
987 	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
988 	pcpu_rt->rt6i_flags |= RTF_PCPU;
989 	return pcpu_rt;
990 }
991 
992 /* It should be called with read_lock_bh(&tb6_lock) acquired */
993 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
994 {
995 	struct rt6_info *pcpu_rt, **p;
996 
997 	p = this_cpu_ptr(rt->rt6i_pcpu);
998 	pcpu_rt = *p;
999 
1000 	if (pcpu_rt) {
1001 		dst_hold(&pcpu_rt->dst);
1002 		rt6_dst_from_metrics_check(pcpu_rt);
1003 	}
1004 	return pcpu_rt;
1005 }
1006 
/*
 * Create this CPU's copy of @rt and install it in rt->rt6i_pcpu.
 *
 * The copy is allocated before the table read lock is taken.  Returns, with
 * a reference held:
 *   - ip6_null_entry if the allocation fails;
 *   - the previously installed copy if the slot was already filled (the
 *     new allocation is destroyed);
 *   - @rt itself if rt->rt6i_pcpu is NULL (see the comment in the body);
 *   - otherwise the newly installed copy.
 */
1007 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1008 {
1009 	struct fib6_table *table = rt->rt6i_table;
1010 	struct rt6_info *pcpu_rt, *prev, **p;
1011 
1012 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1013 	if (!pcpu_rt) {
1014 		struct net *net = dev_net(rt->dst.dev);
1015 
1016 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1017 		return net->ipv6.ip6_null_entry;
1018 	}
1019 
1020 	read_lock_bh(&table->tb6_lock);
1021 	if (rt->rt6i_pcpu) {
1022 		p = this_cpu_ptr(rt->rt6i_pcpu);
		/* install only if the slot is still empty */
1023 		prev = cmpxchg(p, NULL, pcpu_rt);
1024 		if (prev) {
1025 			/* If someone did it before us, return prev instead */
1026 			dst_destroy(&pcpu_rt->dst);
1027 			pcpu_rt = prev;
1028 		}
1029 	} else {
1030 		/* rt has been removed from the fib6 tree
1031 		 * before we have a chance to acquire the read_lock.
1032 		 * In this case, don't bother to create a pcpu rt
1033 		 * since rt is going away anyway.  The next
1034 		 * dst_check() will trigger a re-lookup.
1035 		 */
1036 		dst_destroy(&pcpu_rt->dst);
1037 		pcpu_rt = rt;
1038 	}
1039 	dst_hold(&pcpu_rt->dst);
1040 	rt6_dst_from_metrics_check(pcpu_rt);
1041 	read_unlock_bh(&table->tb6_lock);
1042 	return pcpu_rt;
1043 }
1044 
/*
 * Core per-table route resolution for input and output lookups.
 *
 * @oif:   interface index to match (forced to 0 by FLOWI_FLAG_SKIP_NH_OIF)
 * @flags: RT6_LOOKUP_F_* flags; only RT6_LOOKUP_F_IFACE is carried into
 *         'strict' here
 *
 * Selection: rt6_select() plus multipath selection at the matching node,
 * backtracking while the result is ip6_null_entry.  When forwarding is
 * disabled the first attempt requires reachable next hops; if that fails
 * everywhere, the search restarts from the original node without the
 * reachability requirement.
 *
 * Result, always returned with a reference held:
 *   - ip6_null_entry or an RTF_CACHE route: returned directly;
 *   - FLOWI_FLAG_KNOWN_NH on a non-gateway route: a fresh RTF_CACHE clone
 *     kept on the uncached list (ip6_null_entry if the clone fails);
 *   - otherwise: this CPU's per-cpu copy, created on demand.
 */
1045 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1046 				      struct flowi6 *fl6, int flags)
1047 {
1048 	struct fib6_node *fn, *saved_fn;
1049 	struct rt6_info *rt;
1050 	int strict = 0;
1051 
1052 	strict |= flags & RT6_LOOKUP_F_IFACE;
1053 	if (net->ipv6.devconf_all->forwarding == 0)
1054 		strict |= RT6_LOOKUP_F_REACHABLE;
1055 
1056 	read_lock_bh(&table->tb6_lock);
1057 
1058 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1059 	saved_fn = fn;
1060 
1061 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1062 		oif = 0;
1063 
1064 redo_rt6_select:
1065 	rt = rt6_select(fn, oif, strict);
1066 	if (rt->rt6i_nsiblings)
1067 		rt = rt6_multipath_select(rt, fl6, oif, strict);
1068 	if (rt == net->ipv6.ip6_null_entry) {
1069 		fn = fib6_backtrack(fn, &fl6->saddr);
1070 		if (fn)
1071 			goto redo_rt6_select;
1072 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1073 			/* also consider unreachable route */
1074 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1075 			fn = saved_fn;
1076 			goto redo_rt6_select;
1077 		}
1078 	}
1079 
1080 
1081 	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1082 		dst_use(&rt->dst, jiffies);
1083 		read_unlock_bh(&table->tb6_lock);
1084 
1085 		rt6_dst_from_metrics_check(rt);
1086 
1087 		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1088 		return rt;
1089 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1090 			    !(rt->rt6i_flags & RTF_GATEWAY))) {
1091 		/* Create a RTF_CACHE clone which will not be
1092 		 * owned by the fib6 tree.  It is for the special case where
1093 		 * the daddr in the skb during the neighbor look-up is different
1094 		 * from the fl6->daddr used to look-up route here.
1095 		 */
1096 
1097 		struct rt6_info *uncached_rt;
1098 
		/* dst_use() passes rt to the clone step after the unlock; released below */
1099 		dst_use(&rt->dst, jiffies);
1100 		read_unlock_bh(&table->tb6_lock);
1101 
1102 		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1103 		dst_release(&rt->dst);
1104 
1105 		if (uncached_rt)
1106 			rt6_uncached_list_add(uncached_rt);
1107 		else
1108 			uncached_rt = net->ipv6.ip6_null_entry;
1109 
1110 		dst_hold(&uncached_rt->dst);
1111 
1112 		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1113 		return uncached_rt;
1114 
1115 	} else {
1116 		/* Get a percpu copy */
1117 
1118 		struct rt6_info *pcpu_rt;
1119 
		/* usage accounting on the tree route, done by hand here */
1120 		rt->dst.lastuse = jiffies;
1121 		rt->dst.__use++;
1122 		pcpu_rt = rt6_get_pcpu_route(rt);
1123 
1124 		if (pcpu_rt) {
1125 			read_unlock_bh(&table->tb6_lock);
1126 		} else {
1127 			/* We have to do the read_unlock first
1128 			 * because rt6_make_pcpu_route() may trigger
1129 			 * ip6_dst_gc() which will take the write_lock.
1130 			 */
1131 			dst_hold(&rt->dst);
1132 			read_unlock_bh(&table->tb6_lock);
1133 			pcpu_rt = rt6_make_pcpu_route(rt);
1134 			dst_release(&rt->dst);
1135 		}
1136 
1137 		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1138 		return pcpu_rt;
1139 
1140 	}
1141 }
1142 
1143 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1144 					    struct flowi6 *fl6, int flags)
1145 {
1146 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1147 }
1148 
1149 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1150 						struct net_device *dev,
1151 						struct flowi6 *fl6, int flags)
1152 {
1153 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1154 		flags |= RT6_LOOKUP_F_IFACE;
1155 
1156 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1157 }
1158 
1159 void ip6_route_input(struct sk_buff *skb)
1160 {
1161 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1162 	struct net *net = dev_net(skb->dev);
1163 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1164 	struct ip_tunnel_info *tun_info;
1165 	struct flowi6 fl6 = {
1166 		.flowi6_iif = l3mdev_fib_oif(skb->dev),
1167 		.daddr = iph->daddr,
1168 		.saddr = iph->saddr,
1169 		.flowlabel = ip6_flowinfo(iph),
1170 		.flowi6_mark = skb->mark,
1171 		.flowi6_proto = iph->nexthdr,
1172 	};
1173 
1174 	tun_info = skb_tunnel_info(skb);
1175 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1176 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1177 	skb_dst_drop(skb);
1178 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1179 }
1180 
1181 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1182 					     struct flowi6 *fl6, int flags)
1183 {
1184 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1185 }
1186 
1187 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1188 					 struct flowi6 *fl6, int flags)
1189 {
1190 	struct dst_entry *dst;
1191 	bool any_src;
1192 
1193 	dst = l3mdev_get_rt6_dst(net, fl6);
1194 	if (dst)
1195 		return dst;
1196 
1197 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1198 
1199 	any_src = ipv6_addr_any(&fl6->saddr);
1200 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1201 	    (fl6->flowi6_oif && any_src))
1202 		flags |= RT6_LOOKUP_F_IFACE;
1203 
1204 	if (!any_src)
1205 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1206 	else if (sk)
1207 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1208 
1209 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1210 }
1211 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1212 
1213 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1214 {
1215 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1216 	struct dst_entry *new = NULL;
1217 
1218 	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1219 	if (rt) {
1220 		rt6_info_init(rt);
1221 
1222 		new = &rt->dst;
1223 		new->__use = 1;
1224 		new->input = dst_discard;
1225 		new->output = dst_discard_out;
1226 
1227 		dst_copy_metrics(new, &ort->dst);
1228 		rt->rt6i_idev = ort->rt6i_idev;
1229 		if (rt->rt6i_idev)
1230 			in6_dev_hold(rt->rt6i_idev);
1231 
1232 		rt->rt6i_gateway = ort->rt6i_gateway;
1233 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1234 		rt->rt6i_metric = 0;
1235 
1236 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1237 #ifdef CONFIG_IPV6_SUBTREES
1238 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1239 #endif
1240 
1241 		dst_free(new);
1242 	}
1243 
1244 	dst_release(dst_orig);
1245 	return new ? new : ERR_PTR(-ENOMEM);
1246 }
1247 
1248 /*
1249  *	Destination cache support functions
1250  */
1251 
1252 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1253 {
1254 	if (rt->dst.from &&
1255 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1256 		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1257 }
1258 
1259 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1260 {
1261 	if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1262 		return NULL;
1263 
1264 	if (rt6_check_expired(rt))
1265 		return NULL;
1266 
1267 	return &rt->dst;
1268 }
1269 
1270 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1271 {
1272 	if (!__rt6_check_expired(rt) &&
1273 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1274 	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1275 		return &rt->dst;
1276 	else
1277 		return NULL;
1278 }
1279 
1280 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1281 {
1282 	struct rt6_info *rt;
1283 
1284 	rt = (struct rt6_info *) dst;
1285 
1286 	/* All IPV6 dsts are created with ->obsolete set to the value
1287 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1288 	 * into this function always.
1289 	 */
1290 
1291 	rt6_dst_from_metrics_check(rt);
1292 
1293 	if (rt->rt6i_flags & RTF_PCPU ||
1294 	    (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1295 		return rt6_dst_from_check(rt, cookie);
1296 	else
1297 		return rt6_check(rt, cookie);
1298 }
1299 
1300 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1301 {
1302 	struct rt6_info *rt = (struct rt6_info *) dst;
1303 
1304 	if (rt) {
1305 		if (rt->rt6i_flags & RTF_CACHE) {
1306 			if (rt6_check_expired(rt)) {
1307 				ip6_del_rt(rt);
1308 				dst = NULL;
1309 			}
1310 		} else {
1311 			dst_release(dst);
1312 			dst = NULL;
1313 		}
1314 	}
1315 	return dst;
1316 }
1317 
1318 static void ip6_link_failure(struct sk_buff *skb)
1319 {
1320 	struct rt6_info *rt;
1321 
1322 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1323 
1324 	rt = (struct rt6_info *) skb_dst(skb);
1325 	if (rt) {
1326 		if (rt->rt6i_flags & RTF_CACHE) {
1327 			dst_hold(&rt->dst);
1328 			ip6_del_rt(rt);
1329 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1330 			rt->rt6i_node->fn_sernum = -1;
1331 		}
1332 	}
1333 }
1334 
1335 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1336 {
1337 	struct net *net = dev_net(rt->dst.dev);
1338 
1339 	rt->rt6i_flags |= RTF_MODIFIED;
1340 	rt->rt6i_pmtu = mtu;
1341 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1342 }
1343 
1344 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1345 {
1346 	return !(rt->rt6i_flags & RTF_CACHE) &&
1347 		(rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
1348 }
1349 
1350 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1351 				 const struct ipv6hdr *iph, u32 mtu)
1352 {
1353 	struct rt6_info *rt6 = (struct rt6_info *)dst;
1354 
1355 	if (rt6->rt6i_flags & RTF_LOCAL)
1356 		return;
1357 
1358 	dst_confirm(dst);
1359 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1360 	if (mtu >= dst_mtu(dst))
1361 		return;
1362 
1363 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
1364 		rt6_do_update_pmtu(rt6, mtu);
1365 	} else {
1366 		const struct in6_addr *daddr, *saddr;
1367 		struct rt6_info *nrt6;
1368 
1369 		if (iph) {
1370 			daddr = &iph->daddr;
1371 			saddr = &iph->saddr;
1372 		} else if (sk) {
1373 			daddr = &sk->sk_v6_daddr;
1374 			saddr = &inet6_sk(sk)->saddr;
1375 		} else {
1376 			return;
1377 		}
1378 		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1379 		if (nrt6) {
1380 			rt6_do_update_pmtu(nrt6, mtu);
1381 
1382 			/* ip6_ins_rt(nrt6) will bump the
1383 			 * rt6->rt6i_node->fn_sernum
1384 			 * which will fail the next rt6_check() and
1385 			 * invalidate the sk->sk_dst_cache.
1386 			 */
1387 			ip6_ins_rt(nrt6);
1388 		}
1389 	}
1390 }
1391 
1392 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1393 			       struct sk_buff *skb, u32 mtu)
1394 {
1395 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1396 }
1397 
1398 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1399 		     int oif, u32 mark)
1400 {
1401 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1402 	struct dst_entry *dst;
1403 	struct flowi6 fl6;
1404 
1405 	memset(&fl6, 0, sizeof(fl6));
1406 	fl6.flowi6_oif = oif;
1407 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1408 	fl6.daddr = iph->daddr;
1409 	fl6.saddr = iph->saddr;
1410 	fl6.flowlabel = ip6_flowinfo(iph);
1411 
1412 	dst = ip6_route_output(net, NULL, &fl6);
1413 	if (!dst->error)
1414 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1415 	dst_release(dst);
1416 }
1417 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1418 
1419 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1420 {
1421 	struct dst_entry *dst;
1422 
1423 	ip6_update_pmtu(skb, sock_net(sk), mtu,
1424 			sk->sk_bound_dev_if, sk->sk_mark);
1425 
1426 	dst = __sk_dst_get(sk);
1427 	if (!dst || !dst->obsolete ||
1428 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1429 		return;
1430 
1431 	bh_lock_sock(sk);
1432 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1433 		ip6_datagram_dst_update(sk, false);
1434 	bh_unlock_sock(sk);
1435 }
1436 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1437 
1438 /* Handle redirects */
/* Flow key extended with the address a redirect was received from.
 *
 * ip6_route_redirect() passes &rdfl.fl6 to fib6_rule_lookup() and
 * __ip6_route_redirect() casts that struct flowi6 pointer back to
 * struct ip6rd_flowi, so @fl6 has to remain the first member.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* regular flow key used for the lookup */
	struct in6_addr gateway;	/* compared against each route's rt6i_gateway */
};
1443 
1444 static struct rt6_info *__ip6_route_redirect(struct net *net,
1445 					     struct fib6_table *table,
1446 					     struct flowi6 *fl6,
1447 					     int flags)
1448 {
1449 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1450 	struct rt6_info *rt;
1451 	struct fib6_node *fn;
1452 
1453 	/* Get the "current" route for this destination and
1454 	 * check if the redirect has come from approriate router.
1455 	 *
1456 	 * RFC 4861 specifies that redirects should only be
1457 	 * accepted if they come from the nexthop to the target.
1458 	 * Due to the way the routes are chosen, this notion
1459 	 * is a bit fuzzy and one might need to check all possible
1460 	 * routes.
1461 	 */
1462 
1463 	read_lock_bh(&table->tb6_lock);
1464 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1465 restart:
1466 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1467 		if (rt6_check_expired(rt))
1468 			continue;
1469 		if (rt->dst.error)
1470 			break;
1471 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1472 			continue;
1473 		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1474 			continue;
1475 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1476 			continue;
1477 		break;
1478 	}
1479 
1480 	if (!rt)
1481 		rt = net->ipv6.ip6_null_entry;
1482 	else if (rt->dst.error) {
1483 		rt = net->ipv6.ip6_null_entry;
1484 		goto out;
1485 	}
1486 
1487 	if (rt == net->ipv6.ip6_null_entry) {
1488 		fn = fib6_backtrack(fn, &fl6->saddr);
1489 		if (fn)
1490 			goto restart;
1491 	}
1492 
1493 out:
1494 	dst_hold(&rt->dst);
1495 
1496 	read_unlock_bh(&table->tb6_lock);
1497 
1498 	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1499 	return rt;
1500 };
1501 
1502 static struct dst_entry *ip6_route_redirect(struct net *net,
1503 					const struct flowi6 *fl6,
1504 					const struct in6_addr *gateway)
1505 {
1506 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1507 	struct ip6rd_flowi rdfl;
1508 
1509 	rdfl.fl6 = *fl6;
1510 	rdfl.gateway = *gateway;
1511 
1512 	return fib6_rule_lookup(net, &rdfl.fl6,
1513 				flags, __ip6_route_redirect);
1514 }
1515 
1516 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1517 {
1518 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1519 	struct dst_entry *dst;
1520 	struct flowi6 fl6;
1521 
1522 	memset(&fl6, 0, sizeof(fl6));
1523 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
1524 	fl6.flowi6_oif = oif;
1525 	fl6.flowi6_mark = mark;
1526 	fl6.daddr = iph->daddr;
1527 	fl6.saddr = iph->saddr;
1528 	fl6.flowlabel = ip6_flowinfo(iph);
1529 
1530 	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1531 	rt6_do_redirect(dst, NULL, skb);
1532 	dst_release(dst);
1533 }
1534 EXPORT_SYMBOL_GPL(ip6_redirect);
1535 
1536 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1537 			    u32 mark)
1538 {
1539 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1540 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1541 	struct dst_entry *dst;
1542 	struct flowi6 fl6;
1543 
1544 	memset(&fl6, 0, sizeof(fl6));
1545 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
1546 	fl6.flowi6_oif = oif;
1547 	fl6.flowi6_mark = mark;
1548 	fl6.daddr = msg->dest;
1549 	fl6.saddr = iph->daddr;
1550 
1551 	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1552 	rt6_do_redirect(dst, NULL, skb);
1553 	dst_release(dst);
1554 }
1555 
1556 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1557 {
1558 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1559 }
1560 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1561 
1562 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1563 {
1564 	struct net_device *dev = dst->dev;
1565 	unsigned int mtu = dst_mtu(dst);
1566 	struct net *net = dev_net(dev);
1567 
1568 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1569 
1570 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1571 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1572 
1573 	/*
1574 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1575 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1576 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
1577 	 * rely only on pmtu discovery"
1578 	 */
1579 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1580 		mtu = IPV6_MAXPLEN;
1581 	return mtu;
1582 }
1583 
1584 static unsigned int ip6_mtu(const struct dst_entry *dst)
1585 {
1586 	const struct rt6_info *rt = (const struct rt6_info *)dst;
1587 	unsigned int mtu = rt->rt6i_pmtu;
1588 	struct inet6_dev *idev;
1589 
1590 	if (mtu)
1591 		goto out;
1592 
1593 	mtu = dst_metric_raw(dst, RTAX_MTU);
1594 	if (mtu)
1595 		goto out;
1596 
1597 	mtu = IPV6_MIN_MTU;
1598 
1599 	rcu_read_lock();
1600 	idev = __in6_dev_get(dst->dev);
1601 	if (idev)
1602 		mtu = idev->cnf.mtu6;
1603 	rcu_read_unlock();
1604 
1605 out:
1606 	return min_t(unsigned int, mtu, IP6_MAX_MTU);
1607 }
1608 
/* Singly linked list (through dst->next) of the entries created by
 * icmp6_dst_alloc().  icmp6_dst_gc() unlinks and frees entries whose
 * refcount reads zero; icmp6_clean_all() does so for entries its
 * callback selects.
 */
static struct dst_entry *icmp6_dst_gc_list;
/* Taken (BH-disabling) around every access to icmp6_dst_gc_list. */
static DEFINE_SPINLOCK(icmp6_dst_lock);
1611 
1612 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1613 				  struct flowi6 *fl6)
1614 {
1615 	struct dst_entry *dst;
1616 	struct rt6_info *rt;
1617 	struct inet6_dev *idev = in6_dev_get(dev);
1618 	struct net *net = dev_net(dev);
1619 
1620 	if (unlikely(!idev))
1621 		return ERR_PTR(-ENODEV);
1622 
1623 	rt = ip6_dst_alloc(net, dev, 0);
1624 	if (unlikely(!rt)) {
1625 		in6_dev_put(idev);
1626 		dst = ERR_PTR(-ENOMEM);
1627 		goto out;
1628 	}
1629 
1630 	rt->dst.flags |= DST_HOST;
1631 	rt->dst.output  = ip6_output;
1632 	atomic_set(&rt->dst.__refcnt, 1);
1633 	rt->rt6i_gateway  = fl6->daddr;
1634 	rt->rt6i_dst.addr = fl6->daddr;
1635 	rt->rt6i_dst.plen = 128;
1636 	rt->rt6i_idev     = idev;
1637 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1638 
1639 	spin_lock_bh(&icmp6_dst_lock);
1640 	rt->dst.next = icmp6_dst_gc_list;
1641 	icmp6_dst_gc_list = &rt->dst;
1642 	spin_unlock_bh(&icmp6_dst_lock);
1643 
1644 	fib6_force_start_gc(net);
1645 
1646 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1647 
1648 out:
1649 	return dst;
1650 }
1651 
1652 int icmp6_dst_gc(void)
1653 {
1654 	struct dst_entry *dst, **pprev;
1655 	int more = 0;
1656 
1657 	spin_lock_bh(&icmp6_dst_lock);
1658 	pprev = &icmp6_dst_gc_list;
1659 
1660 	while ((dst = *pprev) != NULL) {
1661 		if (!atomic_read(&dst->__refcnt)) {
1662 			*pprev = dst->next;
1663 			dst_free(dst);
1664 		} else {
1665 			pprev = &dst->next;
1666 			++more;
1667 		}
1668 	}
1669 
1670 	spin_unlock_bh(&icmp6_dst_lock);
1671 
1672 	return more;
1673 }
1674 
1675 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1676 			    void *arg)
1677 {
1678 	struct dst_entry *dst, **pprev;
1679 
1680 	spin_lock_bh(&icmp6_dst_lock);
1681 	pprev = &icmp6_dst_gc_list;
1682 	while ((dst = *pprev) != NULL) {
1683 		struct rt6_info *rt = (struct rt6_info *) dst;
1684 		if (func(rt, arg)) {
1685 			*pprev = dst->next;
1686 			dst_free(dst);
1687 		} else {
1688 			pprev = &dst->next;
1689 		}
1690 	}
1691 	spin_unlock_bh(&icmp6_dst_lock);
1692 }
1693 
1694 static int ip6_dst_gc(struct dst_ops *ops)
1695 {
1696 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1697 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1698 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1699 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1700 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1701 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1702 	int entries;
1703 
1704 	entries = dst_entries_get_fast(ops);
1705 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1706 	    entries <= rt_max_size)
1707 		goto out;
1708 
1709 	net->ipv6.ip6_rt_gc_expire++;
1710 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1711 	entries = dst_entries_get_slow(ops);
1712 	if (entries < ops->gc_thresh)
1713 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1714 out:
1715 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1716 	return entries > rt_max_size;
1717 }
1718 
1719 static int ip6_convert_metrics(struct mx6_config *mxc,
1720 			       const struct fib6_config *cfg)
1721 {
1722 	bool ecn_ca = false;
1723 	struct nlattr *nla;
1724 	int remaining;
1725 	u32 *mp;
1726 
1727 	if (!cfg->fc_mx)
1728 		return 0;
1729 
1730 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1731 	if (unlikely(!mp))
1732 		return -ENOMEM;
1733 
1734 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1735 		int type = nla_type(nla);
1736 		u32 val;
1737 
1738 		if (!type)
1739 			continue;
1740 		if (unlikely(type > RTAX_MAX))
1741 			goto err;
1742 
1743 		if (type == RTAX_CC_ALGO) {
1744 			char tmp[TCP_CA_NAME_MAX];
1745 
1746 			nla_strlcpy(tmp, nla, sizeof(tmp));
1747 			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1748 			if (val == TCP_CA_UNSPEC)
1749 				goto err;
1750 		} else {
1751 			val = nla_get_u32(nla);
1752 		}
1753 		if (type == RTAX_HOPLIMIT && val > 255)
1754 			val = 255;
1755 		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1756 			goto err;
1757 
1758 		mp[type - 1] = val;
1759 		__set_bit(type - 1, mxc->mx_valid);
1760 	}
1761 
1762 	if (ecn_ca) {
1763 		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1764 		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1765 	}
1766 
1767 	mxc->mx = mp;
1768 	return 0;
1769  err:
1770 	kfree(mp);
1771 	return -EINVAL;
1772 }
1773 
1774 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1775 					    struct fib6_config *cfg,
1776 					    const struct in6_addr *gw_addr)
1777 {
1778 	struct flowi6 fl6 = {
1779 		.flowi6_oif = cfg->fc_ifindex,
1780 		.daddr = *gw_addr,
1781 		.saddr = cfg->fc_prefsrc,
1782 	};
1783 	struct fib6_table *table;
1784 	struct rt6_info *rt;
1785 	int flags = 0;
1786 
1787 	table = fib6_get_table(net, cfg->fc_table);
1788 	if (!table)
1789 		return NULL;
1790 
1791 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
1792 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1793 
1794 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1795 
1796 	/* if table lookup failed, fall back to full lookup */
1797 	if (rt == net->ipv6.ip6_null_entry) {
1798 		ip6_rt_put(rt);
1799 		rt = NULL;
1800 	}
1801 
1802 	return rt;
1803 }
1804 
/*
 * Build a new rt6_info from the request in @cfg without inserting it
 * into a table (ip6_route_add() does the insertion).
 *
 * On success the returned route has dst.dev, rt6i_idev and rt6i_table
 * filled in; the device and inet6_dev references taken here are handed
 * over to it.  On failure an ERR_PTR() is returned after dropping those
 * references and freeing the partially built route.
 *
 * @cfg is modified: fc_metric defaults to IP6_RT_PRIO_USER, fc_protocol
 * defaults to RTPROT_BOOT, and fc_nlinfo.nl_net is set to the namespace
 * of the device finally chosen.
 */
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
		goto out;
#ifndef CONFIG_IPV6_SUBTREES
	/* source prefixes are only accepted with subtree support */
	if (cfg->fc_src_len)
		goto out;
#endif
	/* Resolve the requested interface; both references are released
	 * at "out" on failure.
	 */
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* Find the table.  A netlink request without NLM_F_CREATE first
	 * looks for an existing table; a missing table is still created,
	 * with a warning.
	 */
	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	/* DST_NOCOUNT is passed for everything except RTF_ADDRCONF routes */
	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* Input handler: multicast destination, local delivery, or
	 * forwarding.
	 */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	/* Optional lightweight-tunnel encapsulation: when the state asks
	 * for it, the lwtunnel handlers replace output/input and the
	 * handlers chosen above are saved in the state.
	 */
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(dev, cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate);
		if (err)
			goto out;
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_output = rt->dst.output;
			rt->dst.output = lwtunnel_output;
		}
		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_input = rt->dst.input;
			rt->dst.input = lwtunnel_input;
		}
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		/* error code and handlers depend on the route type */
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_out;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		case RTN_UNREACHABLE:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		/* reject routes skip the gateway and prefsrc validation */
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0))
			goto out;

		rt->rt6i_gateway = *gw_addr;

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt = NULL;

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
			if (!(gwa_type & IPV6_ADDR_UNICAST))
				goto out;

			/* try the requested table first, then a full lookup */
			if (cfg->fc_table)
				grt = ip6_nh_lookup_table(net, cfg, gw_addr);

			if (!grt)
				grt = rt6_lookup(net, gw_addr, NULL,
						 cfg->fc_ifindex, 1);

			err = -EHOSTUNREACH;
			if (!grt)
				goto out;
			if (dev) {
				/* gateway must be reachable via the requested device */
				if (dev != grt->dst.dev) {
					ip6_rt_put(grt);
					goto out;
				}
			} else {
				/* no device requested: adopt the gateway route's */
				dev = grt->dst.dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
			/* the route to the gateway must not itself use a gateway */
			if (!(grt->rt6i_flags & RTF_GATEWAY))
				err = 0;
			ip6_rt_put(grt);

			if (err)
				goto out;
		}
		err = -EINVAL;
		if (!dev || (dev->flags & IFF_LOOPBACK))
			goto out;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	/* a preferred source address must pass ipv6_chk_addr() for dev */
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			err = -EINVAL;
			goto out;
		}
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	/* hand the dev/idev references over to the route */
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_free(&rt->dst);

	return ERR_PTR(err);
}
2055 
2056 int ip6_route_add(struct fib6_config *cfg)
2057 {
2058 	struct mx6_config mxc = { .mx = NULL, };
2059 	struct rt6_info *rt;
2060 	int err;
2061 
2062 	rt = ip6_route_info_create(cfg);
2063 	if (IS_ERR(rt)) {
2064 		err = PTR_ERR(rt);
2065 		rt = NULL;
2066 		goto out;
2067 	}
2068 
2069 	err = ip6_convert_metrics(&mxc, cfg);
2070 	if (err)
2071 		goto out;
2072 
2073 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
2074 
2075 	kfree(mxc.mx);
2076 
2077 	return err;
2078 out:
2079 	if (rt)
2080 		dst_free(&rt->dst);
2081 
2082 	return err;
2083 }
2084 
2085 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2086 {
2087 	int err;
2088 	struct fib6_table *table;
2089 	struct net *net = dev_net(rt->dst.dev);
2090 
2091 	if (rt == net->ipv6.ip6_null_entry ||
2092 	    rt->dst.flags & DST_NOCACHE) {
2093 		err = -ENOENT;
2094 		goto out;
2095 	}
2096 
2097 	table = rt->rt6i_table;
2098 	write_lock_bh(&table->tb6_lock);
2099 	err = fib6_del(rt, info);
2100 	write_unlock_bh(&table->tb6_lock);
2101 
2102 out:
2103 	ip6_rt_put(rt);
2104 	return err;
2105 }
2106 
2107 int ip6_del_rt(struct rt6_info *rt)
2108 {
2109 	struct nl_info info = {
2110 		.nl_net = dev_net(rt->dst.dev),
2111 	};
2112 	return __ip6_del_rt(rt, &info);
2113 }
2114 
2115 static int ip6_route_del(struct fib6_config *cfg)
2116 {
2117 	struct fib6_table *table;
2118 	struct fib6_node *fn;
2119 	struct rt6_info *rt;
2120 	int err = -ESRCH;
2121 
2122 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2123 	if (!table)
2124 		return err;
2125 
2126 	read_lock_bh(&table->tb6_lock);
2127 
2128 	fn = fib6_locate(&table->tb6_root,
2129 			 &cfg->fc_dst, cfg->fc_dst_len,
2130 			 &cfg->fc_src, cfg->fc_src_len);
2131 
2132 	if (fn) {
2133 		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2134 			if ((rt->rt6i_flags & RTF_CACHE) &&
2135 			    !(cfg->fc_flags & RTF_CACHE))
2136 				continue;
2137 			if (cfg->fc_ifindex &&
2138 			    (!rt->dst.dev ||
2139 			     rt->dst.dev->ifindex != cfg->fc_ifindex))
2140 				continue;
2141 			if (cfg->fc_flags & RTF_GATEWAY &&
2142 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2143 				continue;
2144 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2145 				continue;
2146 			dst_hold(&rt->dst);
2147 			read_unlock_bh(&table->tb6_lock);
2148 
2149 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2150 		}
2151 	}
2152 	read_unlock_bh(&table->tb6_lock);
2153 
2154 	return err;
2155 }
2156 
/*
 * Validate a received redirect message carried in @skb and, if it is
 * acceptable, install an RTF_CACHE route for the redirected destination.
 *
 * @dst: route currently used towards the redirected destination, as
 *       found by the caller; an RTF_REJECT route means the sender is
 *       not a valid nexthop and the redirect is ignored
 * @sk:  not used by this function
 * @skb: the redirect packet
 *
 * Every rejected redirect returns silently apart from a rate-limited
 * debug message.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* bytes following the fixed redirect header, i.e. the ND options */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* target == destination means the destination itself is on-link;
	 * otherwise the target has to be a link-local unicast address.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	/* ignore redirects when forwarding is enabled on the receiving
	 * interface or accept_redirects is off
	 */
	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	/* optional target link-layer address option */
	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->dst);

	/* look up the neighbour entry for the target (final argument 1:
	 * presumably "create if missing" -- confirm); released at "out"
	 */
	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	/* the router flags are only set when the target is not the
	 * destination itself
	 */
	neigh_update(neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER))
		     );

	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	if (ip6_ins_rt(nrt))
		goto out;

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

	/* The old route was itself a cached clone: delete it.  An extra
	 * reference is taken first and passed to ip6_del_rt().
	 */
	if (rt->rt6i_flags & RTF_CACHE) {
		rt = (struct rt6_info *) dst_clone(&rt->dst);
		ip6_del_rt(rt);
	}

out:
	neigh_release(neigh);
}
2272 
2273 /*
2274  *	Misc support functions
2275  */
2276 
2277 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2278 {
2279 	BUG_ON(from->dst.from);
2280 
2281 	rt->rt6i_flags &= ~RTF_EXPIRES;
2282 	dst_hold(&from->dst);
2283 	rt->dst.from = &from->dst;
2284 	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2285 }
2286 
2287 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2288 {
2289 	rt->dst.input = ort->dst.input;
2290 	rt->dst.output = ort->dst.output;
2291 	rt->rt6i_dst = ort->rt6i_dst;
2292 	rt->dst.error = ort->dst.error;
2293 	rt->rt6i_idev = ort->rt6i_idev;
2294 	if (rt->rt6i_idev)
2295 		in6_dev_hold(rt->rt6i_idev);
2296 	rt->dst.lastuse = jiffies;
2297 	rt->rt6i_gateway = ort->rt6i_gateway;
2298 	rt->rt6i_flags = ort->rt6i_flags;
2299 	rt6_set_from(rt, ort);
2300 	rt->rt6i_metric = ort->rt6i_metric;
2301 #ifdef CONFIG_IPV6_SUBTREES
2302 	rt->rt6i_src = ort->rt6i_src;
2303 #endif
2304 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2305 	rt->rt6i_table = ort->rt6i_table;
2306 	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2307 }
2308 
2309 #ifdef CONFIG_IPV6_ROUTE_INFO
2310 static struct rt6_info *rt6_get_route_info(struct net *net,
2311 					   const struct in6_addr *prefix, int prefixlen,
2312 					   const struct in6_addr *gwaddr, int ifindex)
2313 {
2314 	struct fib6_node *fn;
2315 	struct rt6_info *rt = NULL;
2316 	struct fib6_table *table;
2317 
2318 	table = fib6_get_table(net, RT6_TABLE_INFO);
2319 	if (!table)
2320 		return NULL;
2321 
2322 	read_lock_bh(&table->tb6_lock);
2323 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2324 	if (!fn)
2325 		goto out;
2326 
2327 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2328 		if (rt->dst.dev->ifindex != ifindex)
2329 			continue;
2330 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2331 			continue;
2332 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2333 			continue;
2334 		dst_hold(&rt->dst);
2335 		break;
2336 	}
2337 out:
2338 	read_unlock_bh(&table->tb6_lock);
2339 	return rt;
2340 }
2341 
2342 static struct rt6_info *rt6_add_route_info(struct net *net,
2343 					   const struct in6_addr *prefix, int prefixlen,
2344 					   const struct in6_addr *gwaddr, int ifindex,
2345 					   unsigned int pref)
2346 {
2347 	struct fib6_config cfg = {
2348 		.fc_metric	= IP6_RT_PRIO_USER,
2349 		.fc_ifindex	= ifindex,
2350 		.fc_dst_len	= prefixlen,
2351 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2352 				  RTF_UP | RTF_PREF(pref),
2353 		.fc_nlinfo.portid = 0,
2354 		.fc_nlinfo.nlh = NULL,
2355 		.fc_nlinfo.nl_net = net,
2356 	};
2357 
2358 	cfg.fc_table = l3mdev_fib_table_by_index(net, ifindex) ? : RT6_TABLE_INFO;
2359 	cfg.fc_dst = *prefix;
2360 	cfg.fc_gateway = *gwaddr;
2361 
2362 	/* We should treat it as a default route if prefix length is 0. */
2363 	if (!prefixlen)
2364 		cfg.fc_flags |= RTF_DEFAULT;
2365 
2366 	ip6_route_add(&cfg);
2367 
2368 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2369 }
2370 #endif
2371 
2372 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2373 {
2374 	struct rt6_info *rt;
2375 	struct fib6_table *table;
2376 
2377 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2378 	if (!table)
2379 		return NULL;
2380 
2381 	read_lock_bh(&table->tb6_lock);
2382 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2383 		if (dev == rt->dst.dev &&
2384 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2385 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
2386 			break;
2387 	}
2388 	if (rt)
2389 		dst_hold(&rt->dst);
2390 	read_unlock_bh(&table->tb6_lock);
2391 	return rt;
2392 }
2393 
2394 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2395 				     struct net_device *dev,
2396 				     unsigned int pref)
2397 {
2398 	struct fib6_config cfg = {
2399 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2400 		.fc_metric	= IP6_RT_PRIO_USER,
2401 		.fc_ifindex	= dev->ifindex,
2402 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2403 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2404 		.fc_nlinfo.portid = 0,
2405 		.fc_nlinfo.nlh = NULL,
2406 		.fc_nlinfo.nl_net = dev_net(dev),
2407 	};
2408 
2409 	cfg.fc_gateway = *gwaddr;
2410 
2411 	ip6_route_add(&cfg);
2412 
2413 	return rt6_get_dflt_router(gwaddr, dev);
2414 }
2415 
2416 void rt6_purge_dflt_routers(struct net *net)
2417 {
2418 	struct rt6_info *rt;
2419 	struct fib6_table *table;
2420 
2421 	/* NOTE: Keep consistent with rt6_get_dflt_router */
2422 	table = fib6_get_table(net, RT6_TABLE_DFLT);
2423 	if (!table)
2424 		return;
2425 
2426 restart:
2427 	read_lock_bh(&table->tb6_lock);
2428 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2429 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2430 		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2431 			dst_hold(&rt->dst);
2432 			read_unlock_bh(&table->tb6_lock);
2433 			ip6_del_rt(rt);
2434 			goto restart;
2435 		}
2436 	}
2437 	read_unlock_bh(&table->tb6_lock);
2438 }
2439 
2440 static void rtmsg_to_fib6_config(struct net *net,
2441 				 struct in6_rtmsg *rtmsg,
2442 				 struct fib6_config *cfg)
2443 {
2444 	memset(cfg, 0, sizeof(*cfg));
2445 
2446 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2447 			 : RT6_TABLE_MAIN;
2448 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2449 	cfg->fc_metric = rtmsg->rtmsg_metric;
2450 	cfg->fc_expires = rtmsg->rtmsg_info;
2451 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2452 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
2453 	cfg->fc_flags = rtmsg->rtmsg_flags;
2454 
2455 	cfg->fc_nlinfo.nl_net = net;
2456 
2457 	cfg->fc_dst = rtmsg->rtmsg_dst;
2458 	cfg->fc_src = rtmsg->rtmsg_src;
2459 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
2460 }
2461 
2462 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2463 {
2464 	struct fib6_config cfg;
2465 	struct in6_rtmsg rtmsg;
2466 	int err;
2467 
2468 	switch (cmd) {
2469 	case SIOCADDRT:		/* Add a route */
2470 	case SIOCDELRT:		/* Delete a route */
2471 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2472 			return -EPERM;
2473 		err = copy_from_user(&rtmsg, arg,
2474 				     sizeof(struct in6_rtmsg));
2475 		if (err)
2476 			return -EFAULT;
2477 
2478 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2479 
2480 		rtnl_lock();
2481 		switch (cmd) {
2482 		case SIOCADDRT:
2483 			err = ip6_route_add(&cfg);
2484 			break;
2485 		case SIOCDELRT:
2486 			err = ip6_route_del(&cfg);
2487 			break;
2488 		default:
2489 			err = -EINVAL;
2490 		}
2491 		rtnl_unlock();
2492 
2493 		return err;
2494 	}
2495 
2496 	return -EINVAL;
2497 }
2498 
2499 /*
2500  *	Drop the packet on the floor
2501  */
2502 
2503 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2504 {
2505 	int type;
2506 	struct dst_entry *dst = skb_dst(skb);
2507 	switch (ipstats_mib_noroutes) {
2508 	case IPSTATS_MIB_INNOROUTES:
2509 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2510 		if (type == IPV6_ADDR_ANY) {
2511 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2512 				      IPSTATS_MIB_INADDRERRORS);
2513 			break;
2514 		}
2515 		/* FALLTHROUGH */
2516 	case IPSTATS_MIB_OUTNOROUTES:
2517 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2518 			      ipstats_mib_noroutes);
2519 		break;
2520 	}
2521 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2522 	kfree_skb(skb);
2523 	return 0;
2524 }
2525 
2526 static int ip6_pkt_discard(struct sk_buff *skb)
2527 {
2528 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2529 }
2530 
2531 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2532 {
2533 	skb->dev = skb_dst(skb)->dev;
2534 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2535 }
2536 
2537 static int ip6_pkt_prohibit(struct sk_buff *skb)
2538 {
2539 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2540 }
2541 
2542 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2543 {
2544 	skb->dev = skb_dst(skb)->dev;
2545 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2546 }
2547 
2548 /*
2549  *	Allocate a dst for local (unicast / anycast) address.
2550  */
2551 
2552 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2553 				    const struct in6_addr *addr,
2554 				    bool anycast)
2555 {
2556 	u32 tb_id;
2557 	struct net *net = dev_net(idev->dev);
2558 	struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2559 					    DST_NOCOUNT);
2560 	if (!rt)
2561 		return ERR_PTR(-ENOMEM);
2562 
2563 	in6_dev_hold(idev);
2564 
2565 	rt->dst.flags |= DST_HOST;
2566 	rt->dst.input = ip6_input;
2567 	rt->dst.output = ip6_output;
2568 	rt->rt6i_idev = idev;
2569 
2570 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2571 	if (anycast)
2572 		rt->rt6i_flags |= RTF_ANYCAST;
2573 	else
2574 		rt->rt6i_flags |= RTF_LOCAL;
2575 
2576 	rt->rt6i_gateway  = *addr;
2577 	rt->rt6i_dst.addr = *addr;
2578 	rt->rt6i_dst.plen = 128;
2579 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2580 	rt->rt6i_table = fib6_get_table(net, tb_id);
2581 	rt->dst.flags |= DST_NOCACHE;
2582 
2583 	atomic_set(&rt->dst.__refcnt, 1);
2584 
2585 	return rt;
2586 }
2587 
2588 int ip6_route_get_saddr(struct net *net,
2589 			struct rt6_info *rt,
2590 			const struct in6_addr *daddr,
2591 			unsigned int prefs,
2592 			struct in6_addr *saddr)
2593 {
2594 	struct inet6_dev *idev =
2595 		rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2596 	int err = 0;
2597 	if (rt && rt->rt6i_prefsrc.plen)
2598 		*saddr = rt->rt6i_prefsrc.addr;
2599 	else
2600 		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2601 					 daddr, prefs, saddr);
2602 	return err;
2603 }
2604 
/* remove deleted ip from prefsrc entries */
/* Argument bundle passed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
	struct net_device *dev;	/* device to match; NULL matches any device */
	struct net *net;	/* namespace whose ip6_null_entry is skipped */
	struct in6_addr *addr;	/* address compared against each route's prefsrc */
};
2611 
2612 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2613 {
2614 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2615 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2616 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2617 
2618 	if (((void *)rt->dst.dev == dev || !dev) &&
2619 	    rt != net->ipv6.ip6_null_entry &&
2620 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2621 		/* remove prefsrc entry */
2622 		rt->rt6i_prefsrc.plen = 0;
2623 	}
2624 	return 0;
2625 }
2626 
2627 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2628 {
2629 	struct net *net = dev_net(ifp->idev->dev);
2630 	struct arg_dev_net_ip adni = {
2631 		.dev = ifp->idev->dev,
2632 		.net = net,
2633 		.addr = &ifp->addr,
2634 	};
2635 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2636 }
2637 
/* Flag set identifying an autoconfigured default route through a gateway */
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
/* Flag set identifying a cache entry that goes through a gateway */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
2640 
2641 /* Remove routers and update dst entries when gateway turn into host. */
2642 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2643 {
2644 	struct in6_addr *gateway = (struct in6_addr *)arg;
2645 
2646 	if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2647 	     ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2648 	     ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2649 		return -1;
2650 	}
2651 	return 0;
2652 }
2653 
2654 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2655 {
2656 	fib6_clean_all(net, fib6_clean_tohost, gateway);
2657 }
2658 
/* Argument bundle passed to fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device to match; NULL matches any device */
	struct net *net;	/* namespace whose ip6_null_entry is skipped */
};
2663 
2664 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2665 {
2666 	const struct arg_dev_net *adn = arg;
2667 	const struct net_device *dev = adn->dev;
2668 
2669 	if ((rt->dst.dev == dev || !dev) &&
2670 	    rt != adn->net->ipv6.ip6_null_entry)
2671 		return -1;
2672 
2673 	return 0;
2674 }
2675 
2676 void rt6_ifdown(struct net *net, struct net_device *dev)
2677 {
2678 	struct arg_dev_net adn = {
2679 		.dev = dev,
2680 		.net = net,
2681 	};
2682 
2683 	fib6_clean_all(net, fib6_ifdown, &adn);
2684 	icmp6_clean_all(fib6_ifdown, &adn);
2685 	if (dev)
2686 		rt6_uncached_list_flush_dev(net, dev);
2687 }
2688 
/* Argument bundle passed to rt6_mtu_change_route() through fib6_clean_all(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
2693 
2694 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2695 {
2696 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2697 	struct inet6_dev *idev;
2698 
2699 	/* In IPv6 pmtu discovery is not optional,
2700 	   so that RTAX_MTU lock cannot disable it.
2701 	   We still use this lock to block changes
2702 	   caused by addrconf/ndisc.
2703 	*/
2704 
2705 	idev = __in6_dev_get(arg->dev);
2706 	if (!idev)
2707 		return 0;
2708 
2709 	/* For administrative MTU increase, there is no way to discover
2710 	   IPv6 PMTU increase, so PMTU increase should be updated here.
2711 	   Since RFC 1981 doesn't include administrative MTU increase
2712 	   update PMTU increase is a MUST. (i.e. jumbo frame)
2713 	 */
2714 	/*
2715 	   If new MTU is less than route PMTU, this new MTU will be the
2716 	   lowest MTU in the path, update the route PMTU to reflect PMTU
2717 	   decreases; if new MTU is greater than route PMTU, and the
2718 	   old MTU is the lowest MTU in the path, update the route PMTU
2719 	   to reflect the increase. In this case if the other nodes' MTU
2720 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
2721 	   PMTU discouvery.
2722 	 */
2723 	if (rt->dst.dev == arg->dev &&
2724 	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2725 		if (rt->rt6i_flags & RTF_CACHE) {
2726 			/* For RTF_CACHE with rt6i_pmtu == 0
2727 			 * (i.e. a redirected route),
2728 			 * the metrics of its rt->dst.from has already
2729 			 * been updated.
2730 			 */
2731 			if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2732 				rt->rt6i_pmtu = arg->mtu;
2733 		} else if (dst_mtu(&rt->dst) >= arg->mtu ||
2734 			   (dst_mtu(&rt->dst) < arg->mtu &&
2735 			    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2736 			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2737 		}
2738 	}
2739 	return 0;
2740 }
2741 
2742 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2743 {
2744 	struct rt6_mtu_change_arg arg = {
2745 		.dev = dev,
2746 		.mtu = mtu,
2747 	};
2748 
2749 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2750 }
2751 
2752 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2753 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2754 	[RTA_OIF]               = { .type = NLA_U32 },
2755 	[RTA_IIF]		= { .type = NLA_U32 },
2756 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2757 	[RTA_METRICS]           = { .type = NLA_NESTED },
2758 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
2759 	[RTA_PREF]              = { .type = NLA_U8 },
2760 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
2761 	[RTA_ENCAP]		= { .type = NLA_NESTED },
2762 	[RTA_EXPIRES]		= { .type = NLA_U32 },
2763 };
2764 
2765 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2766 			      struct fib6_config *cfg)
2767 {
2768 	struct rtmsg *rtm;
2769 	struct nlattr *tb[RTA_MAX+1];
2770 	unsigned int pref;
2771 	int err;
2772 
2773 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2774 	if (err < 0)
2775 		goto errout;
2776 
2777 	err = -EINVAL;
2778 	rtm = nlmsg_data(nlh);
2779 	memset(cfg, 0, sizeof(*cfg));
2780 
2781 	cfg->fc_table = rtm->rtm_table;
2782 	cfg->fc_dst_len = rtm->rtm_dst_len;
2783 	cfg->fc_src_len = rtm->rtm_src_len;
2784 	cfg->fc_flags = RTF_UP;
2785 	cfg->fc_protocol = rtm->rtm_protocol;
2786 	cfg->fc_type = rtm->rtm_type;
2787 
2788 	if (rtm->rtm_type == RTN_UNREACHABLE ||
2789 	    rtm->rtm_type == RTN_BLACKHOLE ||
2790 	    rtm->rtm_type == RTN_PROHIBIT ||
2791 	    rtm->rtm_type == RTN_THROW)
2792 		cfg->fc_flags |= RTF_REJECT;
2793 
2794 	if (rtm->rtm_type == RTN_LOCAL)
2795 		cfg->fc_flags |= RTF_LOCAL;
2796 
2797 	if (rtm->rtm_flags & RTM_F_CLONED)
2798 		cfg->fc_flags |= RTF_CACHE;
2799 
2800 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2801 	cfg->fc_nlinfo.nlh = nlh;
2802 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2803 
2804 	if (tb[RTA_GATEWAY]) {
2805 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2806 		cfg->fc_flags |= RTF_GATEWAY;
2807 	}
2808 
2809 	if (tb[RTA_DST]) {
2810 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2811 
2812 		if (nla_len(tb[RTA_DST]) < plen)
2813 			goto errout;
2814 
2815 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2816 	}
2817 
2818 	if (tb[RTA_SRC]) {
2819 		int plen = (rtm->rtm_src_len + 7) >> 3;
2820 
2821 		if (nla_len(tb[RTA_SRC]) < plen)
2822 			goto errout;
2823 
2824 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2825 	}
2826 
2827 	if (tb[RTA_PREFSRC])
2828 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2829 
2830 	if (tb[RTA_OIF])
2831 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2832 
2833 	if (tb[RTA_PRIORITY])
2834 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2835 
2836 	if (tb[RTA_METRICS]) {
2837 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2838 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2839 	}
2840 
2841 	if (tb[RTA_TABLE])
2842 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2843 
2844 	if (tb[RTA_MULTIPATH]) {
2845 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2846 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2847 	}
2848 
2849 	if (tb[RTA_PREF]) {
2850 		pref = nla_get_u8(tb[RTA_PREF]);
2851 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
2852 		    pref != ICMPV6_ROUTER_PREF_HIGH)
2853 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
2854 		cfg->fc_flags |= RTF_PREF(pref);
2855 	}
2856 
2857 	if (tb[RTA_ENCAP])
2858 		cfg->fc_encap = tb[RTA_ENCAP];
2859 
2860 	if (tb[RTA_ENCAP_TYPE])
2861 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
2862 
2863 	if (tb[RTA_EXPIRES]) {
2864 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
2865 
2866 		if (addrconf_finite_timeout(timeout)) {
2867 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
2868 			cfg->fc_flags |= RTF_EXPIRES;
2869 		}
2870 	}
2871 
2872 	err = 0;
2873 errout:
2874 	return err;
2875 }
2876 
/* One nexthop of a multipath request, queued by ip6_route_info_append(). */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route built for this nexthop; NULL once handed to __ip6_ins_rt() */
	struct fib6_config r_cfg;	/* per-nexthop copy of the request configuration */
	struct mx6_config mxc;		/* metrics converted from r_cfg */
	struct list_head next;		/* link in the caller's nexthop list */
};
2883 
2884 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
2885 {
2886 	struct rt6_nh *nh;
2887 
2888 	list_for_each_entry(nh, rt6_nh_list, next) {
2889 		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
2890 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
2891 		        nh->r_cfg.fc_ifindex);
2892 	}
2893 }
2894 
2895 static int ip6_route_info_append(struct list_head *rt6_nh_list,
2896 				 struct rt6_info *rt, struct fib6_config *r_cfg)
2897 {
2898 	struct rt6_nh *nh;
2899 	struct rt6_info *rtnh;
2900 	int err = -EEXIST;
2901 
2902 	list_for_each_entry(nh, rt6_nh_list, next) {
2903 		/* check if rt6_info already exists */
2904 		rtnh = nh->rt6_info;
2905 
2906 		if (rtnh->dst.dev == rt->dst.dev &&
2907 		    rtnh->rt6i_idev == rt->rt6i_idev &&
2908 		    ipv6_addr_equal(&rtnh->rt6i_gateway,
2909 				    &rt->rt6i_gateway))
2910 			return err;
2911 	}
2912 
2913 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
2914 	if (!nh)
2915 		return -ENOMEM;
2916 	nh->rt6_info = rt;
2917 	err = ip6_convert_metrics(&nh->mxc, r_cfg);
2918 	if (err) {
2919 		kfree(nh);
2920 		return err;
2921 	}
2922 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
2923 	list_add_tail(&nh->next, rt6_nh_list);
2924 
2925 	return 0;
2926 }
2927 
2928 static int ip6_route_multipath_add(struct fib6_config *cfg)
2929 {
2930 	struct fib6_config r_cfg;
2931 	struct rtnexthop *rtnh;
2932 	struct rt6_info *rt;
2933 	struct rt6_nh *err_nh;
2934 	struct rt6_nh *nh, *nh_safe;
2935 	int remaining;
2936 	int attrlen;
2937 	int err = 1;
2938 	int nhn = 0;
2939 	int replace = (cfg->fc_nlinfo.nlh &&
2940 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
2941 	LIST_HEAD(rt6_nh_list);
2942 
2943 	remaining = cfg->fc_mp_len;
2944 	rtnh = (struct rtnexthop *)cfg->fc_mp;
2945 
2946 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
2947 	 * rt6_info structs per nexthop
2948 	 */
2949 	while (rtnh_ok(rtnh, remaining)) {
2950 		memcpy(&r_cfg, cfg, sizeof(*cfg));
2951 		if (rtnh->rtnh_ifindex)
2952 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2953 
2954 		attrlen = rtnh_attrlen(rtnh);
2955 		if (attrlen > 0) {
2956 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2957 
2958 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2959 			if (nla) {
2960 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
2961 				r_cfg.fc_flags |= RTF_GATEWAY;
2962 			}
2963 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
2964 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
2965 			if (nla)
2966 				r_cfg.fc_encap_type = nla_get_u16(nla);
2967 		}
2968 
2969 		rt = ip6_route_info_create(&r_cfg);
2970 		if (IS_ERR(rt)) {
2971 			err = PTR_ERR(rt);
2972 			rt = NULL;
2973 			goto cleanup;
2974 		}
2975 
2976 		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
2977 		if (err) {
2978 			dst_free(&rt->dst);
2979 			goto cleanup;
2980 		}
2981 
2982 		rtnh = rtnh_next(rtnh, &remaining);
2983 	}
2984 
2985 	err_nh = NULL;
2986 	list_for_each_entry(nh, &rt6_nh_list, next) {
2987 		err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
2988 		/* nh->rt6_info is used or freed at this point, reset to NULL*/
2989 		nh->rt6_info = NULL;
2990 		if (err) {
2991 			if (replace && nhn)
2992 				ip6_print_replace_route_err(&rt6_nh_list);
2993 			err_nh = nh;
2994 			goto add_errout;
2995 		}
2996 
2997 		/* Because each route is added like a single route we remove
2998 		 * these flags after the first nexthop: if there is a collision,
2999 		 * we have already failed to add the first nexthop:
3000 		 * fib6_add_rt2node() has rejected it; when replacing, old
3001 		 * nexthops have been replaced by first new, the rest should
3002 		 * be added to it.
3003 		 */
3004 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3005 						     NLM_F_REPLACE);
3006 		nhn++;
3007 	}
3008 
3009 	goto cleanup;
3010 
3011 add_errout:
3012 	/* Delete routes that were already added */
3013 	list_for_each_entry(nh, &rt6_nh_list, next) {
3014 		if (err_nh == nh)
3015 			break;
3016 		ip6_route_del(&nh->r_cfg);
3017 	}
3018 
3019 cleanup:
3020 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3021 		if (nh->rt6_info)
3022 			dst_free(&nh->rt6_info->dst);
3023 		kfree(nh->mxc.mx);
3024 		list_del(&nh->next);
3025 		kfree(nh);
3026 	}
3027 
3028 	return err;
3029 }
3030 
3031 static int ip6_route_multipath_del(struct fib6_config *cfg)
3032 {
3033 	struct fib6_config r_cfg;
3034 	struct rtnexthop *rtnh;
3035 	int remaining;
3036 	int attrlen;
3037 	int err = 1, last_err = 0;
3038 
3039 	remaining = cfg->fc_mp_len;
3040 	rtnh = (struct rtnexthop *)cfg->fc_mp;
3041 
3042 	/* Parse a Multipath Entry */
3043 	while (rtnh_ok(rtnh, remaining)) {
3044 		memcpy(&r_cfg, cfg, sizeof(*cfg));
3045 		if (rtnh->rtnh_ifindex)
3046 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3047 
3048 		attrlen = rtnh_attrlen(rtnh);
3049 		if (attrlen > 0) {
3050 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3051 
3052 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3053 			if (nla) {
3054 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3055 				r_cfg.fc_flags |= RTF_GATEWAY;
3056 			}
3057 		}
3058 		err = ip6_route_del(&r_cfg);
3059 		if (err)
3060 			last_err = err;
3061 
3062 		rtnh = rtnh_next(rtnh, &remaining);
3063 	}
3064 
3065 	return last_err;
3066 }
3067 
3068 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3069 {
3070 	struct fib6_config cfg;
3071 	int err;
3072 
3073 	err = rtm_to_fib6_config(skb, nlh, &cfg);
3074 	if (err < 0)
3075 		return err;
3076 
3077 	if (cfg.fc_mp)
3078 		return ip6_route_multipath_del(&cfg);
3079 	else
3080 		return ip6_route_del(&cfg);
3081 }
3082 
3083 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3084 {
3085 	struct fib6_config cfg;
3086 	int err;
3087 
3088 	err = rtm_to_fib6_config(skb, nlh, &cfg);
3089 	if (err < 0)
3090 		return err;
3091 
3092 	if (cfg.fc_mp)
3093 		return ip6_route_multipath_add(&cfg);
3094 	else
3095 		return ip6_route_add(&cfg);
3096 }
3097 
3098 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
3099 {
3100 	return NLMSG_ALIGN(sizeof(struct rtmsg))
3101 	       + nla_total_size(16) /* RTA_SRC */
3102 	       + nla_total_size(16) /* RTA_DST */
3103 	       + nla_total_size(16) /* RTA_GATEWAY */
3104 	       + nla_total_size(16) /* RTA_PREFSRC */
3105 	       + nla_total_size(4) /* RTA_TABLE */
3106 	       + nla_total_size(4) /* RTA_IIF */
3107 	       + nla_total_size(4) /* RTA_OIF */
3108 	       + nla_total_size(4) /* RTA_PRIORITY */
3109 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3110 	       + nla_total_size(sizeof(struct rta_cacheinfo))
3111 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3112 	       + nla_total_size(1) /* RTA_PREF */
3113 	       + lwtunnel_get_encap_size(rt->dst.lwtstate);
3114 }
3115 
3116 static int rt6_fill_node(struct net *net,
3117 			 struct sk_buff *skb, struct rt6_info *rt,
3118 			 struct in6_addr *dst, struct in6_addr *src,
3119 			 int iif, int type, u32 portid, u32 seq,
3120 			 int prefix, int nowait, unsigned int flags)
3121 {
3122 	u32 metrics[RTAX_MAX];
3123 	struct rtmsg *rtm;
3124 	struct nlmsghdr *nlh;
3125 	long expires;
3126 	u32 table;
3127 
3128 	if (prefix) {	/* user wants prefix routes only */
3129 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
3130 			/* success since this is not a prefix route */
3131 			return 1;
3132 		}
3133 	}
3134 
3135 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3136 	if (!nlh)
3137 		return -EMSGSIZE;
3138 
3139 	rtm = nlmsg_data(nlh);
3140 	rtm->rtm_family = AF_INET6;
3141 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
3142 	rtm->rtm_src_len = rt->rt6i_src.plen;
3143 	rtm->rtm_tos = 0;
3144 	if (rt->rt6i_table)
3145 		table = rt->rt6i_table->tb6_id;
3146 	else
3147 		table = RT6_TABLE_UNSPEC;
3148 	rtm->rtm_table = table;
3149 	if (nla_put_u32(skb, RTA_TABLE, table))
3150 		goto nla_put_failure;
3151 	if (rt->rt6i_flags & RTF_REJECT) {
3152 		switch (rt->dst.error) {
3153 		case -EINVAL:
3154 			rtm->rtm_type = RTN_BLACKHOLE;
3155 			break;
3156 		case -EACCES:
3157 			rtm->rtm_type = RTN_PROHIBIT;
3158 			break;
3159 		case -EAGAIN:
3160 			rtm->rtm_type = RTN_THROW;
3161 			break;
3162 		default:
3163 			rtm->rtm_type = RTN_UNREACHABLE;
3164 			break;
3165 		}
3166 	}
3167 	else if (rt->rt6i_flags & RTF_LOCAL)
3168 		rtm->rtm_type = RTN_LOCAL;
3169 	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3170 		rtm->rtm_type = RTN_LOCAL;
3171 	else
3172 		rtm->rtm_type = RTN_UNICAST;
3173 	rtm->rtm_flags = 0;
3174 	if (!netif_carrier_ok(rt->dst.dev)) {
3175 		rtm->rtm_flags |= RTNH_F_LINKDOWN;
3176 		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3177 			rtm->rtm_flags |= RTNH_F_DEAD;
3178 	}
3179 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3180 	rtm->rtm_protocol = rt->rt6i_protocol;
3181 	if (rt->rt6i_flags & RTF_DYNAMIC)
3182 		rtm->rtm_protocol = RTPROT_REDIRECT;
3183 	else if (rt->rt6i_flags & RTF_ADDRCONF) {
3184 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3185 			rtm->rtm_protocol = RTPROT_RA;
3186 		else
3187 			rtm->rtm_protocol = RTPROT_KERNEL;
3188 	}
3189 
3190 	if (rt->rt6i_flags & RTF_CACHE)
3191 		rtm->rtm_flags |= RTM_F_CLONED;
3192 
3193 	if (dst) {
3194 		if (nla_put_in6_addr(skb, RTA_DST, dst))
3195 			goto nla_put_failure;
3196 		rtm->rtm_dst_len = 128;
3197 	} else if (rtm->rtm_dst_len)
3198 		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3199 			goto nla_put_failure;
3200 #ifdef CONFIG_IPV6_SUBTREES
3201 	if (src) {
3202 		if (nla_put_in6_addr(skb, RTA_SRC, src))
3203 			goto nla_put_failure;
3204 		rtm->rtm_src_len = 128;
3205 	} else if (rtm->rtm_src_len &&
3206 		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3207 		goto nla_put_failure;
3208 #endif
3209 	if (iif) {
3210 #ifdef CONFIG_IPV6_MROUTE
3211 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3212 			int err = ip6mr_get_route(net, skb, rtm, nowait);
3213 			if (err <= 0) {
3214 				if (!nowait) {
3215 					if (err == 0)
3216 						return 0;
3217 					goto nla_put_failure;
3218 				} else {
3219 					if (err == -EMSGSIZE)
3220 						goto nla_put_failure;
3221 				}
3222 			}
3223 		} else
3224 #endif
3225 			if (nla_put_u32(skb, RTA_IIF, iif))
3226 				goto nla_put_failure;
3227 	} else if (dst) {
3228 		struct in6_addr saddr_buf;
3229 		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3230 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3231 			goto nla_put_failure;
3232 	}
3233 
3234 	if (rt->rt6i_prefsrc.plen) {
3235 		struct in6_addr saddr_buf;
3236 		saddr_buf = rt->rt6i_prefsrc.addr;
3237 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3238 			goto nla_put_failure;
3239 	}
3240 
3241 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3242 	if (rt->rt6i_pmtu)
3243 		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3244 	if (rtnetlink_put_metrics(skb, metrics) < 0)
3245 		goto nla_put_failure;
3246 
3247 	if (rt->rt6i_flags & RTF_GATEWAY) {
3248 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3249 			goto nla_put_failure;
3250 	}
3251 
3252 	if (rt->dst.dev &&
3253 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3254 		goto nla_put_failure;
3255 	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3256 		goto nla_put_failure;
3257 
3258 	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3259 
3260 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3261 		goto nla_put_failure;
3262 
3263 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3264 		goto nla_put_failure;
3265 
3266 	lwtunnel_fill_encap(skb, rt->dst.lwtstate);
3267 
3268 	nlmsg_end(skb, nlh);
3269 	return 0;
3270 
3271 nla_put_failure:
3272 	nlmsg_cancel(skb, nlh);
3273 	return -EMSGSIZE;
3274 }
3275 
3276 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3277 {
3278 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3279 	int prefix;
3280 
3281 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3282 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3283 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
3284 	} else
3285 		prefix = 0;
3286 
3287 	return rt6_fill_node(arg->net,
3288 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3289 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3290 		     prefix, 0, NLM_F_MULTI);
3291 }
3292 
3293 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3294 {
3295 	struct net *net = sock_net(in_skb->sk);
3296 	struct nlattr *tb[RTA_MAX+1];
3297 	struct rt6_info *rt;
3298 	struct sk_buff *skb;
3299 	struct rtmsg *rtm;
3300 	struct flowi6 fl6;
3301 	int err, iif = 0, oif = 0;
3302 
3303 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3304 	if (err < 0)
3305 		goto errout;
3306 
3307 	err = -EINVAL;
3308 	memset(&fl6, 0, sizeof(fl6));
3309 
3310 	if (tb[RTA_SRC]) {
3311 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3312 			goto errout;
3313 
3314 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3315 	}
3316 
3317 	if (tb[RTA_DST]) {
3318 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3319 			goto errout;
3320 
3321 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3322 	}
3323 
3324 	if (tb[RTA_IIF])
3325 		iif = nla_get_u32(tb[RTA_IIF]);
3326 
3327 	if (tb[RTA_OIF])
3328 		oif = nla_get_u32(tb[RTA_OIF]);
3329 
3330 	if (tb[RTA_MARK])
3331 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3332 
3333 	if (iif) {
3334 		struct net_device *dev;
3335 		int flags = 0;
3336 
3337 		dev = __dev_get_by_index(net, iif);
3338 		if (!dev) {
3339 			err = -ENODEV;
3340 			goto errout;
3341 		}
3342 
3343 		fl6.flowi6_iif = iif;
3344 
3345 		if (!ipv6_addr_any(&fl6.saddr))
3346 			flags |= RT6_LOOKUP_F_HAS_SADDR;
3347 
3348 		rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3349 							       flags);
3350 	} else {
3351 		fl6.flowi6_oif = oif;
3352 
3353 		if (netif_index_is_l3_master(net, oif)) {
3354 			fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC |
3355 					   FLOWI_FLAG_SKIP_NH_OIF;
3356 		}
3357 
3358 		rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3359 	}
3360 
3361 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3362 	if (!skb) {
3363 		ip6_rt_put(rt);
3364 		err = -ENOBUFS;
3365 		goto errout;
3366 	}
3367 
3368 	/* Reserve room for dummy headers, this skb can pass
3369 	   through good chunk of routing engine.
3370 	 */
3371 	skb_reset_mac_header(skb);
3372 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3373 
3374 	skb_dst_set(skb, &rt->dst);
3375 
3376 	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3377 			    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3378 			    nlh->nlmsg_seq, 0, 0, 0);
3379 	if (err < 0) {
3380 		kfree_skb(skb);
3381 		goto errout;
3382 	}
3383 
3384 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3385 errout:
3386 	return err;
3387 }
3388 
3389 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3390 		     unsigned int nlm_flags)
3391 {
3392 	struct sk_buff *skb;
3393 	struct net *net = info->nl_net;
3394 	u32 seq;
3395 	int err;
3396 
3397 	err = -ENOBUFS;
3398 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3399 
3400 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3401 	if (!skb)
3402 		goto errout;
3403 
3404 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3405 				event, info->portid, seq, 0, 0, nlm_flags);
3406 	if (err < 0) {
3407 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3408 		WARN_ON(err == -EMSGSIZE);
3409 		kfree_skb(skb);
3410 		goto errout;
3411 	}
3412 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3413 		    info->nlh, gfp_any());
3414 	return;
3415 errout:
3416 	if (err < 0)
3417 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3418 }
3419 
3420 static int ip6_route_dev_notify(struct notifier_block *this,
3421 				unsigned long event, void *ptr)
3422 {
3423 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3424 	struct net *net = dev_net(dev);
3425 
3426 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3427 		net->ipv6.ip6_null_entry->dst.dev = dev;
3428 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3429 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3430 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3431 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3432 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3433 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3434 #endif
3435 	}
3436 
3437 	return NOTIFY_OK;
3438 }
3439 
3440 /*
3441  *	/proc
3442  */
3443 
3444 #ifdef CONFIG_PROC_FS
3445 
/*
 * File operations for the "ipv6_route" proc entry created in
 * ip6_route_net_init_late().  The open handler, ipv6_route_open, is defined
 * outside this part of the file; reading and seeking go through the generic
 * seq_file helpers.
 */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
3453 
3454 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3455 {
3456 	struct net *net = (struct net *)seq->private;
3457 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3458 		   net->ipv6.rt6_stats->fib_nodes,
3459 		   net->ipv6.rt6_stats->fib_route_nodes,
3460 		   net->ipv6.rt6_stats->fib_rt_alloc,
3461 		   net->ipv6.rt6_stats->fib_rt_entries,
3462 		   net->ipv6.rt6_stats->fib_rt_cache,
3463 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3464 		   net->ipv6.rt6_stats->fib_discarded_routes);
3465 
3466 	return 0;
3467 }
3468 
/*
 * Open handler for the "rt6_stats" proc entry: sets up a single-record,
 * namespace-aware seq_file whose show callback is rt6_stats_seq_show().
 * Returns the result of single_open_net().
 */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
3473 
/*
 * File operations for the "rt6_stats" proc entry created in
 * ip6_route_net_init_late(); opened via rt6_stats_seq_open() and released
 * with the matching single_release_net().
 */
static const struct file_operations rt6_stats_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
3481 #endif	/* CONFIG_PROC_FS */
3482 
3483 #ifdef CONFIG_SYSCTL
3484 
3485 static
3486 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3487 			      void __user *buffer, size_t *lenp, loff_t *ppos)
3488 {
3489 	struct net *net;
3490 	int delay;
3491 	if (!write)
3492 		return -EINVAL;
3493 
3494 	net = (struct net *)ctl->extra1;
3495 	delay = net->ipv6.sysctl.flush_delay;
3496 	proc_dointvec(ctl, write, buffer, lenp, ppos);
3497 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3498 	return 0;
3499 }
3500 
/*
 * Template for the per-namespace IPv6 route sysctl table.
 *
 * ipv6_route_sysctl_init() duplicates this array and re-points each entry's
 * .data at the new namespace by array index (table[0] .. table[9]), so the
 * order of the entries below must stay in step with that function.  The
 * .data pointers given here refer to init_net (or the dst_ops template).
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		/* [0] write-only; also gets .extra1 = net at init time */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		/* [1] */
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* [2] */
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* [3] same variable as [9], exposed through the jiffies handler */
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		/* [4] */
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		/* [5] */
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		/* [6] */
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* [7] */
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		/* [8] */
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* [9] same variable as [3], exposed through the ms_jiffies handler */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }	/* empty terminating entry */
};
3574 
3575 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3576 {
3577 	struct ctl_table *table;
3578 
3579 	table = kmemdup(ipv6_route_table_template,
3580 			sizeof(ipv6_route_table_template),
3581 			GFP_KERNEL);
3582 
3583 	if (table) {
3584 		table[0].data = &net->ipv6.sysctl.flush_delay;
3585 		table[0].extra1 = net;
3586 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3587 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3588 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3589 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3590 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3591 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3592 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3593 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3594 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3595 
3596 		/* Don't export sysctls to unprivileged users */
3597 		if (net->user_ns != &init_user_ns)
3598 			table[0].procname = NULL;
3599 	}
3600 
3601 	return table;
3602 }
3603 #endif
3604 
3605 static int __net_init ip6_route_net_init(struct net *net)
3606 {
3607 	int ret = -ENOMEM;
3608 
3609 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3610 	       sizeof(net->ipv6.ip6_dst_ops));
3611 
3612 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3613 		goto out_ip6_dst_ops;
3614 
3615 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3616 					   sizeof(*net->ipv6.ip6_null_entry),
3617 					   GFP_KERNEL);
3618 	if (!net->ipv6.ip6_null_entry)
3619 		goto out_ip6_dst_entries;
3620 	net->ipv6.ip6_null_entry->dst.path =
3621 		(struct dst_entry *)net->ipv6.ip6_null_entry;
3622 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3623 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3624 			 ip6_template_metrics, true);
3625 
3626 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3627 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3628 					       sizeof(*net->ipv6.ip6_prohibit_entry),
3629 					       GFP_KERNEL);
3630 	if (!net->ipv6.ip6_prohibit_entry)
3631 		goto out_ip6_null_entry;
3632 	net->ipv6.ip6_prohibit_entry->dst.path =
3633 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3634 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3635 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3636 			 ip6_template_metrics, true);
3637 
3638 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3639 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
3640 					       GFP_KERNEL);
3641 	if (!net->ipv6.ip6_blk_hole_entry)
3642 		goto out_ip6_prohibit_entry;
3643 	net->ipv6.ip6_blk_hole_entry->dst.path =
3644 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3645 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3646 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3647 			 ip6_template_metrics, true);
3648 #endif
3649 
3650 	net->ipv6.sysctl.flush_delay = 0;
3651 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
3652 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3653 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3654 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3655 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3656 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3657 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3658 
3659 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
3660 
3661 	ret = 0;
3662 out:
3663 	return ret;
3664 
3665 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3666 out_ip6_prohibit_entry:
3667 	kfree(net->ipv6.ip6_prohibit_entry);
3668 out_ip6_null_entry:
3669 	kfree(net->ipv6.ip6_null_entry);
3670 #endif
3671 out_ip6_dst_entries:
3672 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3673 out_ip6_dst_ops:
3674 	goto out;
3675 }
3676 
/*
 * Per-namespace teardown matching ip6_route_net_init(): frees the special
 * route entries allocated there and destroys the dst_ops entry counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
3686 
/*
 * Late per-namespace init: with CONFIG_PROC_FS, creates the "ipv6_route"
 * and "rt6_stats" entries under the namespace's proc_net directory.
 * The proc_create() results are not checked; always returns 0.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
#endif
	return 0;
}
3695 
/*
 * Late per-namespace exit: removes the proc entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
3703 
/* Per-namespace hooks for the routing core; registered in ip6_route_init(). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
3708 
3709 static int __net_init ipv6_inetpeer_init(struct net *net)
3710 {
3711 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3712 
3713 	if (!bp)
3714 		return -ENOMEM;
3715 	inet_peer_base_init(bp);
3716 	net->ipv6.peers = bp;
3717 	return 0;
3718 }
3719 
3720 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3721 {
3722 	struct inet_peer_base *bp = net->ipv6.peers;
3723 
3724 	net->ipv6.peers = NULL;
3725 	inetpeer_invalidate_tree(bp);
3726 	kfree(bp);
3727 }
3728 
/* Per-namespace hooks for the inet_peer base; registered in ip6_route_init(). */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
3733 
/*
 * Per-namespace hooks for the proc entries; registered in ip6_route_init()
 * after fib6, xfrm6 and the fib6 rules have been initialised.
 */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
3738 
/* Netdevice notifier that runs ip6_route_dev_notify(); see ip6_route_init(). */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = 0,
};
3743 
3744 int __init ip6_route_init(void)
3745 {
3746 	int ret;
3747 	int cpu;
3748 
3749 	ret = -ENOMEM;
3750 	ip6_dst_ops_template.kmem_cachep =
3751 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3752 				  SLAB_HWCACHE_ALIGN, NULL);
3753 	if (!ip6_dst_ops_template.kmem_cachep)
3754 		goto out;
3755 
3756 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
3757 	if (ret)
3758 		goto out_kmem_cache;
3759 
3760 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3761 	if (ret)
3762 		goto out_dst_entries;
3763 
3764 	ret = register_pernet_subsys(&ip6_route_net_ops);
3765 	if (ret)
3766 		goto out_register_inetpeer;
3767 
3768 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3769 
3770 	/* Registering of the loopback is done before this portion of code,
3771 	 * the loopback reference in rt6_info will not be taken, do it
3772 	 * manually for init_net */
3773 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3774 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3775   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3776 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3777 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3778 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3779 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3780   #endif
3781 	ret = fib6_init();
3782 	if (ret)
3783 		goto out_register_subsys;
3784 
3785 	ret = xfrm6_init();
3786 	if (ret)
3787 		goto out_fib6_init;
3788 
3789 	ret = fib6_rules_init();
3790 	if (ret)
3791 		goto xfrm6_init;
3792 
3793 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
3794 	if (ret)
3795 		goto fib6_rules_init;
3796 
3797 	ret = -ENOBUFS;
3798 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3799 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3800 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3801 		goto out_register_late_subsys;
3802 
3803 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3804 	if (ret)
3805 		goto out_register_late_subsys;
3806 
3807 	for_each_possible_cpu(cpu) {
3808 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3809 
3810 		INIT_LIST_HEAD(&ul->head);
3811 		spin_lock_init(&ul->lock);
3812 	}
3813 
3814 out:
3815 	return ret;
3816 
3817 out_register_late_subsys:
3818 	unregister_pernet_subsys(&ip6_route_net_late_ops);
3819 fib6_rules_init:
3820 	fib6_rules_cleanup();
3821 xfrm6_init:
3822 	xfrm6_fini();
3823 out_fib6_init:
3824 	fib6_gc_cleanup();
3825 out_register_subsys:
3826 	unregister_pernet_subsys(&ip6_route_net_ops);
3827 out_register_inetpeer:
3828 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
3829 out_dst_entries:
3830 	dst_entries_destroy(&ip6_dst_blackhole_ops);
3831 out_kmem_cache:
3832 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3833 	goto out;
3834 }
3835 
/*
 * Tear down what ip6_route_init() set up: the netdevice notifier, the late
 * per-namespace ops, fib6 rules, xfrm6, fib6, the remaining per-namespace
 * ops, the blackhole dst entry counter and finally the rt6_info slab cache.
 *
 * NOTE(review): ipv6_inetpeer_ops is unregistered before ip6_route_net_ops
 * here, whereas the error unwind in ip6_route_init() does it the other way
 * round; the sequence is left untouched.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
3848