xref: /linux/net/ipv6/route.c (revision d66f6c0a8f3c0bcc4ee7a9b1da4b0ebe7ee555a3)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66 
67 #include <asm/uaccess.h>
68 
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72 
/* Next-hop reachability verdicts used when scoring routes; negative
 * values are failures of increasing severity (see rt6_check_neigh()
 * and how find_match() reacts to each).
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route unusable, skip it entirely */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour in NUD_FAILED, needs probing */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour entry: round-robin instead */
	RT6_NUD_SUCCEED = 1		/* next hop (probably) reachable */
};
79 
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void		ip6_dst_destroy(struct dst_entry *);
86 static void		ip6_dst_ifdown(struct dst_entry *,
87 				       struct net_device *dev, int how);
88 static int		 ip6_dst_gc(struct dst_ops *ops);
89 
90 static int		ip6_pkt_discard(struct sk_buff *skb);
91 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int		ip6_pkt_prohibit(struct sk_buff *skb);
93 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void		ip6_link_failure(struct sk_buff *skb);
95 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96 					   struct sk_buff *skb, u32 mtu);
97 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98 					struct sk_buff *skb);
99 static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101 
102 #ifdef CONFIG_IPV6_ROUTE_INFO
103 static struct rt6_info *rt6_add_route_info(struct net *net,
104 					   const struct in6_addr *prefix, int prefixlen,
105 					   const struct in6_addr *gwaddr, int ifindex,
106 					   unsigned int pref);
107 static struct rt6_info *rt6_get_route_info(struct net *net,
108 					   const struct in6_addr *prefix, int prefixlen,
109 					   const struct in6_addr *gwaddr, int ifindex);
110 #endif
111 
/* Per-CPU list of "uncached" (DST_NOCACHE) rt6_info entries that live
 * outside the fib6 tree; each CPU's list has its own lock so additions
 * on different CPUs do not contend.
 */
struct uncached_list {
	spinlock_t		lock;	/* protects @head */
	struct list_head	head;	/* chain of rt6_info.rt6i_uncached */
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
118 
119 static void rt6_uncached_list_add(struct rt6_info *rt)
120 {
121 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
122 
123 	rt->dst.flags |= DST_NOCACHE;
124 	rt->rt6i_uncached_list = ul;
125 
126 	spin_lock_bh(&ul->lock);
127 	list_add_tail(&rt->rt6i_uncached, &ul->head);
128 	spin_unlock_bh(&ul->lock);
129 }
130 
131 static void rt6_uncached_list_del(struct rt6_info *rt)
132 {
133 	if (!list_empty(&rt->rt6i_uncached)) {
134 		struct uncached_list *ul = rt->rt6i_uncached_list;
135 
136 		spin_lock_bh(&ul->lock);
137 		list_del(&rt->rt6i_uncached);
138 		spin_unlock_bh(&ul->lock);
139 	}
140 }
141 
/* @dev is going away: rehome every uncached route that still points at
 * it onto the namespace's loopback device, swapping both the inet6_dev
 * and net_device references so @dev's refcount can reach zero.
 * No-op when @dev is the loopback device itself.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* swap the inet6_dev reference over to loopback */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* swap the net_device reference over to loopback */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
173 
174 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
175 {
176 	return dst_metrics_write_ptr(rt->dst.from);
177 }
178 
179 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
180 {
181 	struct rt6_info *rt = (struct rt6_info *)dst;
182 
183 	if (rt->rt6i_flags & RTF_PCPU)
184 		return rt6_pcpu_cow_metrics(rt);
185 	else if (rt->rt6i_flags & RTF_CACHE)
186 		return NULL;
187 	else
188 		return dst_cow_metrics_generic(dst, old);
189 }
190 
191 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
192 					     struct sk_buff *skb,
193 					     const void *daddr)
194 {
195 	struct in6_addr *p = &rt->rt6i_gateway;
196 
197 	if (!ipv6_addr_any(p))
198 		return (const void *) p;
199 	else if (skb)
200 		return &ipv6_hdr(skb)->daddr;
201 	return daddr;
202 }
203 
204 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
205 					  struct sk_buff *skb,
206 					  const void *daddr)
207 {
208 	struct rt6_info *rt = (struct rt6_info *) dst;
209 	struct neighbour *n;
210 
211 	daddr = choose_neigh_daddr(rt, skb, daddr);
212 	n = __ipv6_neigh_lookup(dst->dev, daddr);
213 	if (n)
214 		return n;
215 	return neigh_create(&nd_tbl, daddr, dst->dev);
216 }
217 
/* dst_ops for ordinary IPv6 routes; instantiated per namespace as
 * net->ipv6.ip6_dst_ops (see __ip6_dst_alloc()).
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
};
235 
236 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
237 {
238 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
239 
240 	return mtu ? : dst->dev->mtu;
241 }
242 
/* Blackholed dsts deliberately ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
247 
/* Blackholed dsts deliberately ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
252 
/* dst_ops for blackholed dsts: no GC, and PMTU/redirect events are
 * no-ops (see the stub handlers above).
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};
264 
/* Metrics shared by the reject-route templates below; only an explicit
 * zero RTAX_HOPLIMIT is set.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
268 
/* Template for the per-netns null route: rejects all traffic with
 * -ENETUNREACH.  Lookups fall back to this entry when nothing matches
 * (see rt6_select() / ip6_pol_route()).
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,		/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
283 
284 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
285 
/* Template for the "prohibit" reject route: drops traffic with
 * -EACCES (administratively prohibited).
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,		/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
300 
/* Template for the "blackhole" reject route: silently discards
 * traffic (dst_discard) with error -EINVAL.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,		/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
315 
316 #endif
317 
/* Zero the rt6_info members that follow the embedded dst_entry (which
 * dst_alloc() already set up) and initialize the list heads.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* wipe everything after the leading dst_entry, nothing else */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
326 
327 /* allocate dst with ip6_dst_ops */
328 static struct rt6_info *__ip6_dst_alloc(struct net *net,
329 					struct net_device *dev,
330 					int flags)
331 {
332 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
333 					0, DST_OBSOLETE_FORCE_CHK, flags);
334 
335 	if (rt)
336 		rt6_info_init(rt);
337 
338 	return rt;
339 }
340 
/* Allocate a rt6_info together with its per-CPU array of clone slots
 * (rt6i_pcpu).  Returns NULL if either allocation fails; on percpu
 * failure the partially built dst is torn down via dst_destroy().
 */
struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (rt->rt6i_pcpu) {
			int cpu;

			/* explicitly start every CPU slot out empty */
			for_each_possible_cpu(cpu) {
				struct rt6_info **p;

				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
				/* no one shares rt */
				*p =  NULL;
			}
		} else {
			dst_destroy((struct dst_entry *)rt);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
368 
/* dst_ops->destroy: release everything a rt6_info holds - generic
 * metrics, the per-CPU clone array, its uncached-list linkage, the
 * inet6_dev reference, and the reference on the parent ("from") dst.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* drop the reference on the route this one was derived from */
	dst->from = NULL;
	dst_release(from);
}
388 
/* dst_ops->ifdown: @dev is going down; if this route's inet6_dev still
 * refers to it, move the reference over to the loopback device of
 * @dev's namespace so @dev can be released.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (dev != loopback_dev) {
		if (idev && idev->dev == dev) {
			struct inet6_dev *loopback_idev =
				in6_dev_get(loopback_dev);
			if (loopback_idev) {
				rt->rt6i_idev = loopback_idev;
				in6_dev_put(idev);
			}
		}
	}
}
408 
409 static bool __rt6_check_expired(const struct rt6_info *rt)
410 {
411 	if (rt->rt6i_flags & RTF_EXPIRES)
412 		return time_after(jiffies, rt->dst.expires);
413 	else
414 		return false;
415 }
416 
417 static bool rt6_check_expired(const struct rt6_info *rt)
418 {
419 	if (rt->rt6i_flags & RTF_EXPIRES) {
420 		if (time_after(jiffies, rt->dst.expires))
421 			return true;
422 	} else if (rt->dst.from) {
423 		return rt6_check_expired((struct rt6_info *) rt->dst.from);
424 	}
425 	return false;
426 }
427 
/* Multipath route selection:
 *   Hash based function using packet header and flowlabel.
 * Adapted from fib_info_hashfn()
 */
static int rt6_info_hash_nhsfn(unsigned int candidate_count,
			       const struct flowi6 *fl6)
{
	unsigned int hash = get_hash_from_flowi6(fl6);

	return hash % candidate_count;
}
437 
438 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
439 					     struct flowi6 *fl6, int oif,
440 					     int strict)
441 {
442 	struct rt6_info *sibling, *next_sibling;
443 	int route_choosen;
444 
445 	route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
446 	/* Don't change the route, if route_choosen == 0
447 	 * (siblings does not include ourself)
448 	 */
449 	if (route_choosen)
450 		list_for_each_entry_safe(sibling, next_sibling,
451 				&match->rt6i_siblings, rt6i_siblings) {
452 			route_choosen--;
453 			if (route_choosen == 0) {
454 				if (rt6_score_route(sibling, oif, strict) < 0)
455 					break;
456 				match = sibling;
457 				break;
458 			}
459 		}
460 	return match;
461 }
462 
463 /*
464  *	Route lookup. Any table->tb6_lock is implied.
465  */
466 
/* Walk the leaf chain starting at @rt and pick the entry whose device
 * matches the requested output interface @oif, or (with no @oif) one
 * whose device owns the source address @saddr.  A loopback route whose
 * underlying interface matches is remembered as a second choice.
 * Falls back to @rt itself when no constraint applies, or to the null
 * entry when RT6_LOOKUP_F_IFACE makes the device match mandatory and
 * nothing matched.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* nothing to match against: return the head of the chain */
	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			/* exact device match wins immediately */
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					/* keep an earlier local whose idev
					 * already matched @oif
					 */
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			/* no oif: match on source address ownership */
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
513 
514 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred neighbour-solicitation request scheduled by rt6_probe();
 * holds a reference on @dev until rt6_probe_deferred() runs.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;		/* gateway address to probe */
	struct net_device *dev;		/* held; released by the worker */
};
520 
521 static void rt6_probe_deferred(struct work_struct *w)
522 {
523 	struct in6_addr mcaddr;
524 	struct __rt6_probe_work *work =
525 		container_of(w, struct __rt6_probe_work, work);
526 
527 	addrconf_addr_solict_mult(&work->target, &mcaddr);
528 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
529 	dev_put(work->dev);
530 	kfree(work);
531 }
532 
/* Schedule a (rate-limited) reachability probe of @rt's gateway via a
 * deferred neighbour solicitation.  Only gateway routes are probed.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		/* rate limit: re-check under the lock and only probe if
		 * the entry has not been updated within rtr_probe_interval
		 */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* no neighbour entry yet: always schedule a probe */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);	/* released in rt6_probe_deferred() */
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
579 #else
/* CONFIG_IPV6_ROUTER_PREF disabled: router probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
583 #endif
584 
/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
/* Score how well @rt's device matches @oif: 2 = direct match (or no
 * oif requested), 1 = loopback route whose underlying interface index
 * matches, 0 = no match.
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;
	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}
598 
/* Classify the reachability of @rt's next hop from its neighbour cache
 * entry.  Non-gateway (and RTF_NONEXTHOP) routes always succeed.  With
 * CONFIG_IPV6_ROUTER_PREF, anything short of NUD_FAILED still counts
 * as reachable and a missing entry succeeds; without it, a missing
 * entry asks the caller to round-robin (RT6_NUD_FAIL_DO_RR).
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
629 
/* Compute a comparable score for @rt: device match in the low two bits
 * (see rt6_check_dev()) and, with ROUTER_PREF, the decoded route
 * preference above them.  Returns a negative rt6_nud_state value when
 * the route is unusable under the @strict flags.
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
648 
/* Score @rt and return whichever of @rt / @match scores higher,
 * tracking the best score seen so far in *mpri.  Expired routes and
 * (when configured) link-down routes are skipped.  When the winning
 * route scored RT6_NUD_FAIL_DO_RR, *do_rr is set so the caller
 * rotates the round-robin pointer.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown)
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
685 
/* Scan the routes with metric @metric in @fn's leaf list, starting at
 * @rr_head and wrapping around to the list head, and return the best
 * scoring one.  If nothing in the @metric group matched, continue the
 * scan with the remaining (different-metric) routes from @cont.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* first leg: from the round-robin head to the end of the group */
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* second leg: from the list head up to the round-robin head */
	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* nothing in the metric group matched: consider the rest */
	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
722 
/* Round-robin aware route selection for fib6 node @fn: score the leaf
 * routes sharing the current round-robin head's metric, rotate
 * fn->rr_ptr when the chosen route asked for it, and fall back to the
 * null entry when nothing matched.
 */
static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
	struct net *net;
	bool do_rr = false;

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;

	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	net = dev_net(rt0->dst.dev);
	return match ? match : net->ipv6.ip6_null_entry;
}
750 
751 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
752 {
753 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
754 }
755 
756 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option received in a Router
 * Advertisement (RFC 4191): validate the option, then add, refresh or
 * withdraw the corresponding RTF_ROUTEINFO route via gateway @gwaddr.
 * Returns 0 on success, -EINVAL on a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	/* length 3 carries a full in6_addr; shorter options only carry
	 * the significant prefix bits and must be expanded
	 */
	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev->ifindex);

	/* a zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
830 #endif
831 
832 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
833 					struct in6_addr *saddr)
834 {
835 	struct fib6_node *pn;
836 	while (1) {
837 		if (fn->fn_flags & RTN_TL_ROOT)
838 			return NULL;
839 		pn = fn->parent;
840 		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
841 			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
842 		else
843 			fn = pn;
844 		if (fn->fn_flags & RTN_RTINFO)
845 			return fn;
846 	}
847 }
848 
/* Simple (non-caching) policy lookup: find the fib6 node for the flow,
 * filter the leaf chain by device/source (and multipath when no oif is
 * given), and backtrack towards less-specific nodes while only the
 * null entry matches.  Takes tb6_lock itself; returns a route with its
 * use count bumped via dst_use().
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = fn->leaf;
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	dst_use(&rt->dst, jiffies);
	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;

}
876 
/* Public entry point for a plain (non-caching) IPv6 route lookup of
 * @fl6, dispatched through the policy-routing rules.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				    int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
883 
884 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
885 			    const struct in6_addr *saddr, int oif, int strict)
886 {
887 	struct flowi6 fl6 = {
888 		.flowi6_oif = oif,
889 		.daddr = *daddr,
890 	};
891 	struct dst_entry *dst;
892 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
893 
894 	if (saddr) {
895 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
896 		flags |= RT6_LOOKUP_F_HAS_SADDR;
897 	}
898 
899 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
900 	if (dst->error == 0)
901 		return (struct rt6_info *) dst;
902 
903 	dst_release(dst);
904 
905 	return NULL;
906 }
907 EXPORT_SYMBOL(rt6_lookup);
908 
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is freed.  In any case, if the caller does not hold a
 * reference, the route may be destroyed.
 */
914 
915 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
916 			struct mx6_config *mxc)
917 {
918 	int err;
919 	struct fib6_table *table;
920 
921 	table = rt->rt6i_table;
922 	write_lock_bh(&table->tb6_lock);
923 	err = fib6_add(&table->tb6_root, rt, info, mxc);
924 	write_unlock_bh(&table->tb6_lock);
925 
926 	return err;
927 }
928 
929 int ip6_ins_rt(struct rt6_info *rt)
930 {
931 	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
932 	struct mx6_config mxc = { .mx = NULL, };
933 
934 	return __ip6_ins_rt(rt, &info, &mxc);
935 }
936 
/* Create an RTF_CACHE host-route (/128) clone of @ort for destination
 * @daddr.  If @ort is itself a cache or per-CPU clone, clone from its
 * parent (dst.from) instead.  Returns NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);

	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* daddr equal to the (shorter) parent prefix marks the
		 * subnet anycast address
		 */
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
976 
977 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
978 {
979 	struct rt6_info *pcpu_rt;
980 
981 	pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
982 				  rt->dst.dev, rt->dst.flags);
983 
984 	if (!pcpu_rt)
985 		return NULL;
986 	ip6_rt_copy_init(pcpu_rt, rt);
987 	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
988 	pcpu_rt->rt6i_flags |= RTF_PCPU;
989 	return pcpu_rt;
990 }
991 
992 /* It should be called with read_lock_bh(&tb6_lock) acquired */
993 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
994 {
995 	struct rt6_info *pcpu_rt, **p;
996 
997 	p = this_cpu_ptr(rt->rt6i_pcpu);
998 	pcpu_rt = *p;
999 
1000 	if (pcpu_rt) {
1001 		dst_hold(&pcpu_rt->dst);
1002 		rt6_dst_from_metrics_check(pcpu_rt);
1003 	}
1004 	return pcpu_rt;
1005 }
1006 
/* Create and install a per-CPU clone of @rt in this CPU's slot, racing
 * via cmpxchg() against concurrent installers.  Returns a held route:
 * the clone, the clone another context installed first, @rt itself if
 * it left the tree meanwhile, or the null entry on allocation failure.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct fib6_table *table = rt->rt6i_table;
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	read_lock_bh(&table->tb6_lock);
	if (rt->rt6i_pcpu) {
		p = this_cpu_ptr(rt->rt6i_pcpu);
		prev = cmpxchg(p, NULL, pcpu_rt);
		if (prev) {
			/* If someone did it before us, return prev instead */
			dst_destroy(&pcpu_rt->dst);
			pcpu_rt = prev;
		}
	} else {
		/* rt has been removed from the fib6 tree
		 * before we have a chance to acquire the read_lock.
		 * In this case, don't bother to create a pcpu rt
		 * since rt is going away anyway.  The next
		 * dst_check() will trigger a re-lookup.
		 */
		dst_destroy(&pcpu_rt->dst);
		pcpu_rt = rt;
	}
	dst_hold(&pcpu_rt->dst);
	rt6_dst_from_metrics_check(pcpu_rt);
	read_unlock_bh(&table->tb6_lock);
	return pcpu_rt;
}
1044 
/* Core policy-routing lookup shared by the input and output paths.
 * Looks up @fl6 in @table, honouring multipath, reachability
 * strictness and backtracking, then returns one of:
 *  - the null entry or an existing RTF_CACHE entry,
 *  - a fresh uncached RTF_CACHE clone (FLOWI_FLAG_KNOWN_NH case),
 *  - a per-CPU clone of the matched fib6 route.
 * In every case the returned dst is held for the caller.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt;
	int strict = 0;

	/* when forwarding is globally off, require reachable routers */
	strict |= flags & RT6_LOOKUP_F_IFACE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}


	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);

		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt)
			rt6_uncached_list_add(uncached_rt);
		else
			uncached_rt = net->ipv6.ip6_null_entry;

		dst_hold(&uncached_rt->dst);

		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		rt->dst.lastuse = jiffies;
		rt->dst.__use++;
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (pcpu_rt) {
			read_unlock_bh(&table->tb6_lock);
		} else {
			/* We have to do the read_unlock first
			 * because rt6_make_pcpu_route() may trigger
			 * ip6_dst_gc() which will take the write_lock.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			pcpu_rt = rt6_make_pcpu_route(rt);
			dst_release(&rt->dst);
		}

		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
		return pcpu_rt;

	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1143 
/* fib6 lookup shim for the input path: resolve via the packet's
 * incoming interface (flowi6_iif).
 */
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}
1149 
1150 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1151 						struct net_device *dev,
1152 						struct flowi6 *fl6, int flags)
1153 {
1154 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1155 		flags |= RT6_LOOKUP_F_IFACE;
1156 
1157 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1158 }
1159 
/* Input-path entry point: build a flow key from the received packet's
 * IPv6 header and attach the looked-up dst to the skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	/* For packets received over a collect-metadata tunnel, key the
	 * lookup on the tunnel id as well.
	 */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}
1181 
/* fib6 lookup shim for the output path: resolve via the flow's
 * outgoing interface (flowi6_oif).
 */
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}
1187 
/* Output-path route lookup for locally generated traffic.
 * @sk may be NULL.  Never returns NULL: lookup failure is reported via
 * the error embedded in the returned (null-entry) dst.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	/* Link-local scoped destinations on an L3 master device are
	 * resolved by the l3mdev itself.
	 */
	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	/* Pin the lookup to an interface when the socket is bound to a
	 * device, the destination has link-local scope, or an oif was
	 * given without a source address.
	 */
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		/* honour the socket's source-address selection preferences */
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1216 
/* Clone @dst_orig into a "blackhole" route that silently discards all
 * traffic in both directions (used e.g. by xfrm_lookup when the real
 * route must not be exposed).  Consumes the caller's reference on
 * @dst_orig.  Returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		rt6_info_init(rt);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);
		rt->rt6i_idev = ort->rt6i_idev;
		if (rt->rt6i_idev)
			in6_dev_hold(rt->rt6i_idev);

		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif

		/* dst_free() only queues the dst for garbage collection; the
		 * reference installed by dst_alloc() above keeps it alive
		 * until the caller releases it.
		 */
		dst_free(new);
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
1251 
1252 /*
1253  *	Destination cache support functions
1254  */
1255 
1256 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1257 {
1258 	if (rt->dst.from &&
1259 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1260 		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1261 }
1262 
1263 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1264 {
1265 	if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1266 		return NULL;
1267 
1268 	if (rt6_check_expired(rt))
1269 		return NULL;
1270 
1271 	return &rt->dst;
1272 }
1273 
1274 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1275 {
1276 	if (!__rt6_check_expired(rt) &&
1277 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1278 	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1279 		return &rt->dst;
1280 	else
1281 		return NULL;
1282 }
1283 
1284 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1285 {
1286 	struct rt6_info *rt;
1287 
1288 	rt = (struct rt6_info *) dst;
1289 
1290 	/* All IPV6 dsts are created with ->obsolete set to the value
1291 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1292 	 * into this function always.
1293 	 */
1294 
1295 	rt6_dst_from_metrics_check(rt);
1296 
1297 	if (rt->rt6i_flags & RTF_PCPU ||
1298 	    (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1299 		return rt6_dst_from_check(rt, cookie);
1300 	else
1301 		return rt6_check(rt, cookie);
1302 }
1303 
/* dst_ops->negative_advice: a socket suspects its cached route is bad.
 * Expired RTF_CACHE clones are deleted from the tree; any other route
 * is simply released so the socket performs a fresh lookup.
 * Returns the dst to keep, or NULL to drop it.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				/* ip6_del_rt() consumes the caller's ref */
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
1321 
/* dst_ops->link_failure: the nexthop for this dst became unreachable.
 * Report unreachability to the sender and invalidate the route.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* take a ref for ip6_del_rt() to consume */
			dst_hold(&rt->dst);
			ip6_del_rt(rt);
		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
			/* Poison the node's serial so every dst cached
			 * under it fails its next rt6_check().
			 */
			rt->rt6i_node->fn_sernum = -1;
		}
	}
}
1338 
/* Record a new path MTU on @rt and (re)arm its expiry using the
 * ip6_rt_mtu_expires sysctl.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
1347 
1348 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1349 {
1350 	return !(rt->rt6i_flags & RTF_CACHE) &&
1351 		(rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
1352 }
1353 
/* Core PMTU update: apply @mtu to the route behind @dst.  When the
 * route is shared (a fib6 entry or pcpu copy), a private RTF_CACHE
 * clone is created instead so the reduced MTU affects only this
 * destination; @iph or @sk then supply the clone's addresses.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	/* local routes never need PMTU state */
	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	dst_confirm(dst);
	/* never go below the IPv6 minimum MTU; ignore non-shrinking updates */
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
	} else {
		const struct in6_addr *daddr, *saddr;
		struct rt6_info *nrt6;

		if (iph) {
			daddr = &iph->daddr;
			saddr = &iph->saddr;
		} else if (sk) {
			daddr = &sk->sk_v6_daddr;
			saddr = &inet6_sk(sk)->saddr;
		} else {
			/* no addresses available to key the clone on */
			return;
		}
		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);

			/* ip6_ins_rt(nrt6) will bump the
			 * rt6->rt6i_node->fn_sernum
			 * which will fail the next rt6_check() and
			 * invalidate the sk->sk_dst_cache.
			 */
			ip6_ins_rt(nrt6);
		}
	}
}
1395 
/* dst_ops->update_pmtu callback: forward to the core helper, passing
 * the IPv6 header of @skb when one is available.
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
1401 
/* Update the PMTU for the flow described by the offending packet in
 * @skb (skb->data is assumed to point at the embedded IPv6 header of
 * an ICMPv6 packet-too-big message).  @mtu is in network byte order.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1422 
/* Socket-aware PMTU update: apply the new MTU for @sk's flow, then, if
 * the socket's cached dst has been invalidated by the update, refresh
 * it immediately (connected non-v4-mapped datagram sockets) so the new
 * MTU becomes visible without waiting for the next send.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1441 
/* Handle redirects */

/* Extended flow key for redirect processing: carries the address of
 * the router that sent the redirect alongside the ordinary flowi6, so
 * it can be recovered inside the fib rule lookup callback.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
1447 
1448 static struct rt6_info *__ip6_route_redirect(struct net *net,
1449 					     struct fib6_table *table,
1450 					     struct flowi6 *fl6,
1451 					     int flags)
1452 {
1453 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1454 	struct rt6_info *rt;
1455 	struct fib6_node *fn;
1456 
1457 	/* Get the "current" route for this destination and
1458 	 * check if the redirect has come from approriate router.
1459 	 *
1460 	 * RFC 4861 specifies that redirects should only be
1461 	 * accepted if they come from the nexthop to the target.
1462 	 * Due to the way the routes are chosen, this notion
1463 	 * is a bit fuzzy and one might need to check all possible
1464 	 * routes.
1465 	 */
1466 
1467 	read_lock_bh(&table->tb6_lock);
1468 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1469 restart:
1470 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1471 		if (rt6_check_expired(rt))
1472 			continue;
1473 		if (rt->dst.error)
1474 			break;
1475 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1476 			continue;
1477 		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1478 			continue;
1479 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1480 			continue;
1481 		break;
1482 	}
1483 
1484 	if (!rt)
1485 		rt = net->ipv6.ip6_null_entry;
1486 	else if (rt->dst.error) {
1487 		rt = net->ipv6.ip6_null_entry;
1488 		goto out;
1489 	}
1490 
1491 	if (rt == net->ipv6.ip6_null_entry) {
1492 		fn = fib6_backtrack(fn, &fl6->saddr);
1493 		if (fn)
1494 			goto restart;
1495 	}
1496 
1497 out:
1498 	dst_hold(&rt->dst);
1499 
1500 	read_unlock_bh(&table->tb6_lock);
1501 
1502 	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1503 	return rt;
1504 };
1505 
1506 static struct dst_entry *ip6_route_redirect(struct net *net,
1507 					const struct flowi6 *fl6,
1508 					const struct in6_addr *gateway)
1509 {
1510 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1511 	struct ip6rd_flowi rdfl;
1512 
1513 	rdfl.fl6 = *fl6;
1514 	rdfl.gateway = *gateway;
1515 
1516 	return fib6_rule_lookup(net, &rdfl.fl6,
1517 				flags, __ip6_route_redirect);
1518 }
1519 
/* Process an ICMPv6 redirect: @skb is the redirect message, with
 * skb->data pointing at the embedded (offending) IPv6 header; the
 * redirecting router is the source of the outer header.
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);

	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
1539 
/* Variant of ip6_redirect() used when the redirect payload does not
 * carry a usable copy of the original packet: the flow is built from
 * the redirect message itself (msg->dest) and the outer header.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;

	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
1559 
/* Socket convenience wrapper: process a redirect using the socket's
 * bound device and mark.
 */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1565 
/* dst_ops->default_advmss: advertised TCP MSS for this route, derived
 * from the path MTU, clamped below by the ip6_rt_min_advmss sysctl and
 * above by the maximal non-jumbo payload.
 */
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}
1587 
/* dst_ops->mtu: effective MTU of the route.  A learned PMTU wins over
 * the RTAX_MTU metric, which wins over the device's IPv6 MTU; the
 * result is capped at IP6_MAX_MTU and reduced by any lightweight
 * tunnel encapsulation headroom.
 */
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	const struct rt6_info *rt = (const struct rt6_info *)dst;
	unsigned int mtu = rt->rt6i_pmtu;
	struct inet6_dev *idev;

	if (mtu)
		goto out;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	/* idev is RCU-protected; fall back to IPV6_MIN_MTU without one */
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
1614 
/* Singly linked list of dsts handed out by icmp6_dst_alloc() and
 * reaped by icmp6_dst_gc()/icmp6_clean_all(); protected by
 * icmp6_dst_lock.
 */
static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);
1617 
/* Allocate a throw-away dst for an outgoing ICMPv6/ND packet.  The dst
 * is never inserted into the fib; instead it is chained onto
 * icmp6_dst_gc_list and reaped by icmp6_dst_gc() once its refcount
 * drops to zero.  Returns ERR_PTR(-errno) on failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
	atomic_set(&rt->dst.__refcnt, 1);
	/* host route straight to the destination; no gateway involved */
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* chain onto the private GC list instead of the fib */
	spin_lock_bh(&icmp6_dst_lock);
	rt->dst.next = icmp6_dst_gc_list;
	icmp6_dst_gc_list = &rt->dst;
	spin_unlock_bh(&icmp6_dst_lock);

	fib6_force_start_gc(net);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
1657 
/* Reap unreferenced entries from icmp6_dst_gc_list.
 * Returns the number of entries still referenced (non-zero means more
 * GC passes are needed).
 */
int icmp6_dst_gc(void)
{
	struct dst_entry *dst, **pprev;
	int more = 0;

	spin_lock_bh(&icmp6_dst_lock);
	pprev = &icmp6_dst_gc_list;

	while ((dst = *pprev) != NULL) {
		if (!atomic_read(&dst->__refcnt)) {
			/* unlink and free entries nobody holds anymore */
			*pprev = dst->next;
			dst_free(dst);
		} else {
			pprev = &dst->next;
			++more;
		}
	}

	spin_unlock_bh(&icmp6_dst_lock);

	return more;
}
1680 
/* Walk icmp6_dst_gc_list and unlink/free every entry for which @func
 * returns true (used e.g. when a device is going away); dst_free()
 * defers actual destruction of entries that are still referenced.
 */
static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
			    void *arg)
{
	struct dst_entry *dst, **pprev;

	spin_lock_bh(&icmp6_dst_lock);
	pprev = &icmp6_dst_gc_list;
	while ((dst = *pprev) != NULL) {
		struct rt6_info *rt = (struct rt6_info *) dst;
		if (func(rt, arg)) {
			*pprev = dst->next;
			dst_free(dst);
		} else {
			pprev = &dst->next;
		}
	}
	spin_unlock_bh(&icmp6_dst_lock);
}
1699 
/* dst_ops->gc: called when the dst entry count crosses gc_thresh.
 * GC is rate-limited by ip6_rt_gc_min_interval unless we are already
 * over ip6_rt_max_size.  ip6_rt_gc_expire grows with each pass and
 * decays by the elasticity factor, making collection progressively
 * more aggressive under sustained pressure.  Returns non-zero while
 * still over the size limit.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
1724 
1725 static int ip6_convert_metrics(struct mx6_config *mxc,
1726 			       const struct fib6_config *cfg)
1727 {
1728 	bool ecn_ca = false;
1729 	struct nlattr *nla;
1730 	int remaining;
1731 	u32 *mp;
1732 
1733 	if (!cfg->fc_mx)
1734 		return 0;
1735 
1736 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1737 	if (unlikely(!mp))
1738 		return -ENOMEM;
1739 
1740 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1741 		int type = nla_type(nla);
1742 		u32 val;
1743 
1744 		if (!type)
1745 			continue;
1746 		if (unlikely(type > RTAX_MAX))
1747 			goto err;
1748 
1749 		if (type == RTAX_CC_ALGO) {
1750 			char tmp[TCP_CA_NAME_MAX];
1751 
1752 			nla_strlcpy(tmp, nla, sizeof(tmp));
1753 			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1754 			if (val == TCP_CA_UNSPEC)
1755 				goto err;
1756 		} else {
1757 			val = nla_get_u32(nla);
1758 		}
1759 		if (type == RTAX_HOPLIMIT && val > 255)
1760 			val = 255;
1761 		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1762 			goto err;
1763 
1764 		mp[type - 1] = val;
1765 		__set_bit(type - 1, mxc->mx_valid);
1766 	}
1767 
1768 	if (ecn_ca) {
1769 		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1770 		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1771 	}
1772 
1773 	mxc->mx = mp;
1774 	return 0;
1775  err:
1776 	kfree(mp);
1777 	return -EINVAL;
1778 }
1779 
/* Resolve the nexthop @gw_addr strictly within the table named in
 * @cfg (bypassing fib rules).  Returns a held route, or NULL when the
 * table does not exist or only the null entry matched — the caller
 * then falls back to a full rt6_lookup().
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;
	int flags = RT6_LOOKUP_F_IFACE;

	table = fib6_get_table(net, cfg->fc_table);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}
1810 
/* Build (but do not insert) a struct rt6_info from a netlink/ioctl
 * route request.  On success the returned route holds references on
 * its device and inet6_dev and the caller is expected to insert it
 * into the fib (see ip6_route_add()) or dst_free() it.
 * Returns ERR_PTR(-errno) on failure.
 */
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
		goto out;
#ifndef CONFIG_IPV6_SUBTREES
	/* source-prefix routes need subtree support */
	if (cfg->fc_src_len)
		goto out;
#endif
	if (cfg->fc_ifindex) {
		/* resolve and hold the requested egress device */
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	/* addrconf-generated routes are not accounted against the GC limit */
	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* choose the input handler from the destination's scope */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	if (cfg->fc_encap) {
		/* lightweight tunnel encapsulation: let the lwtunnel
		 * state intercept input/output as required
		 */
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(dev, cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate);
		if (err)
			goto out;
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_output = rt->dst.output;
			rt->dst.output = lwtunnel_output;
		}
		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_input = rt->dst.input;
			rt->dst.input = lwtunnel_input;
		}
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		/* pick the discard handlers and error code per route type */
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_out;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		case RTN_UNREACHABLE:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0))
			goto out;

		rt->rt6i_gateway = *gw_addr;

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt = NULL;

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
			if (!(gwa_type & IPV6_ADDR_UNICAST))
				goto out;

			if (cfg->fc_table)
				grt = ip6_nh_lookup_table(net, cfg, gw_addr);

			if (!grt)
				grt = rt6_lookup(net, gw_addr, NULL,
						 cfg->fc_ifindex, 1);

			err = -EHOSTUNREACH;
			if (!grt)
				goto out;
			if (dev) {
				/* gateway must be reachable via the given device */
				if (dev != grt->dst.dev) {
					ip6_rt_put(grt);
					goto out;
				}
			} else {
				/* adopt the device the gateway resolves through */
				dev = grt->dst.dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
			if (!(grt->rt6i_flags & RTF_GATEWAY))
				err = 0;
			ip6_rt_put(grt);

			if (err)
				goto out;
		}
		err = -EINVAL;
		if (!dev || (dev->flags & IFF_LOOPBACK))
			goto out;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		/* the preferred source must be an address on the device */
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			err = -EINVAL;
			goto out;
		}
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return rt;
out:
	/* error path: drop whatever we hold */
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_free(&rt->dst);

	return ERR_PTR(err);
}
2061 
/* Create a route from @cfg and insert it into the fib, together with
 * any metrics carried in the request.  Returns 0 on success; on
 * failure the partially built route is freed.
 */
int ip6_route_add(struct fib6_config *cfg)
{
	struct mx6_config mxc = { .mx = NULL, };
	struct rt6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto out;
	}

	err = ip6_convert_metrics(&mxc, cfg);
	if (err)
		goto out;

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);

	/* the metrics array was copied by the insert; free our copy */
	kfree(mxc.mx);

	return err;
out:
	if (rt)
		dst_free(&rt->dst);

	return err;
}
2090 
/* Unlink @rt from its fib6 table.  Always consumes the caller's
 * reference on @rt.  The null entry and uncached (DST_NOCACHE) routes
 * are refused with -ENOENT.
 */
static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
{
	int err;
	struct fib6_table *table;
	struct net *net = dev_net(rt->dst.dev);

	if (rt == net->ipv6.ip6_null_entry ||
	    rt->dst.flags & DST_NOCACHE) {
		err = -ENOENT;
		goto out;
	}

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	write_unlock_bh(&table->tb6_lock);

out:
	ip6_rt_put(rt);
	return err;
}
2112 
/* Delete @rt with default netlink notification info; consumes the
 * caller's reference.
 */
int ip6_del_rt(struct rt6_info *rt)
{
	struct nl_info info = {
		.nl_net = dev_net(rt->dst.dev),
	};
	return __ip6_del_rt(rt, &info);
}
2120 
/* Delete the first route matching @cfg (prefix, plus optional device,
 * gateway and metric constraints).  RTF_CACHE clones are considered
 * only when the request explicitly targets them.
 * Returns -ESRCH when nothing matched.
 */
static int ip6_route_del(struct fib6_config *cfg)
{
	struct fib6_table *table;
	struct fib6_node *fn;
	struct rt6_info *rt;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table)
		return err;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len);

	if (fn) {
		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
			if ((rt->rt6i_flags & RTF_CACHE) &&
			    !(cfg->fc_flags & RTF_CACHE))
				continue;
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			/* hold a ref for __ip6_del_rt() to consume, and
			 * drop the read lock before taking the write lock
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);

			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
		}
	}
	read_unlock_bh(&table->tb6_lock);

	return err;
}
2162 
/* Process an ICMPv6 redirect against the route behind @dst: validate
 * the message (RFC 4861 section 8), update the neighbour cache entry
 * for the new first hop, install an RTF_CACHE clone pointing at it,
 * and remove any stale cached route it supersedes.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* length of the ND options trailing the fixed redirect header */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* target == dest means the destination itself is on-link */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* routers must ignore redirects; hosts may opt out */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->dst);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	/* install a cache entry steering msg->dest via the new nexthop */
	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	if (ip6_ins_rt(nrt))
		goto out;

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

	/* the old cache entry is now superseded; drop it */
	if (rt->rt6i_flags & RTF_CACHE) {
		rt = (struct rt6_info *) dst_clone(&rt->dst);
		ip6_del_rt(rt);
	}

out:
	neigh_release(neigh);
}
2278 
2279 /*
2280  *	Misc support functions
2281  */
2282 
/* Chain @rt to its origin route @from so that @rt shares @from's metrics.
 *
 * Takes a dst reference on @from for the lifetime of the link, and clears
 * RTF_EXPIRES on @rt (a route chained to an origin does not carry its own
 * expiry).  @from must itself be an origin, never an already-chained route.
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	/* Chaining to a chained route would create a chain of chains. */
	BUG_ON(from->dst.from);

	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	/* Share the origin's metrics array (read-only), do not copy it. */
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
2292 
/* Initialize @rt as a copy of @ort: duplicate the routing state, take the
 * references the copy needs (idev, origin chain, lwtunnel state), and
 * inherit @ort's metrics via rt6_set_from().
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	/* Link the copy to @ort; also clears RTF_EXPIRES set just above. */
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	/* lwtstate is refcounted; lwtstate_get() takes our reference. */
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
2314 
2315 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA Route Information route (RTF_ROUTEINFO|RTF_GATEWAY) in
 * RT6_TABLE_INFO matching @prefix/@prefixlen, @gwaddr and @ifindex.
 *
 * Returns the route with a dst reference held, or NULL if the table or a
 * matching entry does not exist.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex)
{
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, RT6_TABLE_INFO);
	if (!table)
		return NULL;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
	if (!fn)
		goto out;

	/* Walk all routes on the node; keep the one matching dev+flags+gw. */
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->dst.dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		/* Take the reference while still under tb6_lock. */
		dst_hold(&rt->dst);
		break;
	}
out:
	read_unlock_bh(&table->tb6_lock);
	return rt;
}
2347 
2348 static struct rt6_info *rt6_add_route_info(struct net *net,
2349 					   const struct in6_addr *prefix, int prefixlen,
2350 					   const struct in6_addr *gwaddr, int ifindex,
2351 					   unsigned int pref)
2352 {
2353 	struct fib6_config cfg = {
2354 		.fc_metric	= IP6_RT_PRIO_USER,
2355 		.fc_ifindex	= ifindex,
2356 		.fc_dst_len	= prefixlen,
2357 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2358 				  RTF_UP | RTF_PREF(pref),
2359 		.fc_nlinfo.portid = 0,
2360 		.fc_nlinfo.nlh = NULL,
2361 		.fc_nlinfo.nl_net = net,
2362 	};
2363 
2364 	cfg.fc_table = l3mdev_fib_table_by_index(net, ifindex) ? : RT6_TABLE_INFO;
2365 	cfg.fc_dst = *prefix;
2366 	cfg.fc_gateway = *gwaddr;
2367 
2368 	/* We should treat it as a default route if prefix length is 0. */
2369 	if (!prefixlen)
2370 		cfg.fc_flags |= RTF_DEFAULT;
2371 
2372 	ip6_route_add(&cfg);
2373 
2374 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2375 }
2376 #endif
2377 
2378 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2379 {
2380 	struct rt6_info *rt;
2381 	struct fib6_table *table;
2382 
2383 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2384 	if (!table)
2385 		return NULL;
2386 
2387 	read_lock_bh(&table->tb6_lock);
2388 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2389 		if (dev == rt->dst.dev &&
2390 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2391 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
2392 			break;
2393 	}
2394 	if (rt)
2395 		dst_hold(&rt->dst);
2396 	read_unlock_bh(&table->tb6_lock);
2397 	return rt;
2398 }
2399 
2400 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2401 				     struct net_device *dev,
2402 				     unsigned int pref)
2403 {
2404 	struct fib6_config cfg = {
2405 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2406 		.fc_metric	= IP6_RT_PRIO_USER,
2407 		.fc_ifindex	= dev->ifindex,
2408 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2409 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2410 		.fc_nlinfo.portid = 0,
2411 		.fc_nlinfo.nlh = NULL,
2412 		.fc_nlinfo.nl_net = dev_net(dev),
2413 	};
2414 
2415 	cfg.fc_gateway = *gwaddr;
2416 
2417 	ip6_route_add(&cfg);
2418 
2419 	return rt6_get_dflt_router(gwaddr, dev);
2420 }
2421 
/* Delete every RA-learned (RTF_DEFAULT/RTF_ADDRCONF) route from the
 * default table, except on interfaces with accept_ra == 2 (which are
 * configured to keep accepting RAs even as a router).
 */
void rt6_purge_dflt_routers(struct net *net)
{
	struct rt6_info *rt;
	struct fib6_table *table;

	/* NOTE: Keep consistent with rt6_get_dflt_router */
	table = fib6_get_table(net, RT6_TABLE_DFLT);
	if (!table)
		return;

restart:
	read_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* ip6_del_rt() takes the table lock itself, so pin
			 * the route, drop our read lock, delete, and rescan
			 * the list from the start.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			ip6_del_rt(rt);
			goto restart;
		}
	}
	read_unlock_bh(&table->tb6_lock);
}
2445 
2446 static void rtmsg_to_fib6_config(struct net *net,
2447 				 struct in6_rtmsg *rtmsg,
2448 				 struct fib6_config *cfg)
2449 {
2450 	memset(cfg, 0, sizeof(*cfg));
2451 
2452 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2453 			 : RT6_TABLE_MAIN;
2454 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2455 	cfg->fc_metric = rtmsg->rtmsg_metric;
2456 	cfg->fc_expires = rtmsg->rtmsg_info;
2457 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2458 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
2459 	cfg->fc_flags = rtmsg->rtmsg_flags;
2460 
2461 	cfg->fc_nlinfo.nl_net = net;
2462 
2463 	cfg->fc_dst = rtmsg->rtmsg_dst;
2464 	cfg->fc_src = rtmsg->rtmsg_src;
2465 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
2466 }
2467 
2468 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2469 {
2470 	struct fib6_config cfg;
2471 	struct in6_rtmsg rtmsg;
2472 	int err;
2473 
2474 	switch (cmd) {
2475 	case SIOCADDRT:		/* Add a route */
2476 	case SIOCDELRT:		/* Delete a route */
2477 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2478 			return -EPERM;
2479 		err = copy_from_user(&rtmsg, arg,
2480 				     sizeof(struct in6_rtmsg));
2481 		if (err)
2482 			return -EFAULT;
2483 
2484 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2485 
2486 		rtnl_lock();
2487 		switch (cmd) {
2488 		case SIOCADDRT:
2489 			err = ip6_route_add(&cfg);
2490 			break;
2491 		case SIOCDELRT:
2492 			err = ip6_route_del(&cfg);
2493 			break;
2494 		default:
2495 			err = -EINVAL;
2496 		}
2497 		rtnl_unlock();
2498 
2499 		return err;
2500 	}
2501 
2502 	return -EINVAL;
2503 }
2504 
2505 /*
2506  *	Drop the packet on the floor
2507  */
2508 
/* Drop @skb, bump the appropriate SNMP counter, and send an ICMPv6
 * Destination Unreachable with the given @code back to the sender.
 *
 * @ipstats_mib_noroutes selects IPSTATS_MIB_INNOROUTES (input path) or
 * IPSTATS_MIB_OUTNOROUTES (output path).  Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		/* Input to the unspecified address is an address error,
		 * not a routing failure.
		 */
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
2531 
/* dst.input handler for blackhole routes: drop with "no route". */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
2536 
/* dst.output handler for blackhole routes: drop with "no route". */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	/* Point skb->dev at the route's device for the ICMP error path. */
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
2542 
/* dst.input handler for prohibit routes: drop as administratively denied. */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
2547 
/* dst.output handler for prohibit routes: drop as administratively denied. */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	/* Point skb->dev at the route's device for the ICMP error path. */
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
2553 
2554 /*
2555  *	Allocate a dst for local (unicast / anycast) address.
2556  */
2557 
/* Allocate a host route (/128) for a local unicast or anycast address.
 *
 * Returns the new route with one dst reference, or ERR_PTR(-ENOMEM).
 * The route is not yet inserted into a table; the caller does that.
 * Takes a reference on @idev for rt->rt6i_idev.
 */
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast)
{
	u32 tb_id;
	struct net *net = dev_net(idev->dev);
	struct net_device *dev = net->loopback_dev;
	struct rt6_info *rt;

	/* use L3 Master device as loopback for host routes if device
	 * is enslaved and address is not link local or multicast
	 */
	if (!rt6_need_strict(addr))
		dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;

	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	in6_dev_hold(idev);

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;

	/* Local addresses have no nexthop. */
	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;

	rt->rt6i_gateway  = *addr;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	/* Enslaved devices put their host routes in the l3mdev table. */
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);
	rt->dst.flags |= DST_NOCACHE;

	/* The caller's reference. */
	atomic_set(&rt->dst.__refcnt, 1);

	return rt;
}
2601 
/* Argument bundle for fib6_remove_prefsrc(): remove a deleted address
 * from the preferred-source field of matching routes.
 */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL = any */
	struct net *net;	/* netns being walked */
	struct in6_addr *addr;	/* the address being removed */
};
2608 
2609 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2610 {
2611 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2612 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2613 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2614 
2615 	if (((void *)rt->dst.dev == dev || !dev) &&
2616 	    rt != net->ipv6.ip6_null_entry &&
2617 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2618 		/* remove prefsrc entry */
2619 		rt->rt6i_prefsrc.plen = 0;
2620 	}
2621 	return 0;
2622 }
2623 
/* Strip @ifp's address from the preferred-source field of every route on
 * its device (called when the address is removed).
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
2634 
2635 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2636 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
2637 
2638 /* Remove routers and update dst entries when gateway turn into host. */
2639 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2640 {
2641 	struct in6_addr *gateway = (struct in6_addr *)arg;
2642 
2643 	if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2644 	     ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2645 	     ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2646 		return -1;
2647 	}
2648 	return 0;
2649 }
2650 
/* Purge routes via @gateway after it has turned from router into host. */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
2655 
/* Argument bundle for fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL = flush all */
	struct net *net;	/* netns being walked */
};
2660 
2661 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2662 {
2663 	const struct arg_dev_net *adn = arg;
2664 	const struct net_device *dev = adn->dev;
2665 
2666 	if ((rt->dst.dev == dev || !dev) &&
2667 	    rt != adn->net->ipv6.ip6_null_entry)
2668 		return -1;
2669 
2670 	return 0;
2671 }
2672 
/* Remove all routes referencing @dev (or all routes if @dev is NULL) when
 * the device goes down, including ICMP rate-limit dsts and uncached
 * per-cpu routes.
 */
void rt6_ifdown(struct net *net, struct net_device *dev)
{
	struct arg_dev_net adn = {
		.dev = dev,
		.net = net,
	};

	fib6_clean_all(net, fib6_ifdown, &adn);
	icmp6_clean_all(fib6_ifdown, &adn);
	if (dev)
		rt6_uncached_list_flush_dev(net, dev);
}
2685 
/* Argument bundle for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
2690 
/* fib6_clean_all() callback: propagate a device MTU change to the routes
 * on that device.  Always returns 0 (never deletes routes).
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discovery.
	 */
	if (rt->dst.dev == arg->dev &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* For RTF_CACHE with rt6i_pmtu == 0
			 * (i.e. a redirected route),
			 * the metrics of its rt->dst.from has already
			 * been updated.
			 */
			if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
				rt->rt6i_pmtu = arg->mtu;
		} else if (dst_mtu(&rt->dst) >= arg->mtu ||
			   (dst_mtu(&rt->dst) < arg->mtu &&
			    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
	}
	return 0;
}
2738 
/* Walk the FIB and update route MTUs after @dev's MTU changed to @mtu. */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
2748 
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE.
 * Attributes without an entry are accepted without type/length checks.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
};
2761 
/* Parse an RTM_NEWROUTE/RTM_DELROUTE request into @cfg.
 *
 * Returns 0 on success, or a negative errno on malformed attributes.
 * Pointer attributes (fc_mx, fc_mp, fc_encap) alias the request skb and
 * are only valid while it is.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* All reject-style route types share RTF_REJECT; the specific
	 * type is kept in fc_type.
	 */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* Only rtm_dst_len bits need be present in the attribute. */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
	}

	if (tb[RTA_PREF]) {
		/* Unknown router preference values fall back to medium. */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE])
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
2873 
/* One parsed nexthop of a multipath route request. */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route built for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop config (for del on error) */
	struct mx6_config mxc;		/* converted metrics */
	struct list_head next;		/* link in rt6_nh_list */
};
2880 
/* Warn about every nexthop of a partially-applied multipath replace, so
 * the admin can check what actually got installed.
 */
static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
{
	struct rt6_nh *nh;

	list_for_each_entry(nh, rt6_nh_list, next) {
		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
		        nh->r_cfg.fc_ifindex);
	}
}
2891 
2892 static int ip6_route_info_append(struct list_head *rt6_nh_list,
2893 				 struct rt6_info *rt, struct fib6_config *r_cfg)
2894 {
2895 	struct rt6_nh *nh;
2896 	struct rt6_info *rtnh;
2897 	int err = -EEXIST;
2898 
2899 	list_for_each_entry(nh, rt6_nh_list, next) {
2900 		/* check if rt6_info already exists */
2901 		rtnh = nh->rt6_info;
2902 
2903 		if (rtnh->dst.dev == rt->dst.dev &&
2904 		    rtnh->rt6i_idev == rt->rt6i_idev &&
2905 		    ipv6_addr_equal(&rtnh->rt6i_gateway,
2906 				    &rt->rt6i_gateway))
2907 			return err;
2908 	}
2909 
2910 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
2911 	if (!nh)
2912 		return -ENOMEM;
2913 	nh->rt6_info = rt;
2914 	err = ip6_convert_metrics(&nh->mxc, r_cfg);
2915 	if (err) {
2916 		kfree(nh);
2917 		return err;
2918 	}
2919 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
2920 	list_add_tail(&nh->next, rt6_nh_list);
2921 
2922 	return 0;
2923 }
2924 
/* Add a multipath route: parse each rtnexthop in cfg->fc_mp into an
 * rt6_info, then insert them one by one.  If any insert fails, the
 * nexthops already installed by this request are deleted again.
 *
 * Returns 0 on success or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		rt = ip6_route_info_create(&r_cfg);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
		if (err) {
			/* Not queued; free it here, cleanup won't see it. */
			dst_free(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		/* NOTE(review): dereferences cfg->fc_nlinfo.nlh without a
		 * NULL check; apparently all callers reach here via netlink
		 * where nlh is set — confirm before reusing internally.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	goto cleanup;

add_errout:
	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			dst_free(&nh->rt6_info->dst);
		kfree(nh->mxc.mx);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
3027 
/* Delete a multipath route: attempt to delete every nexthop described in
 * cfg->fc_mp, continuing past failures.
 *
 * Returns 0 if all deletions succeeded, otherwise the last error seen.
 */
static int ip6_route_multipath_del(struct fib6_config *cfg)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}
3064 
3065 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3066 {
3067 	struct fib6_config cfg;
3068 	int err;
3069 
3070 	err = rtm_to_fib6_config(skb, nlh, &cfg);
3071 	if (err < 0)
3072 		return err;
3073 
3074 	if (cfg.fc_mp)
3075 		return ip6_route_multipath_del(&cfg);
3076 	else
3077 		return ip6_route_del(&cfg);
3078 }
3079 
3080 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3081 {
3082 	struct fib6_config cfg;
3083 	int err;
3084 
3085 	err = rtm_to_fib6_config(skb, nlh, &cfg);
3086 	if (err < 0)
3087 		return err;
3088 
3089 	if (cfg.fc_mp)
3090 		return ip6_route_multipath_add(&cfg);
3091 	else
3092 		return ip6_route_add(&cfg);
3093 }
3094 
/* Worst-case netlink message size for one route notification; keep in
 * sync with the attributes emitted by rt6_fill_node().
 */
static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
{
	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->dst.lwtstate);
}
3112 
/* Fill one RTM_*ROUTE netlink message for @rt into @skb.
 *
 * @dst/@src: resolved addresses for a getroute reply (NULL when dumping).
 * @iif:      input interface for a getroute reply, 0 otherwise.
 * @prefix:   when non-zero, emit only RTF_PREFIX_RT routes.
 * @nowait:   passed through to ip6mr_get_route() for multicast lookups.
 *
 * Returns 0 on success, 1 when the route was skipped (prefix filter),
 * -EMSGSIZE when @skb ran out of room (message is cancelled).
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 int prefix, int nowait, unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	if (prefix) {	/* user wants prefix routes only */
		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	/* Map reject routes back to the route type that created them,
	 * based on the dst error they were installed with.
	 */
	if (rt->rt6i_flags & RTF_REJECT) {
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	if (!netif_carrier_ok(rt->dst.dev)) {
		rtm->rtm_flags |= RTNH_F_LINKDOWN;
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			rtm->rtm_flags |= RTNH_F_DEAD;
	}
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;
	if (rt->rt6i_flags & RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (rt->rt6i_flags & RTF_ADDRCONF) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
			rtm->rtm_protocol = RTPROT_RA;
		else
			rtm->rtm_protocol = RTPROT_KERNEL;
	}

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* For a getroute reply report the queried (full) destination,
	 * otherwise the route's own prefix.
	 */
	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* Multicast destinations are resolved via the mroute
		 * cache instead of reporting the input interface.
		 */
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* A cached PMTU overrides the route's MTU metric. */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;

	lwtunnel_fill_encap(skb, rt->dst.lwtstate);

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
3272 
3273 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3274 {
3275 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3276 	int prefix;
3277 
3278 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3279 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3280 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
3281 	} else
3282 		prefix = 0;
3283 
3284 	return rt6_fill_node(arg->net,
3285 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3286 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3287 		     prefix, 0, NLM_F_MULTI);
3288 }
3289 
/* RTM_GETROUTE handler: perform a route lookup for the src/dst/iif/
 * oif/mark carried in the request and unicast a single RTM_NEWROUTE
 * reply back to the requesting socket.
 *
 * Returns 0 on success or a negative errno.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	int err, iif = 0, oif = 0;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);

	if (tb[RTA_SRC]) {
		/* Undersized address attributes are rejected; oversized
		 * payloads are tolerated, only the first 16 bytes are used.
		 */
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (iif) {
		/* Input-side lookup: emulate reception of the packet on
		 * interface @iif.
		 */
		struct net_device *dev;
		int flags = 0;

		dev = __dev_get_by_index(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		/* NOTE(review): rt is presumably never NULL here — lookup
		 * failures appear to hand back the netns null entry whose
		 * dst.error is reported by rt6_fill_node(); confirm before
		 * relying on it.
		 */
		rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
							       flags);
	} else {
		/* Output-side lookup for locally generated traffic. */
		fl6.flowi6_oif = oif;

		rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));

	/* The skb takes over the route reference; the kfree_skb() on the
	 * error path below releases it as well.
	 */
	skb_dst_set(skb, &rt->dst);

	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
			    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
			    nlh->nlmsg_seq, 0, 0, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
3382 
3383 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3384 		     unsigned int nlm_flags)
3385 {
3386 	struct sk_buff *skb;
3387 	struct net *net = info->nl_net;
3388 	u32 seq;
3389 	int err;
3390 
3391 	err = -ENOBUFS;
3392 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3393 
3394 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3395 	if (!skb)
3396 		goto errout;
3397 
3398 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3399 				event, info->portid, seq, 0, 0, nlm_flags);
3400 	if (err < 0) {
3401 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3402 		WARN_ON(err == -EMSGSIZE);
3403 		kfree_skb(skb);
3404 		goto errout;
3405 	}
3406 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3407 		    info->nlh, gfp_any());
3408 	return;
3409 errout:
3410 	if (err < 0)
3411 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3412 }
3413 
3414 static int ip6_route_dev_notify(struct notifier_block *this,
3415 				unsigned long event, void *ptr)
3416 {
3417 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3418 	struct net *net = dev_net(dev);
3419 
3420 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3421 		net->ipv6.ip6_null_entry->dst.dev = dev;
3422 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3423 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3424 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3425 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3426 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3427 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3428 #endif
3429 	}
3430 
3431 	return NOTIFY_OK;
3432 }
3433 
3434 /*
3435  *	/proc
3436  */
3437 
3438 #ifdef CONFIG_PROC_FS
3439 
/* /proc/net/ipv6_route: seq_file dump of the routing table.  The open
 * handler (ipv6_route_open) is defined earlier in this file.
 */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
3447 
3448 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3449 {
3450 	struct net *net = (struct net *)seq->private;
3451 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3452 		   net->ipv6.rt6_stats->fib_nodes,
3453 		   net->ipv6.rt6_stats->fib_route_nodes,
3454 		   net->ipv6.rt6_stats->fib_rt_alloc,
3455 		   net->ipv6.rt6_stats->fib_rt_entries,
3456 		   net->ipv6.rt6_stats->fib_rt_cache,
3457 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3458 		   net->ipv6.rt6_stats->fib_discarded_routes);
3459 
3460 	return 0;
3461 }
3462 
/* open() for /proc/net/rt6_stats: single-record seq_file bound to the
 * owning network namespace.
 */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
3467 
/* /proc/net/rt6_stats file operations (see rt6_stats_seq_show()). */
static const struct file_operations rt6_stats_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
3475 #endif	/* CONFIG_PROC_FS */
3476 
3477 #ifdef CONFIG_SYSCTL
3478 
3479 static
3480 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3481 			      void __user *buffer, size_t *lenp, loff_t *ppos)
3482 {
3483 	struct net *net;
3484 	int delay;
3485 	if (!write)
3486 		return -EINVAL;
3487 
3488 	net = (struct net *)ctl->extra1;
3489 	delay = net->ipv6.sysctl.flush_delay;
3490 	proc_dointvec(ctl, write, buffer, lenp, ppos);
3491 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3492 	return 0;
3493 }
3494 
/* Template for the per-namespace net.ipv6.route.* sysctl table.  The
 * .data pointers reference init_net here; ipv6_route_sysctl_init()
 * duplicates the table and re-points each entry at the corresponding
 * field of the new namespace.  Entry order must stay in sync with the
 * table[N].data assignments there.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* Millisecond view of gc_min_interval (same backing field). */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }	/* sentinel */
};
3568 
/* Allocate a per-namespace copy of ipv6_route_table_template[] with
 * each entry's .data re-pointed at the matching field of @net.
 *
 * Returns the table (caller registers and eventually frees it) or
 * NULL on allocation failure.  The indices below must stay in sync
 * with the template's entry order.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;	/* flush handler needs the netns */
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;	/* hides only "flush" */
	}

	return table;
}
3597 #endif
3598 
/* Per-namespace setup: clone the dst_ops template, allocate the special
 * null (and, with multiple tables, prohibit/blackhole) route templates,
 * and seed the routing sysctl defaults.  Failure unwinds with the
 * classic goto chain in reverse allocation order.
 *
 * Returns 0 on success or -ENOMEM.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* ip6_null_entry: the dst handed out when no route matches. */
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default values for the net.ipv6.route.* sysctls. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
3670 
/* Per-namespace teardown: free the special route entries allocated by
 * ip6_route_net_init() and release the dst-entry accounting.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
3680 
3681 static int __net_init ip6_route_net_init_late(struct net *net)
3682 {
3683 #ifdef CONFIG_PROC_FS
3684 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3685 	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3686 #endif
3687 	return 0;
3688 }
3689 
/* Late per-netns teardown: remove the /proc/net files registered by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
3697 
/* Main per-netns lifecycle hooks for the IPv6 routing core. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
3702 
3703 static int __net_init ipv6_inetpeer_init(struct net *net)
3704 {
3705 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3706 
3707 	if (!bp)
3708 		return -ENOMEM;
3709 	inet_peer_base_init(bp);
3710 	net->ipv6.peers = bp;
3711 	return 0;
3712 }
3713 
3714 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3715 {
3716 	struct inet_peer_base *bp = net->ipv6.peers;
3717 
3718 	net->ipv6.peers = NULL;
3719 	inetpeer_invalidate_tree(bp);
3720 	kfree(bp);
3721 }
3722 
/* Per-netns lifecycle hooks for the IPv6 inetpeer storage. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
3727 
/* Late per-netns hooks (procfs registration) — run after the core ops. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
3732 
/* Netdevice notifier used to bind loopback to the special dst templates. */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = 0,
};
3737 
/* Subsystem init for IPv6 routing.  Ordering: slab cache -> blackhole
 * dst accounting -> pernet subsystems (inetpeer, route core) -> fib6 ->
 * xfrm6 -> fib6 rules -> late pernet (procfs) -> rtnetlink handlers ->
 * netdev notifier.  The error path unwinds in exact reverse order.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts share the regular rt6_info slab cache. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* NOTE(review): the per-cpu uncached lists are initialized last;
	 * presumably nothing allocates uncached dsts before this point —
	 * confirm, since the pernet subsystems are already registered above.
	 */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
3829 
/* Subsystem teardown: undo ip6_route_init() in reverse registration
 * order (notifier first, slab cache last).
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
3842