xref: /linux/net/ipv6/route.c (revision 83a37b3292f4aca799b355179ad6fbdd78a08e10)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67 
68 #include <linux/uaccess.h>
69 
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73 
74 enum rt6_nud_state {
75 	RT6_NUD_FAIL_HARD = -3,
76 	RT6_NUD_FAIL_PROBE = -2,
77 	RT6_NUD_FAIL_DO_RR = -1,
78 	RT6_NUD_SUCCEED = 1
79 };
80 
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void		ip6_dst_destroy(struct dst_entry *);
87 static void		ip6_dst_ifdown(struct dst_entry *,
88 				       struct net_device *dev, int how);
89 static int		 ip6_dst_gc(struct dst_ops *ops);
90 
91 static int		ip6_pkt_discard(struct sk_buff *skb);
92 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int		ip6_pkt_prohibit(struct sk_buff *skb);
94 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void		ip6_link_failure(struct sk_buff *skb);
96 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97 					   struct sk_buff *skb, u32 mtu);
98 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99 					struct sk_buff *skb);
100 static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104 			 struct sk_buff *skb, struct rt6_info *rt,
105 			 struct in6_addr *dst, struct in6_addr *src,
106 			 int iif, int type, u32 portid, u32 seq,
107 			 unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109 					   struct in6_addr *daddr,
110 					   struct in6_addr *saddr);
111 
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114 					   const struct in6_addr *prefix, int prefixlen,
115 					   const struct in6_addr *gwaddr,
116 					   struct net_device *dev,
117 					   unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119 					   const struct in6_addr *prefix, int prefixlen,
120 					   const struct in6_addr *gwaddr,
121 					   struct net_device *dev);
122 #endif
123 
124 struct uncached_list {
125 	spinlock_t		lock;
126 	struct list_head	head;
127 };
128 
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130 
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134 
135 	rt->rt6i_uncached_list = ul;
136 
137 	spin_lock_bh(&ul->lock);
138 	list_add_tail(&rt->rt6i_uncached, &ul->head);
139 	spin_unlock_bh(&ul->lock);
140 }
141 
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144 	if (!list_empty(&rt->rt6i_uncached)) {
145 		struct uncached_list *ul = rt->rt6i_uncached_list;
146 		struct net *net = dev_net(rt->dst.dev);
147 
148 		spin_lock_bh(&ul->lock);
149 		list_del(&rt->rt6i_uncached);
150 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151 		spin_unlock_bh(&ul->lock);
152 	}
153 }
154 
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157 	struct net_device *loopback_dev = net->loopback_dev;
158 	int cpu;
159 
160 	if (dev == loopback_dev)
161 		return;
162 
163 	for_each_possible_cpu(cpu) {
164 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165 		struct rt6_info *rt;
166 
167 		spin_lock_bh(&ul->lock);
168 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169 			struct inet6_dev *rt_idev = rt->rt6i_idev;
170 			struct net_device *rt_dev = rt->dst.dev;
171 
172 			if (rt_idev->dev == dev) {
173 				rt->rt6i_idev = in6_dev_get(loopback_dev);
174 				in6_dev_put(rt_idev);
175 			}
176 
177 			if (rt_dev == dev) {
178 				rt->dst.dev = loopback_dev;
179 				dev_hold(rt->dst.dev);
180 				dev_put(rt_dev);
181 			}
182 		}
183 		spin_unlock_bh(&ul->lock);
184 	}
185 }
186 
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189 	return dst_metrics_write_ptr(rt->dst.from);
190 }
191 
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194 	struct rt6_info *rt = (struct rt6_info *)dst;
195 
196 	if (rt->rt6i_flags & RTF_PCPU)
197 		return rt6_pcpu_cow_metrics(rt);
198 	else if (rt->rt6i_flags & RTF_CACHE)
199 		return NULL;
200 	else
201 		return dst_cow_metrics_generic(dst, old);
202 }
203 
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205 					     struct sk_buff *skb,
206 					     const void *daddr)
207 {
208 	struct in6_addr *p = &rt->rt6i_gateway;
209 
210 	if (!ipv6_addr_any(p))
211 		return (const void *) p;
212 	else if (skb)
213 		return &ipv6_hdr(skb)->daddr;
214 	return daddr;
215 }
216 
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218 					  struct sk_buff *skb,
219 					  const void *daddr)
220 {
221 	struct rt6_info *rt = (struct rt6_info *) dst;
222 	struct neighbour *n;
223 
224 	daddr = choose_neigh_daddr(rt, skb, daddr);
225 	n = __ipv6_neigh_lookup(dst->dev, daddr);
226 	if (n)
227 		return n;
228 	return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230 
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233 	struct net_device *dev = dst->dev;
234 	struct rt6_info *rt = (struct rt6_info *)dst;
235 
236 	daddr = choose_neigh_daddr(rt, NULL, daddr);
237 	if (!daddr)
238 		return;
239 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240 		return;
241 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242 		return;
243 	__ipv6_confirm_neigh(dev, daddr);
244 }
245 
246 static struct dst_ops ip6_dst_ops_template = {
247 	.family			=	AF_INET6,
248 	.gc			=	ip6_dst_gc,
249 	.gc_thresh		=	1024,
250 	.check			=	ip6_dst_check,
251 	.default_advmss		=	ip6_default_advmss,
252 	.mtu			=	ip6_mtu,
253 	.cow_metrics		=	ipv6_cow_metrics,
254 	.destroy		=	ip6_dst_destroy,
255 	.ifdown			=	ip6_dst_ifdown,
256 	.negative_advice	=	ip6_negative_advice,
257 	.link_failure		=	ip6_link_failure,
258 	.update_pmtu		=	ip6_rt_update_pmtu,
259 	.redirect		=	rt6_do_redirect,
260 	.local_out		=	__ip6_local_out,
261 	.neigh_lookup		=	ip6_neigh_lookup,
262 	.confirm_neigh		=	ip6_confirm_neigh,
263 };
264 
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268 
269 	return mtu ? : dst->dev->mtu;
270 }
271 
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273 					 struct sk_buff *skb, u32 mtu)
274 {
275 }
276 
277 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
278 				      struct sk_buff *skb)
279 {
280 }
281 
282 static struct dst_ops ip6_dst_blackhole_ops = {
283 	.family			=	AF_INET6,
284 	.destroy		=	ip6_dst_destroy,
285 	.check			=	ip6_dst_check,
286 	.mtu			=	ip6_blackhole_mtu,
287 	.default_advmss		=	ip6_default_advmss,
288 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
289 	.redirect		=	ip6_rt_blackhole_redirect,
290 	.cow_metrics		=	dst_cow_metrics_generic,
291 	.neigh_lookup		=	ip6_neigh_lookup,
292 };
293 
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295 	[RTAX_HOPLIMIT - 1] = 0,
296 };
297 
298 static const struct rt6_info ip6_null_entry_template = {
299 	.dst = {
300 		.__refcnt	= ATOMIC_INIT(1),
301 		.__use		= 1,
302 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
303 		.error		= -ENETUNREACH,
304 		.input		= ip6_pkt_discard,
305 		.output		= ip6_pkt_discard_out,
306 	},
307 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
308 	.rt6i_protocol  = RTPROT_KERNEL,
309 	.rt6i_metric	= ~(u32) 0,
310 	.rt6i_ref	= ATOMIC_INIT(1),
311 };
312 
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314 
315 static const struct rt6_info ip6_prohibit_entry_template = {
316 	.dst = {
317 		.__refcnt	= ATOMIC_INIT(1),
318 		.__use		= 1,
319 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
320 		.error		= -EACCES,
321 		.input		= ip6_pkt_prohibit,
322 		.output		= ip6_pkt_prohibit_out,
323 	},
324 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
325 	.rt6i_protocol  = RTPROT_KERNEL,
326 	.rt6i_metric	= ~(u32) 0,
327 	.rt6i_ref	= ATOMIC_INIT(1),
328 };
329 
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331 	.dst = {
332 		.__refcnt	= ATOMIC_INIT(1),
333 		.__use		= 1,
334 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
335 		.error		= -EINVAL,
336 		.input		= dst_discard,
337 		.output		= dst_discard_out,
338 	},
339 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
340 	.rt6i_protocol  = RTPROT_KERNEL,
341 	.rt6i_metric	= ~(u32) 0,
342 	.rt6i_ref	= ATOMIC_INIT(1),
343 };
344 
345 #endif
346 
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349 	struct dst_entry *dst = &rt->dst;
350 
351 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352 	INIT_LIST_HEAD(&rt->rt6i_siblings);
353 	INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355 
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358 					struct net_device *dev,
359 					int flags)
360 {
361 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362 					1, DST_OBSOLETE_FORCE_CHK, flags);
363 
364 	if (rt) {
365 		rt6_info_init(rt);
366 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367 	}
368 
369 	return rt;
370 }
371 
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373 			       struct net_device *dev,
374 			       int flags)
375 {
376 	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377 
378 	if (rt) {
379 		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380 		if (!rt->rt6i_pcpu) {
381 			dst_release_immediate(&rt->dst);
382 			return NULL;
383 		}
384 	}
385 
386 	return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389 
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392 	struct rt6_info *rt = (struct rt6_info *)dst;
393 	struct rt6_exception_bucket *bucket;
394 	struct dst_entry *from = dst->from;
395 	struct inet6_dev *idev;
396 
397 	dst_destroy_metrics_generic(dst);
398 	free_percpu(rt->rt6i_pcpu);
399 	rt6_uncached_list_del(rt);
400 
401 	idev = rt->rt6i_idev;
402 	if (idev) {
403 		rt->rt6i_idev = NULL;
404 		in6_dev_put(idev);
405 	}
406 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407 	if (bucket) {
408 		rt->rt6i_exception_bucket = NULL;
409 		kfree(bucket);
410 	}
411 
412 	dst->from = NULL;
413 	dst_release(from);
414 }
415 
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417 			   int how)
418 {
419 	struct rt6_info *rt = (struct rt6_info *)dst;
420 	struct inet6_dev *idev = rt->rt6i_idev;
421 	struct net_device *loopback_dev =
422 		dev_net(dev)->loopback_dev;
423 
424 	if (idev && idev->dev != loopback_dev) {
425 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426 		if (loopback_idev) {
427 			rt->rt6i_idev = loopback_idev;
428 			in6_dev_put(idev);
429 		}
430 	}
431 }
432 
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435 	if (rt->rt6i_flags & RTF_EXPIRES)
436 		return time_after(jiffies, rt->dst.expires);
437 	else
438 		return false;
439 }
440 
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443 	if (rt->rt6i_flags & RTF_EXPIRES) {
444 		if (time_after(jiffies, rt->dst.expires))
445 			return true;
446 	} else if (rt->dst.from) {
447 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448 		       rt6_check_expired((struct rt6_info *)rt->dst.from);
449 	}
450 	return false;
451 }
452 
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454 					     struct flowi6 *fl6, int oif,
455 					     int strict)
456 {
457 	struct rt6_info *sibling, *next_sibling;
458 	int route_choosen;
459 
460 	/* We might have already computed the hash for ICMPv6 errors. In such
461 	 * case it will always be non-zero. Otherwise now is the time to do it.
462 	 */
463 	if (!fl6->mp_hash)
464 		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
465 
466 	route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
467 	/* Don't change the route, if route_choosen == 0
468 	 * (siblings does not include ourself)
469 	 */
470 	if (route_choosen)
471 		list_for_each_entry_safe(sibling, next_sibling,
472 				&match->rt6i_siblings, rt6i_siblings) {
473 			route_choosen--;
474 			if (route_choosen == 0) {
475 				if (rt6_score_route(sibling, oif, strict) < 0)
476 					break;
477 				match = sibling;
478 				break;
479 			}
480 		}
481 	return match;
482 }
483 
484 /*
485  *	Route lookup. rcu_read_lock() should be held.
486  */
487 
488 static inline struct rt6_info *rt6_device_match(struct net *net,
489 						    struct rt6_info *rt,
490 						    const struct in6_addr *saddr,
491 						    int oif,
492 						    int flags)
493 {
494 	struct rt6_info *local = NULL;
495 	struct rt6_info *sprt;
496 
497 	if (!oif && ipv6_addr_any(saddr))
498 		goto out;
499 
500 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
501 		struct net_device *dev = sprt->dst.dev;
502 
503 		if (oif) {
504 			if (dev->ifindex == oif)
505 				return sprt;
506 			if (dev->flags & IFF_LOOPBACK) {
507 				if (!sprt->rt6i_idev ||
508 				    sprt->rt6i_idev->dev->ifindex != oif) {
509 					if (flags & RT6_LOOKUP_F_IFACE)
510 						continue;
511 					if (local &&
512 					    local->rt6i_idev->dev->ifindex == oif)
513 						continue;
514 				}
515 				local = sprt;
516 			}
517 		} else {
518 			if (ipv6_chk_addr(net, saddr, dev,
519 					  flags & RT6_LOOKUP_F_IFACE))
520 				return sprt;
521 		}
522 	}
523 
524 	if (oif) {
525 		if (local)
526 			return local;
527 
528 		if (flags & RT6_LOOKUP_F_IFACE)
529 			return net->ipv6.ip6_null_entry;
530 	}
531 out:
532 	return rt;
533 }
534 
535 #ifdef CONFIG_IPV6_ROUTER_PREF
536 struct __rt6_probe_work {
537 	struct work_struct work;
538 	struct in6_addr target;
539 	struct net_device *dev;
540 };
541 
542 static void rt6_probe_deferred(struct work_struct *w)
543 {
544 	struct in6_addr mcaddr;
545 	struct __rt6_probe_work *work =
546 		container_of(w, struct __rt6_probe_work, work);
547 
548 	addrconf_addr_solict_mult(&work->target, &mcaddr);
549 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
550 	dev_put(work->dev);
551 	kfree(work);
552 }
553 
554 static void rt6_probe(struct rt6_info *rt)
555 {
556 	struct __rt6_probe_work *work;
557 	struct neighbour *neigh;
558 	/*
559 	 * Okay, this does not seem to be appropriate
560 	 * for now, however, we need to check if it
561 	 * is really so; aka Router Reachability Probing.
562 	 *
563 	 * Router Reachability Probe MUST be rate-limited
564 	 * to no more than one per minute.
565 	 */
566 	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
567 		return;
568 	rcu_read_lock_bh();
569 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
570 	if (neigh) {
571 		if (neigh->nud_state & NUD_VALID)
572 			goto out;
573 
574 		work = NULL;
575 		write_lock(&neigh->lock);
576 		if (!(neigh->nud_state & NUD_VALID) &&
577 		    time_after(jiffies,
578 			       neigh->updated +
579 			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
580 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
581 			if (work)
582 				__neigh_set_probe_once(neigh);
583 		}
584 		write_unlock(&neigh->lock);
585 	} else {
586 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
587 	}
588 
589 	if (work) {
590 		INIT_WORK(&work->work, rt6_probe_deferred);
591 		work->target = rt->rt6i_gateway;
592 		dev_hold(rt->dst.dev);
593 		work->dev = rt->dst.dev;
594 		schedule_work(&work->work);
595 	}
596 
597 out:
598 	rcu_read_unlock_bh();
599 }
600 #else
601 static inline void rt6_probe(struct rt6_info *rt)
602 {
603 }
604 #endif
605 
606 /*
607  * Default Router Selection (RFC 2461 6.3.6)
608  */
609 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
610 {
611 	struct net_device *dev = rt->dst.dev;
612 	if (!oif || dev->ifindex == oif)
613 		return 2;
614 	if ((dev->flags & IFF_LOOPBACK) &&
615 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
616 		return 1;
617 	return 0;
618 }
619 
620 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
621 {
622 	struct neighbour *neigh;
623 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
624 
625 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
626 	    !(rt->rt6i_flags & RTF_GATEWAY))
627 		return RT6_NUD_SUCCEED;
628 
629 	rcu_read_lock_bh();
630 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
631 	if (neigh) {
632 		read_lock(&neigh->lock);
633 		if (neigh->nud_state & NUD_VALID)
634 			ret = RT6_NUD_SUCCEED;
635 #ifdef CONFIG_IPV6_ROUTER_PREF
636 		else if (!(neigh->nud_state & NUD_FAILED))
637 			ret = RT6_NUD_SUCCEED;
638 		else
639 			ret = RT6_NUD_FAIL_PROBE;
640 #endif
641 		read_unlock(&neigh->lock);
642 	} else {
643 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
644 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
645 	}
646 	rcu_read_unlock_bh();
647 
648 	return ret;
649 }
650 
651 static int rt6_score_route(struct rt6_info *rt, int oif,
652 			   int strict)
653 {
654 	int m;
655 
656 	m = rt6_check_dev(rt, oif);
657 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
658 		return RT6_NUD_FAIL_HARD;
659 #ifdef CONFIG_IPV6_ROUTER_PREF
660 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
661 #endif
662 	if (strict & RT6_LOOKUP_F_REACHABLE) {
663 		int n = rt6_check_neigh(rt);
664 		if (n < 0)
665 			return n;
666 	}
667 	return m;
668 }
669 
670 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
671 				   int *mpri, struct rt6_info *match,
672 				   bool *do_rr)
673 {
674 	int m;
675 	bool match_do_rr = false;
676 	struct inet6_dev *idev = rt->rt6i_idev;
677 	struct net_device *dev = rt->dst.dev;
678 
679 	if (dev && !netif_carrier_ok(dev) &&
680 	    idev->cnf.ignore_routes_with_linkdown &&
681 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
682 		goto out;
683 
684 	if (rt6_check_expired(rt))
685 		goto out;
686 
687 	m = rt6_score_route(rt, oif, strict);
688 	if (m == RT6_NUD_FAIL_DO_RR) {
689 		match_do_rr = true;
690 		m = 0; /* lowest valid score */
691 	} else if (m == RT6_NUD_FAIL_HARD) {
692 		goto out;
693 	}
694 
695 	if (strict & RT6_LOOKUP_F_REACHABLE)
696 		rt6_probe(rt);
697 
698 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
699 	if (m > *mpri) {
700 		*do_rr = match_do_rr;
701 		*mpri = m;
702 		match = rt;
703 	}
704 out:
705 	return match;
706 }
707 
708 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
709 				     struct rt6_info *leaf,
710 				     struct rt6_info *rr_head,
711 				     u32 metric, int oif, int strict,
712 				     bool *do_rr)
713 {
714 	struct rt6_info *rt, *match, *cont;
715 	int mpri = -1;
716 
717 	match = NULL;
718 	cont = NULL;
719 	for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
720 		if (rt->rt6i_metric != metric) {
721 			cont = rt;
722 			break;
723 		}
724 
725 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
726 	}
727 
728 	for (rt = leaf; rt && rt != rr_head;
729 	     rt = rcu_dereference(rt->dst.rt6_next)) {
730 		if (rt->rt6i_metric != metric) {
731 			cont = rt;
732 			break;
733 		}
734 
735 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
736 	}
737 
738 	if (match || !cont)
739 		return match;
740 
741 	for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
742 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
743 
744 	return match;
745 }
746 
747 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
748 				   int oif, int strict)
749 {
750 	struct rt6_info *leaf = rcu_dereference(fn->leaf);
751 	struct rt6_info *match, *rt0;
752 	bool do_rr = false;
753 	int key_plen;
754 
755 	if (!leaf)
756 		return net->ipv6.ip6_null_entry;
757 
758 	rt0 = rcu_dereference(fn->rr_ptr);
759 	if (!rt0)
760 		rt0 = leaf;
761 
762 	/* Double check to make sure fn is not an intermediate node
763 	 * and fn->leaf does not points to its child's leaf
764 	 * (This might happen if all routes under fn are deleted from
765 	 * the tree and fib6_repair_tree() is called on the node.)
766 	 */
767 	key_plen = rt0->rt6i_dst.plen;
768 #ifdef CONFIG_IPV6_SUBTREES
769 	if (rt0->rt6i_src.plen)
770 		key_plen = rt0->rt6i_src.plen;
771 #endif
772 	if (fn->fn_bit != key_plen)
773 		return net->ipv6.ip6_null_entry;
774 
775 	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
776 			     &do_rr);
777 
778 	if (do_rr) {
779 		struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);
780 
781 		/* no entries matched; do round-robin */
782 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
783 			next = leaf;
784 
785 		if (next != rt0) {
786 			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
787 			/* make sure next is not being deleted from the tree */
788 			if (next->rt6i_node)
789 				rcu_assign_pointer(fn->rr_ptr, next);
790 			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
791 		}
792 	}
793 
794 	return match ? match : net->ipv6.ip6_null_entry;
795 }
796 
797 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
798 {
799 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
800 }
801 
802 #ifdef CONFIG_IPV6_ROUTE_INFO
803 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
804 		  const struct in6_addr *gwaddr)
805 {
806 	struct net *net = dev_net(dev);
807 	struct route_info *rinfo = (struct route_info *) opt;
808 	struct in6_addr prefix_buf, *prefix;
809 	unsigned int pref;
810 	unsigned long lifetime;
811 	struct rt6_info *rt;
812 
813 	if (len < sizeof(struct route_info)) {
814 		return -EINVAL;
815 	}
816 
817 	/* Sanity check for prefix_len and length */
818 	if (rinfo->length > 3) {
819 		return -EINVAL;
820 	} else if (rinfo->prefix_len > 128) {
821 		return -EINVAL;
822 	} else if (rinfo->prefix_len > 64) {
823 		if (rinfo->length < 2) {
824 			return -EINVAL;
825 		}
826 	} else if (rinfo->prefix_len > 0) {
827 		if (rinfo->length < 1) {
828 			return -EINVAL;
829 		}
830 	}
831 
832 	pref = rinfo->route_pref;
833 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
834 		return -EINVAL;
835 
836 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
837 
838 	if (rinfo->length == 3)
839 		prefix = (struct in6_addr *)rinfo->prefix;
840 	else {
841 		/* this function is safe */
842 		ipv6_addr_prefix(&prefix_buf,
843 				 (struct in6_addr *)rinfo->prefix,
844 				 rinfo->prefix_len);
845 		prefix = &prefix_buf;
846 	}
847 
848 	if (rinfo->prefix_len == 0)
849 		rt = rt6_get_dflt_router(gwaddr, dev);
850 	else
851 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
852 					gwaddr, dev);
853 
854 	if (rt && !lifetime) {
855 		ip6_del_rt(rt);
856 		rt = NULL;
857 	}
858 
859 	if (!rt && lifetime)
860 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
861 					dev, pref);
862 	else if (rt)
863 		rt->rt6i_flags = RTF_ROUTEINFO |
864 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
865 
866 	if (rt) {
867 		if (!addrconf_finite_timeout(lifetime))
868 			rt6_clean_expires(rt);
869 		else
870 			rt6_set_expires(rt, jiffies + HZ * lifetime);
871 
872 		ip6_rt_put(rt);
873 	}
874 	return 0;
875 }
876 #endif
877 
878 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
879 					struct in6_addr *saddr)
880 {
881 	struct fib6_node *pn, *sn;
882 	while (1) {
883 		if (fn->fn_flags & RTN_TL_ROOT)
884 			return NULL;
885 		pn = rcu_dereference(fn->parent);
886 		sn = FIB6_SUBTREE(pn);
887 		if (sn && sn != fn)
888 			fn = fib6_lookup(sn, NULL, saddr);
889 		else
890 			fn = pn;
891 		if (fn->fn_flags & RTN_RTINFO)
892 			return fn;
893 	}
894 }
895 
896 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
897 			  bool null_fallback)
898 {
899 	struct rt6_info *rt = *prt;
900 
901 	if (dst_hold_safe(&rt->dst))
902 		return true;
903 	if (null_fallback) {
904 		rt = net->ipv6.ip6_null_entry;
905 		dst_hold(&rt->dst);
906 	} else {
907 		rt = NULL;
908 	}
909 	*prt = rt;
910 	return false;
911 }
912 
913 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
914 					     struct fib6_table *table,
915 					     struct flowi6 *fl6, int flags)
916 {
917 	struct rt6_info *rt, *rt_cache;
918 	struct fib6_node *fn;
919 
920 	rcu_read_lock();
921 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
922 restart:
923 	rt = rcu_dereference(fn->leaf);
924 	if (!rt) {
925 		rt = net->ipv6.ip6_null_entry;
926 	} else {
927 		rt = rt6_device_match(net, rt, &fl6->saddr,
928 				      fl6->flowi6_oif, flags);
929 		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
930 			rt = rt6_multipath_select(rt, fl6,
931 						  fl6->flowi6_oif, flags);
932 	}
933 	if (rt == net->ipv6.ip6_null_entry) {
934 		fn = fib6_backtrack(fn, &fl6->saddr);
935 		if (fn)
936 			goto restart;
937 	}
938 	/* Search through exception table */
939 	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
940 	if (rt_cache)
941 		rt = rt_cache;
942 
943 	if (ip6_hold_safe(net, &rt, true))
944 		dst_use_noref(&rt->dst, jiffies);
945 
946 	rcu_read_unlock();
947 
948 	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
949 
950 	return rt;
951 
952 }
953 
954 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
955 				    int flags)
956 {
957 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
958 }
959 EXPORT_SYMBOL_GPL(ip6_route_lookup);
960 
961 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
962 			    const struct in6_addr *saddr, int oif, int strict)
963 {
964 	struct flowi6 fl6 = {
965 		.flowi6_oif = oif,
966 		.daddr = *daddr,
967 	};
968 	struct dst_entry *dst;
969 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
970 
971 	if (saddr) {
972 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
973 		flags |= RT6_LOOKUP_F_HAS_SADDR;
974 	}
975 
976 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
977 	if (dst->error == 0)
978 		return (struct rt6_info *) dst;
979 
980 	dst_release(dst);
981 
982 	return NULL;
983 }
984 EXPORT_SYMBOL(rt6_lookup);
985 
986 /* ip6_ins_rt is called with FREE table->tb6_lock.
987  * It takes new route entry, the addition fails by any reason the
988  * route is released.
989  * Caller must hold dst before calling it.
990  */
991 
992 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
993 			struct mx6_config *mxc,
994 			struct netlink_ext_ack *extack)
995 {
996 	int err;
997 	struct fib6_table *table;
998 
999 	table = rt->rt6i_table;
1000 	spin_lock_bh(&table->tb6_lock);
1001 	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1002 	spin_unlock_bh(&table->tb6_lock);
1003 
1004 	return err;
1005 }
1006 
1007 int ip6_ins_rt(struct rt6_info *rt)
1008 {
1009 	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
1010 	struct mx6_config mxc = { .mx = NULL, };
1011 
1012 	/* Hold dst to account for the reference from the fib6 tree */
1013 	dst_hold(&rt->dst);
1014 	return __ip6_ins_rt(rt, &info, &mxc, NULL);
1015 }
1016 
1017 /* called with rcu_lock held */
1018 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1019 {
1020 	struct net_device *dev = rt->dst.dev;
1021 
1022 	if (rt->rt6i_flags & RTF_LOCAL) {
1023 		/* for copies of local routes, dst->dev needs to be the
1024 		 * device if it is a master device, the master device if
1025 		 * device is enslaved, and the loopback as the default
1026 		 */
1027 		if (netif_is_l3_slave(dev) &&
1028 		    !rt6_need_strict(&rt->rt6i_dst.addr))
1029 			dev = l3mdev_master_dev_rcu(dev);
1030 		else if (!netif_is_l3_master(dev))
1031 			dev = dev_net(dev)->loopback_dev;
1032 		/* last case is netif_is_l3_master(dev) is true in which
1033 		 * case we want dev returned to be dev
1034 		 */
1035 	}
1036 
1037 	return dev;
1038 }
1039 
1040 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1041 					   const struct in6_addr *daddr,
1042 					   const struct in6_addr *saddr)
1043 {
1044 	struct net_device *dev;
1045 	struct rt6_info *rt;
1046 
1047 	/*
1048 	 *	Clone the route.
1049 	 */
1050 
1051 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1052 		ort = (struct rt6_info *)ort->dst.from;
1053 
1054 	rcu_read_lock();
1055 	dev = ip6_rt_get_dev_rcu(ort);
1056 	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1057 	rcu_read_unlock();
1058 	if (!rt)
1059 		return NULL;
1060 
1061 	ip6_rt_copy_init(rt, ort);
1062 	rt->rt6i_flags |= RTF_CACHE;
1063 	rt->rt6i_metric = 0;
1064 	rt->dst.flags |= DST_HOST;
1065 	rt->rt6i_dst.addr = *daddr;
1066 	rt->rt6i_dst.plen = 128;
1067 
1068 	if (!rt6_is_gw_or_nonexthop(ort)) {
1069 		if (ort->rt6i_dst.plen != 128 &&
1070 		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1071 			rt->rt6i_flags |= RTF_ANYCAST;
1072 #ifdef CONFIG_IPV6_SUBTREES
1073 		if (rt->rt6i_src.plen && saddr) {
1074 			rt->rt6i_src.addr = *saddr;
1075 			rt->rt6i_src.plen = 128;
1076 		}
1077 #endif
1078 	}
1079 
1080 	return rt;
1081 }
1082 
1083 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1084 {
1085 	struct net_device *dev;
1086 	struct rt6_info *pcpu_rt;
1087 
1088 	rcu_read_lock();
1089 	dev = ip6_rt_get_dev_rcu(rt);
1090 	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1091 	rcu_read_unlock();
1092 	if (!pcpu_rt)
1093 		return NULL;
1094 	ip6_rt_copy_init(pcpu_rt, rt);
1095 	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1096 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1097 	return pcpu_rt;
1098 }
1099 
1100 /* It should be called with rcu_read_lock() acquired */
1101 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1102 {
1103 	struct rt6_info *pcpu_rt, **p;
1104 
1105 	p = this_cpu_ptr(rt->rt6i_pcpu);
1106 	pcpu_rt = *p;
1107 
1108 	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1109 		rt6_dst_from_metrics_check(pcpu_rt);
1110 
1111 	return pcpu_rt;
1112 }
1113 
1114 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1115 {
1116 	struct rt6_info *pcpu_rt, *prev, **p;
1117 
1118 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1119 	if (!pcpu_rt) {
1120 		struct net *net = dev_net(rt->dst.dev);
1121 
1122 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1123 		return net->ipv6.ip6_null_entry;
1124 	}
1125 
1126 	dst_hold(&pcpu_rt->dst);
1127 	p = this_cpu_ptr(rt->rt6i_pcpu);
1128 	prev = cmpxchg(p, NULL, pcpu_rt);
1129 	BUG_ON(prev);
1130 
1131 	rt6_dst_from_metrics_check(pcpu_rt);
1132 	return pcpu_rt;
1133 }
1134 
1135 /* exception hash table implementation
1136  */
1137 static DEFINE_SPINLOCK(rt6_exception_lock);
1138 
1139 /* Remove rt6_ex from hash table and free the memory
1140  * Caller must hold rt6_exception_lock
1141  */
1142 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1143 				 struct rt6_exception *rt6_ex)
1144 {
1145 	struct net *net;
1146 
1147 	if (!bucket || !rt6_ex)
1148 		return;
1149 
1150 	net = dev_net(rt6_ex->rt6i->dst.dev);
1151 	rt6_ex->rt6i->rt6i_node = NULL;
1152 	hlist_del_rcu(&rt6_ex->hlist);
1153 	rt6_release(rt6_ex->rt6i);
1154 	kfree_rcu(rt6_ex, rcu);
1155 	WARN_ON_ONCE(!bucket->depth);
1156 	bucket->depth--;
1157 	net->ipv6.rt6_stats->fib_rt_cache--;
1158 }
1159 
1160 /* Remove oldest rt6_ex in bucket and free the memory
1161  * Caller must hold rt6_exception_lock
1162  */
1163 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1164 {
1165 	struct rt6_exception *rt6_ex, *oldest = NULL;
1166 
1167 	if (!bucket)
1168 		return;
1169 
1170 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1171 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1172 			oldest = rt6_ex;
1173 	}
1174 	rt6_remove_exception(bucket, oldest);
1175 }
1176 
1177 static u32 rt6_exception_hash(const struct in6_addr *dst,
1178 			      const struct in6_addr *src)
1179 {
1180 	static u32 seed __read_mostly;
1181 	u32 val;
1182 
1183 	net_get_random_once(&seed, sizeof(seed));
1184 	val = jhash(dst, sizeof(*dst), seed);
1185 
1186 #ifdef CONFIG_IPV6_SUBTREES
1187 	if (src)
1188 		val = jhash(src, sizeof(*src), val);
1189 #endif
1190 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1191 }
1192 
1193 /* Helper function to find the cached rt in the hash table
1194  * and update bucket pointer to point to the bucket for this
1195  * (daddr, saddr) pair
1196  * Caller must hold rt6_exception_lock
1197  */
1198 static struct rt6_exception *
1199 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1200 			      const struct in6_addr *daddr,
1201 			      const struct in6_addr *saddr)
1202 {
1203 	struct rt6_exception *rt6_ex;
1204 	u32 hval;
1205 
1206 	if (!(*bucket) || !daddr)
1207 		return NULL;
1208 
1209 	hval = rt6_exception_hash(daddr, saddr);
1210 	*bucket += hval;
1211 
1212 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1213 		struct rt6_info *rt6 = rt6_ex->rt6i;
1214 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1215 
1216 #ifdef CONFIG_IPV6_SUBTREES
1217 		if (matched && saddr)
1218 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1219 #endif
1220 		if (matched)
1221 			return rt6_ex;
1222 	}
1223 	return NULL;
1224 }
1225 
1226 /* Helper function to find the cached rt in the hash table
1227  * and update bucket pointer to point to the bucket for this
1228  * (daddr, saddr) pair
1229  * Caller must hold rcu_read_lock()
1230  */
1231 static struct rt6_exception *
1232 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1233 			 const struct in6_addr *daddr,
1234 			 const struct in6_addr *saddr)
1235 {
1236 	struct rt6_exception *rt6_ex;
1237 	u32 hval;
1238 
1239 	WARN_ON_ONCE(!rcu_read_lock_held());
1240 
1241 	if (!(*bucket) || !daddr)
1242 		return NULL;
1243 
1244 	hval = rt6_exception_hash(daddr, saddr);
1245 	*bucket += hval;
1246 
1247 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1248 		struct rt6_info *rt6 = rt6_ex->rt6i;
1249 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1250 
1251 #ifdef CONFIG_IPV6_SUBTREES
1252 		if (matched && saddr)
1253 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1254 #endif
1255 		if (matched)
1256 			return rt6_ex;
1257 	}
1258 	return NULL;
1259 }
1260 
1261 static int rt6_insert_exception(struct rt6_info *nrt,
1262 				struct rt6_info *ort)
1263 {
1264 	struct net *net = dev_net(ort->dst.dev);
1265 	struct rt6_exception_bucket *bucket;
1266 	struct in6_addr *src_key = NULL;
1267 	struct rt6_exception *rt6_ex;
1268 	int err = 0;
1269 
1270 	/* ort can't be a cache or pcpu route */
1271 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1272 		ort = (struct rt6_info *)ort->dst.from;
1273 	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1274 
1275 	spin_lock_bh(&rt6_exception_lock);
1276 
1277 	if (ort->exception_bucket_flushed) {
1278 		err = -EINVAL;
1279 		goto out;
1280 	}
1281 
1282 	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1283 					lockdep_is_held(&rt6_exception_lock));
1284 	if (!bucket) {
1285 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1286 				 GFP_ATOMIC);
1287 		if (!bucket) {
1288 			err = -ENOMEM;
1289 			goto out;
1290 		}
1291 		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1292 	}
1293 
1294 #ifdef CONFIG_IPV6_SUBTREES
1295 	/* rt6i_src.plen != 0 indicates ort is in subtree
1296 	 * and exception table is indexed by a hash of
1297 	 * both rt6i_dst and rt6i_src.
1298 	 * Otherwise, the exception table is indexed by
1299 	 * a hash of only rt6i_dst.
1300 	 */
1301 	if (ort->rt6i_src.plen)
1302 		src_key = &nrt->rt6i_src.addr;
1303 #endif
1304 
1305 	/* Update rt6i_prefsrc as it could be changed
1306 	 * in rt6_remove_prefsrc()
1307 	 */
1308 	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1309 	/* rt6_mtu_change() might lower mtu on ort.
1310 	 * Only insert this exception route if its mtu
1311 	 * is less than ort's mtu value.
1312 	 */
1313 	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1314 		err = -EINVAL;
1315 		goto out;
1316 	}
1317 
1318 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1319 					       src_key);
1320 	if (rt6_ex)
1321 		rt6_remove_exception(bucket, rt6_ex);
1322 
1323 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1324 	if (!rt6_ex) {
1325 		err = -ENOMEM;
1326 		goto out;
1327 	}
1328 	rt6_ex->rt6i = nrt;
1329 	rt6_ex->stamp = jiffies;
1330 	atomic_inc(&nrt->rt6i_ref);
1331 	nrt->rt6i_node = ort->rt6i_node;
1332 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1333 	bucket->depth++;
1334 	net->ipv6.rt6_stats->fib_rt_cache++;
1335 
1336 	if (bucket->depth > FIB6_MAX_DEPTH)
1337 		rt6_exception_remove_oldest(bucket);
1338 
1339 out:
1340 	spin_unlock_bh(&rt6_exception_lock);
1341 
1342 	/* Update fn->fn_sernum to invalidate all cached dst */
1343 	if (!err)
1344 		fib6_update_sernum(ort);
1345 
1346 	return err;
1347 }
1348 
1349 void rt6_flush_exceptions(struct rt6_info *rt)
1350 {
1351 	struct rt6_exception_bucket *bucket;
1352 	struct rt6_exception *rt6_ex;
1353 	struct hlist_node *tmp;
1354 	int i;
1355 
1356 	spin_lock_bh(&rt6_exception_lock);
1357 	/* Prevent rt6_insert_exception() to recreate the bucket list */
1358 	rt->exception_bucket_flushed = 1;
1359 
1360 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1361 				    lockdep_is_held(&rt6_exception_lock));
1362 	if (!bucket)
1363 		goto out;
1364 
1365 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1366 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1367 			rt6_remove_exception(bucket, rt6_ex);
1368 		WARN_ON_ONCE(bucket->depth);
1369 		bucket++;
1370 	}
1371 
1372 out:
1373 	spin_unlock_bh(&rt6_exception_lock);
1374 }
1375 
1376 /* Find cached rt in the hash table inside passed in rt
1377  * Caller has to hold rcu_read_lock()
1378  */
1379 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1380 					   struct in6_addr *daddr,
1381 					   struct in6_addr *saddr)
1382 {
1383 	struct rt6_exception_bucket *bucket;
1384 	struct in6_addr *src_key = NULL;
1385 	struct rt6_exception *rt6_ex;
1386 	struct rt6_info *res = NULL;
1387 
1388 	bucket = rcu_dereference(rt->rt6i_exception_bucket);
1389 
1390 #ifdef CONFIG_IPV6_SUBTREES
1391 	/* rt6i_src.plen != 0 indicates rt is in subtree
1392 	 * and exception table is indexed by a hash of
1393 	 * both rt6i_dst and rt6i_src.
1394 	 * Otherwise, the exception table is indexed by
1395 	 * a hash of only rt6i_dst.
1396 	 */
1397 	if (rt->rt6i_src.plen)
1398 		src_key = saddr;
1399 #endif
1400 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1401 
1402 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1403 		res = rt6_ex->rt6i;
1404 
1405 	return res;
1406 }
1407 
1408 /* Remove the passed in cached rt from the hash table that contains it */
1409 int rt6_remove_exception_rt(struct rt6_info *rt)
1410 {
1411 	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1412 	struct rt6_exception_bucket *bucket;
1413 	struct in6_addr *src_key = NULL;
1414 	struct rt6_exception *rt6_ex;
1415 	int err;
1416 
1417 	if (!from ||
1418 	    !(rt->rt6i_flags & RTF_CACHE))
1419 		return -EINVAL;
1420 
1421 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1422 		return -ENOENT;
1423 
1424 	spin_lock_bh(&rt6_exception_lock);
1425 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1426 				    lockdep_is_held(&rt6_exception_lock));
1427 #ifdef CONFIG_IPV6_SUBTREES
1428 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1429 	 * and exception table is indexed by a hash of
1430 	 * both rt6i_dst and rt6i_src.
1431 	 * Otherwise, the exception table is indexed by
1432 	 * a hash of only rt6i_dst.
1433 	 */
1434 	if (from->rt6i_src.plen)
1435 		src_key = &rt->rt6i_src.addr;
1436 #endif
1437 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1438 					       &rt->rt6i_dst.addr,
1439 					       src_key);
1440 	if (rt6_ex) {
1441 		rt6_remove_exception(bucket, rt6_ex);
1442 		err = 0;
1443 	} else {
1444 		err = -ENOENT;
1445 	}
1446 
1447 	spin_unlock_bh(&rt6_exception_lock);
1448 	return err;
1449 }
1450 
1451 /* Find rt6_ex which contains the passed in rt cache and
1452  * refresh its stamp
1453  */
1454 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1455 {
1456 	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1457 	struct rt6_exception_bucket *bucket;
1458 	struct in6_addr *src_key = NULL;
1459 	struct rt6_exception *rt6_ex;
1460 
1461 	if (!from ||
1462 	    !(rt->rt6i_flags & RTF_CACHE))
1463 		return;
1464 
1465 	rcu_read_lock();
1466 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1467 
1468 #ifdef CONFIG_IPV6_SUBTREES
1469 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1470 	 * and exception table is indexed by a hash of
1471 	 * both rt6i_dst and rt6i_src.
1472 	 * Otherwise, the exception table is indexed by
1473 	 * a hash of only rt6i_dst.
1474 	 */
1475 	if (from->rt6i_src.plen)
1476 		src_key = &rt->rt6i_src.addr;
1477 #endif
1478 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1479 					  &rt->rt6i_dst.addr,
1480 					  src_key);
1481 	if (rt6_ex)
1482 		rt6_ex->stamp = jiffies;
1483 
1484 	rcu_read_unlock();
1485 }
1486 
1487 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1488 {
1489 	struct rt6_exception_bucket *bucket;
1490 	struct rt6_exception *rt6_ex;
1491 	int i;
1492 
1493 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1494 					lockdep_is_held(&rt6_exception_lock));
1495 
1496 	if (bucket) {
1497 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1498 			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1499 				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1500 			}
1501 			bucket++;
1502 		}
1503 	}
1504 }
1505 
1506 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1507 {
1508 	struct rt6_exception_bucket *bucket;
1509 	struct rt6_exception *rt6_ex;
1510 	int i;
1511 
1512 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1513 					lockdep_is_held(&rt6_exception_lock));
1514 
1515 	if (bucket) {
1516 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1517 			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1518 				struct rt6_info *entry = rt6_ex->rt6i;
1519 				/* For RTF_CACHE with rt6i_pmtu == 0
1520 				 * (i.e. a redirected route),
1521 				 * the metrics of its rt->dst.from has already
1522 				 * been updated.
1523 				 */
1524 				if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1525 					entry->rt6i_pmtu = mtu;
1526 			}
1527 			bucket++;
1528 		}
1529 	}
1530 }
1531 
1532 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1533 
1534 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1535 					struct in6_addr *gateway)
1536 {
1537 	struct rt6_exception_bucket *bucket;
1538 	struct rt6_exception *rt6_ex;
1539 	struct hlist_node *tmp;
1540 	int i;
1541 
1542 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1543 		return;
1544 
1545 	spin_lock_bh(&rt6_exception_lock);
1546 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1547 				     lockdep_is_held(&rt6_exception_lock));
1548 
1549 	if (bucket) {
1550 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1551 			hlist_for_each_entry_safe(rt6_ex, tmp,
1552 						  &bucket->chain, hlist) {
1553 				struct rt6_info *entry = rt6_ex->rt6i;
1554 
1555 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1556 				    RTF_CACHE_GATEWAY &&
1557 				    ipv6_addr_equal(gateway,
1558 						    &entry->rt6i_gateway)) {
1559 					rt6_remove_exception(bucket, rt6_ex);
1560 				}
1561 			}
1562 			bucket++;
1563 		}
1564 	}
1565 
1566 	spin_unlock_bh(&rt6_exception_lock);
1567 }
1568 
1569 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1570 				      struct rt6_exception *rt6_ex,
1571 				      struct fib6_gc_args *gc_args,
1572 				      unsigned long now)
1573 {
1574 	struct rt6_info *rt = rt6_ex->rt6i;
1575 
1576 	if (atomic_read(&rt->dst.__refcnt) == 1 &&
1577 	    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1578 		RT6_TRACE("aging clone %p\n", rt);
1579 		rt6_remove_exception(bucket, rt6_ex);
1580 		return;
1581 	} else if (rt->rt6i_flags & RTF_GATEWAY) {
1582 		struct neighbour *neigh;
1583 		__u8 neigh_flags = 0;
1584 
1585 		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1586 		if (neigh) {
1587 			neigh_flags = neigh->flags;
1588 			neigh_release(neigh);
1589 		}
1590 		if (!(neigh_flags & NTF_ROUTER)) {
1591 			RT6_TRACE("purging route %p via non-router but gateway\n",
1592 				  rt);
1593 			rt6_remove_exception(bucket, rt6_ex);
1594 			return;
1595 		}
1596 	}
1597 	gc_args->more++;
1598 }
1599 
1600 void rt6_age_exceptions(struct rt6_info *rt,
1601 			struct fib6_gc_args *gc_args,
1602 			unsigned long now)
1603 {
1604 	struct rt6_exception_bucket *bucket;
1605 	struct rt6_exception *rt6_ex;
1606 	struct hlist_node *tmp;
1607 	int i;
1608 
1609 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1610 		return;
1611 
1612 	spin_lock_bh(&rt6_exception_lock);
1613 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1614 				    lockdep_is_held(&rt6_exception_lock));
1615 
1616 	if (bucket) {
1617 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1618 			hlist_for_each_entry_safe(rt6_ex, tmp,
1619 						  &bucket->chain, hlist) {
1620 				rt6_age_examine_exception(bucket, rt6_ex,
1621 							  gc_args, now);
1622 			}
1623 			bucket++;
1624 		}
1625 	}
1626 	spin_unlock_bh(&rt6_exception_lock);
1627 }
1628 
1629 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1630 			       int oif, struct flowi6 *fl6, int flags)
1631 {
1632 	struct fib6_node *fn, *saved_fn;
1633 	struct rt6_info *rt, *rt_cache;
1634 	int strict = 0;
1635 
1636 	strict |= flags & RT6_LOOKUP_F_IFACE;
1637 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1638 	if (net->ipv6.devconf_all->forwarding == 0)
1639 		strict |= RT6_LOOKUP_F_REACHABLE;
1640 
1641 	rcu_read_lock();
1642 
1643 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1644 	saved_fn = fn;
1645 
1646 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1647 		oif = 0;
1648 
1649 redo_rt6_select:
1650 	rt = rt6_select(net, fn, oif, strict);
1651 	if (rt->rt6i_nsiblings)
1652 		rt = rt6_multipath_select(rt, fl6, oif, strict);
1653 	if (rt == net->ipv6.ip6_null_entry) {
1654 		fn = fib6_backtrack(fn, &fl6->saddr);
1655 		if (fn)
1656 			goto redo_rt6_select;
1657 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1658 			/* also consider unreachable route */
1659 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1660 			fn = saved_fn;
1661 			goto redo_rt6_select;
1662 		}
1663 	}
1664 
1665 	/*Search through exception table */
1666 	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1667 	if (rt_cache)
1668 		rt = rt_cache;
1669 
1670 	if (rt == net->ipv6.ip6_null_entry) {
1671 		rcu_read_unlock();
1672 		dst_hold(&rt->dst);
1673 		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1674 		return rt;
1675 	} else if (rt->rt6i_flags & RTF_CACHE) {
1676 		if (ip6_hold_safe(net, &rt, true)) {
1677 			dst_use_noref(&rt->dst, jiffies);
1678 			rt6_dst_from_metrics_check(rt);
1679 		}
1680 		rcu_read_unlock();
1681 		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1682 		return rt;
1683 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1684 			    !(rt->rt6i_flags & RTF_GATEWAY))) {
1685 		/* Create a RTF_CACHE clone which will not be
1686 		 * owned by the fib6 tree.  It is for the special case where
1687 		 * the daddr in the skb during the neighbor look-up is different
1688 		 * from the fl6->daddr used to look-up route here.
1689 		 */
1690 
1691 		struct rt6_info *uncached_rt;
1692 
1693 		if (ip6_hold_safe(net, &rt, true)) {
1694 			dst_use_noref(&rt->dst, jiffies);
1695 		} else {
1696 			rcu_read_unlock();
1697 			uncached_rt = rt;
1698 			goto uncached_rt_out;
1699 		}
1700 		rcu_read_unlock();
1701 
1702 		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1703 		dst_release(&rt->dst);
1704 
1705 		if (uncached_rt) {
1706 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1707 			 * No need for another dst_hold()
1708 			 */
1709 			rt6_uncached_list_add(uncached_rt);
1710 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1711 		} else {
1712 			uncached_rt = net->ipv6.ip6_null_entry;
1713 			dst_hold(&uncached_rt->dst);
1714 		}
1715 
1716 uncached_rt_out:
1717 		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1718 		return uncached_rt;
1719 
1720 	} else {
1721 		/* Get a percpu copy */
1722 
1723 		struct rt6_info *pcpu_rt;
1724 
1725 		dst_use_noref(&rt->dst, jiffies);
1726 		local_bh_disable();
1727 		pcpu_rt = rt6_get_pcpu_route(rt);
1728 
1729 		if (!pcpu_rt) {
1730 			/* atomic_inc_not_zero() is needed when using rcu */
1731 			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1732 				/* No dst_hold() on rt is needed because grabbing
1733 				 * rt->rt6i_ref makes sure rt can't be released.
1734 				 */
1735 				pcpu_rt = rt6_make_pcpu_route(rt);
1736 				rt6_release(rt);
1737 			} else {
1738 				/* rt is already removed from tree */
1739 				pcpu_rt = net->ipv6.ip6_null_entry;
1740 				dst_hold(&pcpu_rt->dst);
1741 			}
1742 		}
1743 		local_bh_enable();
1744 		rcu_read_unlock();
1745 		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1746 		return pcpu_rt;
1747 	}
1748 }
1749 EXPORT_SYMBOL_GPL(ip6_pol_route);
1750 
1751 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1752 					    struct flowi6 *fl6, int flags)
1753 {
1754 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1755 }
1756 
1757 struct dst_entry *ip6_route_input_lookup(struct net *net,
1758 					 struct net_device *dev,
1759 					 struct flowi6 *fl6, int flags)
1760 {
1761 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1762 		flags |= RT6_LOOKUP_F_IFACE;
1763 
1764 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1765 }
1766 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1767 
1768 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1769 				  struct flow_keys *keys)
1770 {
1771 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1772 	const struct ipv6hdr *key_iph = outer_iph;
1773 	const struct ipv6hdr *inner_iph;
1774 	const struct icmp6hdr *icmph;
1775 	struct ipv6hdr _inner_iph;
1776 
1777 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1778 		goto out;
1779 
1780 	icmph = icmp6_hdr(skb);
1781 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1782 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1783 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1784 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1785 		goto out;
1786 
1787 	inner_iph = skb_header_pointer(skb,
1788 				       skb_transport_offset(skb) + sizeof(*icmph),
1789 				       sizeof(_inner_iph), &_inner_iph);
1790 	if (!inner_iph)
1791 		goto out;
1792 
1793 	key_iph = inner_iph;
1794 out:
1795 	memset(keys, 0, sizeof(*keys));
1796 	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1797 	keys->addrs.v6addrs.src = key_iph->saddr;
1798 	keys->addrs.v6addrs.dst = key_iph->daddr;
1799 	keys->tags.flow_label = ip6_flowinfo(key_iph);
1800 	keys->basic.ip_proto = key_iph->nexthdr;
1801 }
1802 
1803 /* if skb is set it will be used and fl6 can be NULL */
1804 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1805 {
1806 	struct flow_keys hash_keys;
1807 
1808 	if (skb) {
1809 		ip6_multipath_l3_keys(skb, &hash_keys);
1810 		return flow_hash_from_keys(&hash_keys);
1811 	}
1812 
1813 	return get_hash_from_flowi6(fl6);
1814 }
1815 
1816 void ip6_route_input(struct sk_buff *skb)
1817 {
1818 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1819 	struct net *net = dev_net(skb->dev);
1820 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1821 	struct ip_tunnel_info *tun_info;
1822 	struct flowi6 fl6 = {
1823 		.flowi6_iif = skb->dev->ifindex,
1824 		.daddr = iph->daddr,
1825 		.saddr = iph->saddr,
1826 		.flowlabel = ip6_flowinfo(iph),
1827 		.flowi6_mark = skb->mark,
1828 		.flowi6_proto = iph->nexthdr,
1829 	};
1830 
1831 	tun_info = skb_tunnel_info(skb);
1832 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1833 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1834 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1835 		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1836 	skb_dst_drop(skb);
1837 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1838 }
1839 
1840 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1841 					     struct flowi6 *fl6, int flags)
1842 {
1843 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1844 }
1845 
1846 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1847 					 struct flowi6 *fl6, int flags)
1848 {
1849 	bool any_src;
1850 
1851 	if (rt6_need_strict(&fl6->daddr)) {
1852 		struct dst_entry *dst;
1853 
1854 		dst = l3mdev_link_scope_lookup(net, fl6);
1855 		if (dst)
1856 			return dst;
1857 	}
1858 
1859 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1860 
1861 	any_src = ipv6_addr_any(&fl6->saddr);
1862 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1863 	    (fl6->flowi6_oif && any_src))
1864 		flags |= RT6_LOOKUP_F_IFACE;
1865 
1866 	if (!any_src)
1867 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1868 	else if (sk)
1869 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1870 
1871 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1872 }
1873 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1874 
1875 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1876 {
1877 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1878 	struct net_device *loopback_dev = net->loopback_dev;
1879 	struct dst_entry *new = NULL;
1880 
1881 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1882 		       DST_OBSOLETE_DEAD, 0);
1883 	if (rt) {
1884 		rt6_info_init(rt);
1885 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1886 
1887 		new = &rt->dst;
1888 		new->__use = 1;
1889 		new->input = dst_discard;
1890 		new->output = dst_discard_out;
1891 
1892 		dst_copy_metrics(new, &ort->dst);
1893 
1894 		rt->rt6i_idev = in6_dev_get(loopback_dev);
1895 		rt->rt6i_gateway = ort->rt6i_gateway;
1896 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1897 		rt->rt6i_metric = 0;
1898 
1899 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1900 #ifdef CONFIG_IPV6_SUBTREES
1901 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1902 #endif
1903 	}
1904 
1905 	dst_release(dst_orig);
1906 	return new ? new : ERR_PTR(-ENOMEM);
1907 }
1908 
1909 /*
1910  *	Destination cache support functions
1911  */
1912 
1913 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1914 {
1915 	if (rt->dst.from &&
1916 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1917 		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1918 }
1919 
1920 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1921 {
1922 	u32 rt_cookie = 0;
1923 
1924 	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1925 		return NULL;
1926 
1927 	if (rt6_check_expired(rt))
1928 		return NULL;
1929 
1930 	return &rt->dst;
1931 }
1932 
1933 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1934 {
1935 	if (!__rt6_check_expired(rt) &&
1936 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1937 	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1938 		return &rt->dst;
1939 	else
1940 		return NULL;
1941 }
1942 
1943 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1944 {
1945 	struct rt6_info *rt;
1946 
1947 	rt = (struct rt6_info *) dst;
1948 
1949 	/* All IPV6 dsts are created with ->obsolete set to the value
1950 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1951 	 * into this function always.
1952 	 */
1953 
1954 	rt6_dst_from_metrics_check(rt);
1955 
1956 	if (rt->rt6i_flags & RTF_PCPU ||
1957 	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1958 		return rt6_dst_from_check(rt, cookie);
1959 	else
1960 		return rt6_check(rt, cookie);
1961 }
1962 
1963 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1964 {
1965 	struct rt6_info *rt = (struct rt6_info *) dst;
1966 
1967 	if (rt) {
1968 		if (rt->rt6i_flags & RTF_CACHE) {
1969 			if (rt6_check_expired(rt)) {
1970 				ip6_del_rt(rt);
1971 				dst = NULL;
1972 			}
1973 		} else {
1974 			dst_release(dst);
1975 			dst = NULL;
1976 		}
1977 	}
1978 	return dst;
1979 }
1980 
1981 static void ip6_link_failure(struct sk_buff *skb)
1982 {
1983 	struct rt6_info *rt;
1984 
1985 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1986 
1987 	rt = (struct rt6_info *) skb_dst(skb);
1988 	if (rt) {
1989 		if (rt->rt6i_flags & RTF_CACHE) {
1990 			if (dst_hold_safe(&rt->dst))
1991 				ip6_del_rt(rt);
1992 		} else {
1993 			struct fib6_node *fn;
1994 
1995 			rcu_read_lock();
1996 			fn = rcu_dereference(rt->rt6i_node);
1997 			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
1998 				fn->fn_sernum = -1;
1999 			rcu_read_unlock();
2000 		}
2001 	}
2002 }
2003 
2004 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2005 {
2006 	struct net *net = dev_net(rt->dst.dev);
2007 
2008 	rt->rt6i_flags |= RTF_MODIFIED;
2009 	rt->rt6i_pmtu = mtu;
2010 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2011 }
2012 
2013 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2014 {
2015 	return !(rt->rt6i_flags & RTF_CACHE) &&
2016 		(rt->rt6i_flags & RTF_PCPU ||
2017 		 rcu_access_pointer(rt->rt6i_node));
2018 }
2019 
2020 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2021 				 const struct ipv6hdr *iph, u32 mtu)
2022 {
2023 	const struct in6_addr *daddr, *saddr;
2024 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2025 
2026 	if (rt6->rt6i_flags & RTF_LOCAL)
2027 		return;
2028 
2029 	if (dst_metric_locked(dst, RTAX_MTU))
2030 		return;
2031 
2032 	if (iph) {
2033 		daddr = &iph->daddr;
2034 		saddr = &iph->saddr;
2035 	} else if (sk) {
2036 		daddr = &sk->sk_v6_daddr;
2037 		saddr = &inet6_sk(sk)->saddr;
2038 	} else {
2039 		daddr = NULL;
2040 		saddr = NULL;
2041 	}
2042 	dst_confirm_neigh(dst, daddr);
2043 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2044 	if (mtu >= dst_mtu(dst))
2045 		return;
2046 
2047 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2048 		rt6_do_update_pmtu(rt6, mtu);
2049 		/* update rt6_ex->stamp for cache */
2050 		if (rt6->rt6i_flags & RTF_CACHE)
2051 			rt6_update_exception_stamp_rt(rt6);
2052 	} else if (daddr) {
2053 		struct rt6_info *nrt6;
2054 
2055 		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2056 		if (nrt6) {
2057 			rt6_do_update_pmtu(nrt6, mtu);
2058 			if (rt6_insert_exception(nrt6, rt6))
2059 				dst_release_immediate(&nrt6->dst);
2060 		}
2061 	}
2062 }
2063 
2064 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2065 			       struct sk_buff *skb, u32 mtu)
2066 {
2067 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2068 }
2069 
2070 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2071 		     int oif, u32 mark, kuid_t uid)
2072 {
2073 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2074 	struct dst_entry *dst;
2075 	struct flowi6 fl6;
2076 
2077 	memset(&fl6, 0, sizeof(fl6));
2078 	fl6.flowi6_oif = oif;
2079 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2080 	fl6.daddr = iph->daddr;
2081 	fl6.saddr = iph->saddr;
2082 	fl6.flowlabel = ip6_flowinfo(iph);
2083 	fl6.flowi6_uid = uid;
2084 
2085 	dst = ip6_route_output(net, NULL, &fl6);
2086 	if (!dst->error)
2087 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2088 	dst_release(dst);
2089 }
2090 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2091 
2092 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2093 {
2094 	struct dst_entry *dst;
2095 
2096 	ip6_update_pmtu(skb, sock_net(sk), mtu,
2097 			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2098 
2099 	dst = __sk_dst_get(sk);
2100 	if (!dst || !dst->obsolete ||
2101 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2102 		return;
2103 
2104 	bh_lock_sock(sk);
2105 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2106 		ip6_datagram_dst_update(sk, false);
2107 	bh_unlock_sock(sk);
2108 }
2109 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2110 
2111 /* Handle redirects */
2112 struct ip6rd_flowi {
2113 	struct flowi6 fl6;
2114 	struct in6_addr gateway;
2115 };
2116 
2117 static struct rt6_info *__ip6_route_redirect(struct net *net,
2118 					     struct fib6_table *table,
2119 					     struct flowi6 *fl6,
2120 					     int flags)
2121 {
2122 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2123 	struct rt6_info *rt, *rt_cache;
2124 	struct fib6_node *fn;
2125 
2126 	/* Get the "current" route for this destination and
2127 	 * check if the redirect has come from appropriate router.
2128 	 *
2129 	 * RFC 4861 specifies that redirects should only be
2130 	 * accepted if they come from the nexthop to the target.
2131 	 * Due to the way the routes are chosen, this notion
2132 	 * is a bit fuzzy and one might need to check all possible
2133 	 * routes.
2134 	 */
2135 
2136 	rcu_read_lock();
2137 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2138 restart:
2139 	for_each_fib6_node_rt_rcu(fn) {
2140 		if (rt6_check_expired(rt))
2141 			continue;
2142 		if (rt->dst.error)
2143 			break;
2144 		if (!(rt->rt6i_flags & RTF_GATEWAY))
2145 			continue;
2146 		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2147 			continue;
2148 		/* rt_cache's gateway might be different from its 'parent'
2149 		 * in the case of an ip redirect.
2150 		 * So we keep searching in the exception table if the gateway
2151 		 * is different.
2152 		 */
2153 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2154 			rt_cache = rt6_find_cached_rt(rt,
2155 						      &fl6->daddr,
2156 						      &fl6->saddr);
2157 			if (rt_cache &&
2158 			    ipv6_addr_equal(&rdfl->gateway,
2159 					    &rt_cache->rt6i_gateway)) {
2160 				rt = rt_cache;
2161 				break;
2162 			}
2163 			continue;
2164 		}
2165 		break;
2166 	}
2167 
2168 	if (!rt)
2169 		rt = net->ipv6.ip6_null_entry;
2170 	else if (rt->dst.error) {
2171 		rt = net->ipv6.ip6_null_entry;
2172 		goto out;
2173 	}
2174 
2175 	if (rt == net->ipv6.ip6_null_entry) {
2176 		fn = fib6_backtrack(fn, &fl6->saddr);
2177 		if (fn)
2178 			goto restart;
2179 	}
2180 
2181 out:
2182 	ip6_hold_safe(net, &rt, true);
2183 
2184 	rcu_read_unlock();
2185 
2186 	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
2187 	return rt;
2188 };
2189 
2190 static struct dst_entry *ip6_route_redirect(struct net *net,
2191 					const struct flowi6 *fl6,
2192 					const struct in6_addr *gateway)
2193 {
2194 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2195 	struct ip6rd_flowi rdfl;
2196 
2197 	rdfl.fl6 = *fl6;
2198 	rdfl.gateway = *gateway;
2199 
2200 	return fib6_rule_lookup(net, &rdfl.fl6,
2201 				flags, __ip6_route_redirect);
2202 }
2203 
2204 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2205 		  kuid_t uid)
2206 {
2207 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2208 	struct dst_entry *dst;
2209 	struct flowi6 fl6;
2210 
2211 	memset(&fl6, 0, sizeof(fl6));
2212 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2213 	fl6.flowi6_oif = oif;
2214 	fl6.flowi6_mark = mark;
2215 	fl6.daddr = iph->daddr;
2216 	fl6.saddr = iph->saddr;
2217 	fl6.flowlabel = ip6_flowinfo(iph);
2218 	fl6.flowi6_uid = uid;
2219 
2220 	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2221 	rt6_do_redirect(dst, NULL, skb);
2222 	dst_release(dst);
2223 }
2224 EXPORT_SYMBOL_GPL(ip6_redirect);
2225 
2226 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2227 			    u32 mark)
2228 {
2229 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2230 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2231 	struct dst_entry *dst;
2232 	struct flowi6 fl6;
2233 
2234 	memset(&fl6, 0, sizeof(fl6));
2235 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2236 	fl6.flowi6_oif = oif;
2237 	fl6.flowi6_mark = mark;
2238 	fl6.daddr = msg->dest;
2239 	fl6.saddr = iph->daddr;
2240 	fl6.flowi6_uid = sock_net_uid(net, NULL);
2241 
2242 	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2243 	rt6_do_redirect(dst, NULL, skb);
2244 	dst_release(dst);
2245 }
2246 
2247 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2248 {
2249 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2250 		     sk->sk_uid);
2251 }
2252 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2253 
2254 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2255 {
2256 	struct net_device *dev = dst->dev;
2257 	unsigned int mtu = dst_mtu(dst);
2258 	struct net *net = dev_net(dev);
2259 
2260 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2261 
2262 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2263 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2264 
2265 	/*
2266 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2267 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2268 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2269 	 * rely only on pmtu discovery"
2270 	 */
2271 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2272 		mtu = IPV6_MAXPLEN;
2273 	return mtu;
2274 }
2275 
2276 static unsigned int ip6_mtu(const struct dst_entry *dst)
2277 {
2278 	const struct rt6_info *rt = (const struct rt6_info *)dst;
2279 	unsigned int mtu = rt->rt6i_pmtu;
2280 	struct inet6_dev *idev;
2281 
2282 	if (mtu)
2283 		goto out;
2284 
2285 	mtu = dst_metric_raw(dst, RTAX_MTU);
2286 	if (mtu)
2287 		goto out;
2288 
2289 	mtu = IPV6_MIN_MTU;
2290 
2291 	rcu_read_lock();
2292 	idev = __in6_dev_get(dst->dev);
2293 	if (idev)
2294 		mtu = idev->cnf.mtu6;
2295 	rcu_read_unlock();
2296 
2297 out:
2298 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2299 
2300 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2301 }
2302 
2303 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2304 				  struct flowi6 *fl6)
2305 {
2306 	struct dst_entry *dst;
2307 	struct rt6_info *rt;
2308 	struct inet6_dev *idev = in6_dev_get(dev);
2309 	struct net *net = dev_net(dev);
2310 
2311 	if (unlikely(!idev))
2312 		return ERR_PTR(-ENODEV);
2313 
2314 	rt = ip6_dst_alloc(net, dev, 0);
2315 	if (unlikely(!rt)) {
2316 		in6_dev_put(idev);
2317 		dst = ERR_PTR(-ENOMEM);
2318 		goto out;
2319 	}
2320 
2321 	rt->dst.flags |= DST_HOST;
2322 	rt->dst.output  = ip6_output;
2323 	rt->rt6i_gateway  = fl6->daddr;
2324 	rt->rt6i_dst.addr = fl6->daddr;
2325 	rt->rt6i_dst.plen = 128;
2326 	rt->rt6i_idev     = idev;
2327 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2328 
2329 	/* Add this dst into uncached_list so that rt6_ifdown() can
2330 	 * do proper release of the net_device
2331 	 */
2332 	rt6_uncached_list_add(rt);
2333 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2334 
2335 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2336 
2337 out:
2338 	return dst;
2339 }
2340 
2341 static int ip6_dst_gc(struct dst_ops *ops)
2342 {
2343 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2344 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2345 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2346 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2347 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2348 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2349 	int entries;
2350 
2351 	entries = dst_entries_get_fast(ops);
2352 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2353 	    entries <= rt_max_size)
2354 		goto out;
2355 
2356 	net->ipv6.ip6_rt_gc_expire++;
2357 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2358 	entries = dst_entries_get_slow(ops);
2359 	if (entries < ops->gc_thresh)
2360 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2361 out:
2362 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2363 	return entries > rt_max_size;
2364 }
2365 
2366 static int ip6_convert_metrics(struct mx6_config *mxc,
2367 			       const struct fib6_config *cfg)
2368 {
2369 	bool ecn_ca = false;
2370 	struct nlattr *nla;
2371 	int remaining;
2372 	u32 *mp;
2373 
2374 	if (!cfg->fc_mx)
2375 		return 0;
2376 
2377 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2378 	if (unlikely(!mp))
2379 		return -ENOMEM;
2380 
2381 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2382 		int type = nla_type(nla);
2383 		u32 val;
2384 
2385 		if (!type)
2386 			continue;
2387 		if (unlikely(type > RTAX_MAX))
2388 			goto err;
2389 
2390 		if (type == RTAX_CC_ALGO) {
2391 			char tmp[TCP_CA_NAME_MAX];
2392 
2393 			nla_strlcpy(tmp, nla, sizeof(tmp));
2394 			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
2395 			if (val == TCP_CA_UNSPEC)
2396 				goto err;
2397 		} else {
2398 			val = nla_get_u32(nla);
2399 		}
2400 		if (type == RTAX_HOPLIMIT && val > 255)
2401 			val = 255;
2402 		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2403 			goto err;
2404 
2405 		mp[type - 1] = val;
2406 		__set_bit(type - 1, mxc->mx_valid);
2407 	}
2408 
2409 	if (ecn_ca) {
2410 		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2411 		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2412 	}
2413 
2414 	mxc->mx = mp;
2415 	return 0;
2416  err:
2417 	kfree(mp);
2418 	return -EINVAL;
2419 }
2420 
2421 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2422 					    struct fib6_config *cfg,
2423 					    const struct in6_addr *gw_addr)
2424 {
2425 	struct flowi6 fl6 = {
2426 		.flowi6_oif = cfg->fc_ifindex,
2427 		.daddr = *gw_addr,
2428 		.saddr = cfg->fc_prefsrc,
2429 	};
2430 	struct fib6_table *table;
2431 	struct rt6_info *rt;
2432 	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2433 
2434 	table = fib6_get_table(net, cfg->fc_table);
2435 	if (!table)
2436 		return NULL;
2437 
2438 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2439 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2440 
2441 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2442 
2443 	/* if table lookup failed, fall back to full lookup */
2444 	if (rt == net->ipv6.ip6_null_entry) {
2445 		ip6_rt_put(rt);
2446 		rt = NULL;
2447 	}
2448 
2449 	return rt;
2450 }
2451 
2452 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2453 					      struct netlink_ext_ack *extack)
2454 {
2455 	struct net *net = cfg->fc_nlinfo.nl_net;
2456 	struct rt6_info *rt = NULL;
2457 	struct net_device *dev = NULL;
2458 	struct inet6_dev *idev = NULL;
2459 	struct fib6_table *table;
2460 	int addr_type;
2461 	int err = -EINVAL;
2462 
2463 	/* RTF_PCPU is an internal flag; can not be set by userspace */
2464 	if (cfg->fc_flags & RTF_PCPU) {
2465 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2466 		goto out;
2467 	}
2468 
2469 	if (cfg->fc_dst_len > 128) {
2470 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
2471 		goto out;
2472 	}
2473 	if (cfg->fc_src_len > 128) {
2474 		NL_SET_ERR_MSG(extack, "Invalid source address length");
2475 		goto out;
2476 	}
2477 #ifndef CONFIG_IPV6_SUBTREES
2478 	if (cfg->fc_src_len) {
2479 		NL_SET_ERR_MSG(extack,
2480 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
2481 		goto out;
2482 	}
2483 #endif
2484 	if (cfg->fc_ifindex) {
2485 		err = -ENODEV;
2486 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2487 		if (!dev)
2488 			goto out;
2489 		idev = in6_dev_get(dev);
2490 		if (!idev)
2491 			goto out;
2492 	}
2493 
2494 	if (cfg->fc_metric == 0)
2495 		cfg->fc_metric = IP6_RT_PRIO_USER;
2496 
2497 	err = -ENOBUFS;
2498 	if (cfg->fc_nlinfo.nlh &&
2499 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2500 		table = fib6_get_table(net, cfg->fc_table);
2501 		if (!table) {
2502 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2503 			table = fib6_new_table(net, cfg->fc_table);
2504 		}
2505 	} else {
2506 		table = fib6_new_table(net, cfg->fc_table);
2507 	}
2508 
2509 	if (!table)
2510 		goto out;
2511 
2512 	rt = ip6_dst_alloc(net, NULL,
2513 			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2514 
2515 	if (!rt) {
2516 		err = -ENOMEM;
2517 		goto out;
2518 	}
2519 
2520 	if (cfg->fc_flags & RTF_EXPIRES)
2521 		rt6_set_expires(rt, jiffies +
2522 				clock_t_to_jiffies(cfg->fc_expires));
2523 	else
2524 		rt6_clean_expires(rt);
2525 
2526 	if (cfg->fc_protocol == RTPROT_UNSPEC)
2527 		cfg->fc_protocol = RTPROT_BOOT;
2528 	rt->rt6i_protocol = cfg->fc_protocol;
2529 
2530 	addr_type = ipv6_addr_type(&cfg->fc_dst);
2531 
2532 	if (addr_type & IPV6_ADDR_MULTICAST)
2533 		rt->dst.input = ip6_mc_input;
2534 	else if (cfg->fc_flags & RTF_LOCAL)
2535 		rt->dst.input = ip6_input;
2536 	else
2537 		rt->dst.input = ip6_forward;
2538 
2539 	rt->dst.output = ip6_output;
2540 
2541 	if (cfg->fc_encap) {
2542 		struct lwtunnel_state *lwtstate;
2543 
2544 		err = lwtunnel_build_state(cfg->fc_encap_type,
2545 					   cfg->fc_encap, AF_INET6, cfg,
2546 					   &lwtstate, extack);
2547 		if (err)
2548 			goto out;
2549 		rt->dst.lwtstate = lwtstate_get(lwtstate);
2550 		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2551 			rt->dst.lwtstate->orig_output = rt->dst.output;
2552 			rt->dst.output = lwtunnel_output;
2553 		}
2554 		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2555 			rt->dst.lwtstate->orig_input = rt->dst.input;
2556 			rt->dst.input = lwtunnel_input;
2557 		}
2558 	}
2559 
2560 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2561 	rt->rt6i_dst.plen = cfg->fc_dst_len;
2562 	if (rt->rt6i_dst.plen == 128)
2563 		rt->dst.flags |= DST_HOST;
2564 
2565 #ifdef CONFIG_IPV6_SUBTREES
2566 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2567 	rt->rt6i_src.plen = cfg->fc_src_len;
2568 #endif
2569 
2570 	rt->rt6i_metric = cfg->fc_metric;
2571 
2572 	/* We cannot add true routes via loopback here,
2573 	   they would result in kernel looping; promote them to reject routes
2574 	 */
2575 	if ((cfg->fc_flags & RTF_REJECT) ||
2576 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2577 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2578 	     !(cfg->fc_flags & RTF_LOCAL))) {
2579 		/* hold loopback dev/idev if we haven't done so. */
2580 		if (dev != net->loopback_dev) {
2581 			if (dev) {
2582 				dev_put(dev);
2583 				in6_dev_put(idev);
2584 			}
2585 			dev = net->loopback_dev;
2586 			dev_hold(dev);
2587 			idev = in6_dev_get(dev);
2588 			if (!idev) {
2589 				err = -ENODEV;
2590 				goto out;
2591 			}
2592 		}
2593 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2594 		switch (cfg->fc_type) {
2595 		case RTN_BLACKHOLE:
2596 			rt->dst.error = -EINVAL;
2597 			rt->dst.output = dst_discard_out;
2598 			rt->dst.input = dst_discard;
2599 			break;
2600 		case RTN_PROHIBIT:
2601 			rt->dst.error = -EACCES;
2602 			rt->dst.output = ip6_pkt_prohibit_out;
2603 			rt->dst.input = ip6_pkt_prohibit;
2604 			break;
2605 		case RTN_THROW:
2606 		case RTN_UNREACHABLE:
2607 		default:
2608 			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2609 					: (cfg->fc_type == RTN_UNREACHABLE)
2610 					? -EHOSTUNREACH : -ENETUNREACH;
2611 			rt->dst.output = ip6_pkt_discard_out;
2612 			rt->dst.input = ip6_pkt_discard;
2613 			break;
2614 		}
2615 		goto install_route;
2616 	}
2617 
2618 	if (cfg->fc_flags & RTF_GATEWAY) {
2619 		const struct in6_addr *gw_addr;
2620 		int gwa_type;
2621 
2622 		gw_addr = &cfg->fc_gateway;
2623 		gwa_type = ipv6_addr_type(gw_addr);
2624 
2625 		/* if gw_addr is local we will fail to detect this in case
2626 		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2627 		 * will return already-added prefix route via interface that
2628 		 * prefix route was assigned to, which might be non-loopback.
2629 		 */
2630 		err = -EINVAL;
2631 		if (ipv6_chk_addr_and_flags(net, gw_addr,
2632 					    gwa_type & IPV6_ADDR_LINKLOCAL ?
2633 					    dev : NULL, 0, 0)) {
2634 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2635 			goto out;
2636 		}
2637 		rt->rt6i_gateway = *gw_addr;
2638 
2639 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2640 			struct rt6_info *grt = NULL;
2641 
2642 			/* IPv6 strictly inhibits using not link-local
2643 			   addresses as nexthop address.
2644 			   Otherwise, router will not able to send redirects.
2645 			   It is very good, but in some (rare!) circumstances
2646 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
2647 			   some exceptions. --ANK
2648 			   We allow IPv4-mapped nexthops to support RFC4798-type
2649 			   addressing
2650 			 */
2651 			if (!(gwa_type & (IPV6_ADDR_UNICAST |
2652 					  IPV6_ADDR_MAPPED))) {
2653 				NL_SET_ERR_MSG(extack,
2654 					       "Invalid gateway address");
2655 				goto out;
2656 			}
2657 
2658 			if (cfg->fc_table) {
2659 				grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2660 
2661 				if (grt) {
2662 					if (grt->rt6i_flags & RTF_GATEWAY ||
2663 					    (dev && dev != grt->dst.dev)) {
2664 						ip6_rt_put(grt);
2665 						grt = NULL;
2666 					}
2667 				}
2668 			}
2669 
2670 			if (!grt)
2671 				grt = rt6_lookup(net, gw_addr, NULL,
2672 						 cfg->fc_ifindex, 1);
2673 
2674 			err = -EHOSTUNREACH;
2675 			if (!grt)
2676 				goto out;
2677 			if (dev) {
2678 				if (dev != grt->dst.dev) {
2679 					ip6_rt_put(grt);
2680 					goto out;
2681 				}
2682 			} else {
2683 				dev = grt->dst.dev;
2684 				idev = grt->rt6i_idev;
2685 				dev_hold(dev);
2686 				in6_dev_hold(grt->rt6i_idev);
2687 			}
2688 			if (!(grt->rt6i_flags & RTF_GATEWAY))
2689 				err = 0;
2690 			ip6_rt_put(grt);
2691 
2692 			if (err)
2693 				goto out;
2694 		}
2695 		err = -EINVAL;
2696 		if (!dev) {
2697 			NL_SET_ERR_MSG(extack, "Egress device not specified");
2698 			goto out;
2699 		} else if (dev->flags & IFF_LOOPBACK) {
2700 			NL_SET_ERR_MSG(extack,
2701 				       "Egress device can not be loopback device for this route");
2702 			goto out;
2703 		}
2704 	}
2705 
2706 	err = -ENODEV;
2707 	if (!dev)
2708 		goto out;
2709 
2710 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2711 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2712 			NL_SET_ERR_MSG(extack, "Invalid source address");
2713 			err = -EINVAL;
2714 			goto out;
2715 		}
2716 		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2717 		rt->rt6i_prefsrc.plen = 128;
2718 	} else
2719 		rt->rt6i_prefsrc.plen = 0;
2720 
2721 	rt->rt6i_flags = cfg->fc_flags;
2722 
2723 install_route:
2724 	rt->dst.dev = dev;
2725 	rt->rt6i_idev = idev;
2726 	rt->rt6i_table = table;
2727 
2728 	cfg->fc_nlinfo.nl_net = dev_net(dev);
2729 
2730 	return rt;
2731 out:
2732 	if (dev)
2733 		dev_put(dev);
2734 	if (idev)
2735 		in6_dev_put(idev);
2736 	if (rt)
2737 		dst_release_immediate(&rt->dst);
2738 
2739 	return ERR_PTR(err);
2740 }
2741 
2742 int ip6_route_add(struct fib6_config *cfg,
2743 		  struct netlink_ext_ack *extack)
2744 {
2745 	struct mx6_config mxc = { .mx = NULL, };
2746 	struct rt6_info *rt;
2747 	int err;
2748 
2749 	rt = ip6_route_info_create(cfg, extack);
2750 	if (IS_ERR(rt)) {
2751 		err = PTR_ERR(rt);
2752 		rt = NULL;
2753 		goto out;
2754 	}
2755 
2756 	err = ip6_convert_metrics(&mxc, cfg);
2757 	if (err)
2758 		goto out;
2759 
2760 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2761 
2762 	kfree(mxc.mx);
2763 
2764 	return err;
2765 out:
2766 	if (rt)
2767 		dst_release_immediate(&rt->dst);
2768 
2769 	return err;
2770 }
2771 
2772 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2773 {
2774 	int err;
2775 	struct fib6_table *table;
2776 	struct net *net = dev_net(rt->dst.dev);
2777 
2778 	if (rt == net->ipv6.ip6_null_entry) {
2779 		err = -ENOENT;
2780 		goto out;
2781 	}
2782 
2783 	table = rt->rt6i_table;
2784 	spin_lock_bh(&table->tb6_lock);
2785 	err = fib6_del(rt, info);
2786 	spin_unlock_bh(&table->tb6_lock);
2787 
2788 out:
2789 	ip6_rt_put(rt);
2790 	return err;
2791 }
2792 
2793 int ip6_del_rt(struct rt6_info *rt)
2794 {
2795 	struct nl_info info = {
2796 		.nl_net = dev_net(rt->dst.dev),
2797 	};
2798 	return __ip6_del_rt(rt, &info);
2799 }
2800 
2801 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2802 {
2803 	struct nl_info *info = &cfg->fc_nlinfo;
2804 	struct net *net = info->nl_net;
2805 	struct sk_buff *skb = NULL;
2806 	struct fib6_table *table;
2807 	int err = -ENOENT;
2808 
2809 	if (rt == net->ipv6.ip6_null_entry)
2810 		goto out_put;
2811 	table = rt->rt6i_table;
2812 	spin_lock_bh(&table->tb6_lock);
2813 
2814 	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2815 		struct rt6_info *sibling, *next_sibling;
2816 
2817 		/* prefer to send a single notification with all hops */
2818 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2819 		if (skb) {
2820 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2821 
2822 			if (rt6_fill_node(net, skb, rt,
2823 					  NULL, NULL, 0, RTM_DELROUTE,
2824 					  info->portid, seq, 0) < 0) {
2825 				kfree_skb(skb);
2826 				skb = NULL;
2827 			} else
2828 				info->skip_notify = 1;
2829 		}
2830 
2831 		list_for_each_entry_safe(sibling, next_sibling,
2832 					 &rt->rt6i_siblings,
2833 					 rt6i_siblings) {
2834 			err = fib6_del(sibling, info);
2835 			if (err)
2836 				goto out_unlock;
2837 		}
2838 	}
2839 
2840 	err = fib6_del(rt, info);
2841 out_unlock:
2842 	spin_unlock_bh(&table->tb6_lock);
2843 out_put:
2844 	ip6_rt_put(rt);
2845 
2846 	if (skb) {
2847 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2848 			    info->nlh, gfp_any());
2849 	}
2850 	return err;
2851 }
2852 
2853 static int ip6_route_del(struct fib6_config *cfg,
2854 			 struct netlink_ext_ack *extack)
2855 {
2856 	struct rt6_info *rt, *rt_cache;
2857 	struct fib6_table *table;
2858 	struct fib6_node *fn;
2859 	int err = -ESRCH;
2860 
2861 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2862 	if (!table) {
2863 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
2864 		return err;
2865 	}
2866 
2867 	rcu_read_lock();
2868 
2869 	fn = fib6_locate(&table->tb6_root,
2870 			 &cfg->fc_dst, cfg->fc_dst_len,
2871 			 &cfg->fc_src, cfg->fc_src_len,
2872 			 !(cfg->fc_flags & RTF_CACHE));
2873 
2874 	if (fn) {
2875 		for_each_fib6_node_rt_rcu(fn) {
2876 			if (cfg->fc_flags & RTF_CACHE) {
2877 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
2878 							      &cfg->fc_src);
2879 				if (!rt_cache)
2880 					continue;
2881 				rt = rt_cache;
2882 			}
2883 			if (cfg->fc_ifindex &&
2884 			    (!rt->dst.dev ||
2885 			     rt->dst.dev->ifindex != cfg->fc_ifindex))
2886 				continue;
2887 			if (cfg->fc_flags & RTF_GATEWAY &&
2888 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2889 				continue;
2890 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2891 				continue;
2892 			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2893 				continue;
2894 			if (!dst_hold_safe(&rt->dst))
2895 				break;
2896 			rcu_read_unlock();
2897 
2898 			/* if gateway was specified only delete the one hop */
2899 			if (cfg->fc_flags & RTF_GATEWAY)
2900 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2901 
2902 			return __ip6_del_rt_siblings(rt, cfg);
2903 		}
2904 	}
2905 	rcu_read_unlock();
2906 
2907 	return err;
2908 }
2909 
2910 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2911 {
2912 	struct netevent_redirect netevent;
2913 	struct rt6_info *rt, *nrt = NULL;
2914 	struct ndisc_options ndopts;
2915 	struct inet6_dev *in6_dev;
2916 	struct neighbour *neigh;
2917 	struct rd_msg *msg;
2918 	int optlen, on_link;
2919 	u8 *lladdr;
2920 
2921 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2922 	optlen -= sizeof(*msg);
2923 
2924 	if (optlen < 0) {
2925 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2926 		return;
2927 	}
2928 
2929 	msg = (struct rd_msg *)icmp6_hdr(skb);
2930 
2931 	if (ipv6_addr_is_multicast(&msg->dest)) {
2932 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2933 		return;
2934 	}
2935 
2936 	on_link = 0;
2937 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2938 		on_link = 1;
2939 	} else if (ipv6_addr_type(&msg->target) !=
2940 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2941 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2942 		return;
2943 	}
2944 
2945 	in6_dev = __in6_dev_get(skb->dev);
2946 	if (!in6_dev)
2947 		return;
2948 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2949 		return;
2950 
2951 	/* RFC2461 8.1:
2952 	 *	The IP source address of the Redirect MUST be the same as the current
2953 	 *	first-hop router for the specified ICMP Destination Address.
2954 	 */
2955 
2956 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2957 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2958 		return;
2959 	}
2960 
2961 	lladdr = NULL;
2962 	if (ndopts.nd_opts_tgt_lladdr) {
2963 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2964 					     skb->dev);
2965 		if (!lladdr) {
2966 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2967 			return;
2968 		}
2969 	}
2970 
2971 	rt = (struct rt6_info *) dst;
2972 	if (rt->rt6i_flags & RTF_REJECT) {
2973 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2974 		return;
2975 	}
2976 
2977 	/* Redirect received -> path was valid.
2978 	 * Look, redirects are sent only in response to data packets,
2979 	 * so that this nexthop apparently is reachable. --ANK
2980 	 */
2981 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2982 
2983 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2984 	if (!neigh)
2985 		return;
2986 
2987 	/*
2988 	 *	We have finally decided to accept it.
2989 	 */
2990 
2991 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2992 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
2993 		     NEIGH_UPDATE_F_OVERRIDE|
2994 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2995 				     NEIGH_UPDATE_F_ISROUTER)),
2996 		     NDISC_REDIRECT, &ndopts);
2997 
2998 	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2999 	if (!nrt)
3000 		goto out;
3001 
3002 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3003 	if (on_link)
3004 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3005 
3006 	nrt->rt6i_protocol = RTPROT_REDIRECT;
3007 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3008 
3009 	/* No need to remove rt from the exception table if rt is
3010 	 * a cached route because rt6_insert_exception() will
3011 	 * takes care of it
3012 	 */
3013 	if (rt6_insert_exception(nrt, rt)) {
3014 		dst_release_immediate(&nrt->dst);
3015 		goto out;
3016 	}
3017 
3018 	netevent.old = &rt->dst;
3019 	netevent.new = &nrt->dst;
3020 	netevent.daddr = &msg->dest;
3021 	netevent.neigh = neigh;
3022 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3023 
3024 out:
3025 	neigh_release(neigh);
3026 }
3027 
3028 /*
3029  *	Misc support functions
3030  */
3031 
3032 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3033 {
3034 	BUG_ON(from->dst.from);
3035 
3036 	rt->rt6i_flags &= ~RTF_EXPIRES;
3037 	dst_hold(&from->dst);
3038 	rt->dst.from = &from->dst;
3039 	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3040 }
3041 
3042 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3043 {
3044 	rt->dst.input = ort->dst.input;
3045 	rt->dst.output = ort->dst.output;
3046 	rt->rt6i_dst = ort->rt6i_dst;
3047 	rt->dst.error = ort->dst.error;
3048 	rt->rt6i_idev = ort->rt6i_idev;
3049 	if (rt->rt6i_idev)
3050 		in6_dev_hold(rt->rt6i_idev);
3051 	rt->dst.lastuse = jiffies;
3052 	rt->rt6i_gateway = ort->rt6i_gateway;
3053 	rt->rt6i_flags = ort->rt6i_flags;
3054 	rt6_set_from(rt, ort);
3055 	rt->rt6i_metric = ort->rt6i_metric;
3056 #ifdef CONFIG_IPV6_SUBTREES
3057 	rt->rt6i_src = ort->rt6i_src;
3058 #endif
3059 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3060 	rt->rt6i_table = ort->rt6i_table;
3061 	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3062 }
3063 
3064 #ifdef CONFIG_IPV6_ROUTE_INFO
3065 static struct rt6_info *rt6_get_route_info(struct net *net,
3066 					   const struct in6_addr *prefix, int prefixlen,
3067 					   const struct in6_addr *gwaddr,
3068 					   struct net_device *dev)
3069 {
3070 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3071 	int ifindex = dev->ifindex;
3072 	struct fib6_node *fn;
3073 	struct rt6_info *rt = NULL;
3074 	struct fib6_table *table;
3075 
3076 	table = fib6_get_table(net, tb_id);
3077 	if (!table)
3078 		return NULL;
3079 
3080 	rcu_read_lock();
3081 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3082 	if (!fn)
3083 		goto out;
3084 
3085 	for_each_fib6_node_rt_rcu(fn) {
3086 		if (rt->dst.dev->ifindex != ifindex)
3087 			continue;
3088 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3089 			continue;
3090 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3091 			continue;
3092 		ip6_hold_safe(NULL, &rt, false);
3093 		break;
3094 	}
3095 out:
3096 	rcu_read_unlock();
3097 	return rt;
3098 }
3099 
3100 static struct rt6_info *rt6_add_route_info(struct net *net,
3101 					   const struct in6_addr *prefix, int prefixlen,
3102 					   const struct in6_addr *gwaddr,
3103 					   struct net_device *dev,
3104 					   unsigned int pref)
3105 {
3106 	struct fib6_config cfg = {
3107 		.fc_metric	= IP6_RT_PRIO_USER,
3108 		.fc_ifindex	= dev->ifindex,
3109 		.fc_dst_len	= prefixlen,
3110 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3111 				  RTF_UP | RTF_PREF(pref),
3112 		.fc_protocol = RTPROT_RA,
3113 		.fc_nlinfo.portid = 0,
3114 		.fc_nlinfo.nlh = NULL,
3115 		.fc_nlinfo.nl_net = net,
3116 	};
3117 
3118 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3119 	cfg.fc_dst = *prefix;
3120 	cfg.fc_gateway = *gwaddr;
3121 
3122 	/* We should treat it as a default route if prefix length is 0. */
3123 	if (!prefixlen)
3124 		cfg.fc_flags |= RTF_DEFAULT;
3125 
3126 	ip6_route_add(&cfg, NULL);
3127 
3128 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3129 }
3130 #endif
3131 
3132 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3133 {
3134 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3135 	struct rt6_info *rt;
3136 	struct fib6_table *table;
3137 
3138 	table = fib6_get_table(dev_net(dev), tb_id);
3139 	if (!table)
3140 		return NULL;
3141 
3142 	rcu_read_lock();
3143 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3144 		if (dev == rt->dst.dev &&
3145 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3146 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
3147 			break;
3148 	}
3149 	if (rt)
3150 		ip6_hold_safe(NULL, &rt, false);
3151 	rcu_read_unlock();
3152 	return rt;
3153 }
3154 
3155 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3156 				     struct net_device *dev,
3157 				     unsigned int pref)
3158 {
3159 	struct fib6_config cfg = {
3160 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3161 		.fc_metric	= IP6_RT_PRIO_USER,
3162 		.fc_ifindex	= dev->ifindex,
3163 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3164 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3165 		.fc_protocol = RTPROT_RA,
3166 		.fc_nlinfo.portid = 0,
3167 		.fc_nlinfo.nlh = NULL,
3168 		.fc_nlinfo.nl_net = dev_net(dev),
3169 	};
3170 
3171 	cfg.fc_gateway = *gwaddr;
3172 
3173 	if (!ip6_route_add(&cfg, NULL)) {
3174 		struct fib6_table *table;
3175 
3176 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3177 		if (table)
3178 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3179 	}
3180 
3181 	return rt6_get_dflt_router(gwaddr, dev);
3182 }
3183 
3184 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3185 {
3186 	struct rt6_info *rt;
3187 
3188 restart:
3189 	rcu_read_lock();
3190 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3191 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3192 		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3193 			if (dst_hold_safe(&rt->dst)) {
3194 				rcu_read_unlock();
3195 				ip6_del_rt(rt);
3196 			} else {
3197 				rcu_read_unlock();
3198 			}
3199 			goto restart;
3200 		}
3201 	}
3202 	rcu_read_unlock();
3203 
3204 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3205 }
3206 
3207 void rt6_purge_dflt_routers(struct net *net)
3208 {
3209 	struct fib6_table *table;
3210 	struct hlist_head *head;
3211 	unsigned int h;
3212 
3213 	rcu_read_lock();
3214 
3215 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3216 		head = &net->ipv6.fib_table_hash[h];
3217 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3218 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3219 				__rt6_purge_dflt_routers(table);
3220 		}
3221 	}
3222 
3223 	rcu_read_unlock();
3224 }
3225 
3226 static void rtmsg_to_fib6_config(struct net *net,
3227 				 struct in6_rtmsg *rtmsg,
3228 				 struct fib6_config *cfg)
3229 {
3230 	memset(cfg, 0, sizeof(*cfg));
3231 
3232 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3233 			 : RT6_TABLE_MAIN;
3234 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3235 	cfg->fc_metric = rtmsg->rtmsg_metric;
3236 	cfg->fc_expires = rtmsg->rtmsg_info;
3237 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3238 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
3239 	cfg->fc_flags = rtmsg->rtmsg_flags;
3240 
3241 	cfg->fc_nlinfo.nl_net = net;
3242 
3243 	cfg->fc_dst = rtmsg->rtmsg_dst;
3244 	cfg->fc_src = rtmsg->rtmsg_src;
3245 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
3246 }
3247 
3248 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3249 {
3250 	struct fib6_config cfg;
3251 	struct in6_rtmsg rtmsg;
3252 	int err;
3253 
3254 	switch (cmd) {
3255 	case SIOCADDRT:		/* Add a route */
3256 	case SIOCDELRT:		/* Delete a route */
3257 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3258 			return -EPERM;
3259 		err = copy_from_user(&rtmsg, arg,
3260 				     sizeof(struct in6_rtmsg));
3261 		if (err)
3262 			return -EFAULT;
3263 
3264 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3265 
3266 		rtnl_lock();
3267 		switch (cmd) {
3268 		case SIOCADDRT:
3269 			err = ip6_route_add(&cfg, NULL);
3270 			break;
3271 		case SIOCDELRT:
3272 			err = ip6_route_del(&cfg, NULL);
3273 			break;
3274 		default:
3275 			err = -EINVAL;
3276 		}
3277 		rtnl_unlock();
3278 
3279 		return err;
3280 	}
3281 
3282 	return -EINVAL;
3283 }
3284 
3285 /*
3286  *	Drop the packet on the floor
3287  */
3288 
3289 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3290 {
3291 	int type;
3292 	struct dst_entry *dst = skb_dst(skb);
3293 	switch (ipstats_mib_noroutes) {
3294 	case IPSTATS_MIB_INNOROUTES:
3295 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3296 		if (type == IPV6_ADDR_ANY) {
3297 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3298 				      IPSTATS_MIB_INADDRERRORS);
3299 			break;
3300 		}
3301 		/* FALLTHROUGH */
3302 	case IPSTATS_MIB_OUTNOROUTES:
3303 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3304 			      ipstats_mib_noroutes);
3305 		break;
3306 	}
3307 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3308 	kfree_skb(skb);
3309 	return 0;
3310 }
3311 
3312 static int ip6_pkt_discard(struct sk_buff *skb)
3313 {
3314 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3315 }
3316 
3317 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3318 {
3319 	skb->dev = skb_dst(skb)->dev;
3320 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3321 }
3322 
3323 static int ip6_pkt_prohibit(struct sk_buff *skb)
3324 {
3325 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3326 }
3327 
3328 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3329 {
3330 	skb->dev = skb_dst(skb)->dev;
3331 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3332 }
3333 
3334 /*
3335  *	Allocate a dst for local (unicast / anycast) address.
3336  */
3337 
3338 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3339 				    const struct in6_addr *addr,
3340 				    bool anycast)
3341 {
3342 	u32 tb_id;
3343 	struct net *net = dev_net(idev->dev);
3344 	struct net_device *dev = idev->dev;
3345 	struct rt6_info *rt;
3346 
3347 	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3348 	if (!rt)
3349 		return ERR_PTR(-ENOMEM);
3350 
3351 	in6_dev_hold(idev);
3352 
3353 	rt->dst.flags |= DST_HOST;
3354 	rt->dst.input = ip6_input;
3355 	rt->dst.output = ip6_output;
3356 	rt->rt6i_idev = idev;
3357 
3358 	rt->rt6i_protocol = RTPROT_KERNEL;
3359 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3360 	if (anycast)
3361 		rt->rt6i_flags |= RTF_ANYCAST;
3362 	else
3363 		rt->rt6i_flags |= RTF_LOCAL;
3364 
3365 	rt->rt6i_gateway  = *addr;
3366 	rt->rt6i_dst.addr = *addr;
3367 	rt->rt6i_dst.plen = 128;
3368 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3369 	rt->rt6i_table = fib6_get_table(net, tb_id);
3370 
3371 	return rt;
3372 }
3373 
3374 /* remove deleted ip from prefsrc entries */
3375 struct arg_dev_net_ip {
3376 	struct net_device *dev;
3377 	struct net *net;
3378 	struct in6_addr *addr;
3379 };
3380 
3381 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3382 {
3383 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3384 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3385 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3386 
3387 	if (((void *)rt->dst.dev == dev || !dev) &&
3388 	    rt != net->ipv6.ip6_null_entry &&
3389 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3390 		spin_lock_bh(&rt6_exception_lock);
3391 		/* remove prefsrc entry */
3392 		rt->rt6i_prefsrc.plen = 0;
3393 		/* need to update cache as well */
3394 		rt6_exceptions_remove_prefsrc(rt);
3395 		spin_unlock_bh(&rt6_exception_lock);
3396 	}
3397 	return 0;
3398 }
3399 
3400 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3401 {
3402 	struct net *net = dev_net(ifp->idev->dev);
3403 	struct arg_dev_net_ip adni = {
3404 		.dev = ifp->idev->dev,
3405 		.net = net,
3406 		.addr = &ifp->addr,
3407 	};
3408 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3409 }
3410 
3411 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3412 
3413 /* Remove routers and update dst entries when gateway turn into host. */
3414 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3415 {
3416 	struct in6_addr *gateway = (struct in6_addr *)arg;
3417 
3418 	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3419 	    ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3420 		return -1;
3421 	}
3422 
3423 	/* Further clean up cached routes in exception table.
3424 	 * This is needed because cached route may have a different
3425 	 * gateway than its 'parent' in the case of an ip redirect.
3426 	 */
3427 	rt6_exceptions_clean_tohost(rt, gateway);
3428 
3429 	return 0;
3430 }
3431 
3432 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3433 {
3434 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3435 }
3436 
3437 struct arg_dev_net {
3438 	struct net_device *dev;
3439 	struct net *net;
3440 };
3441 
3442 /* called with write lock held for table with rt */
3443 static int fib6_ifdown(struct rt6_info *rt, void *arg)
3444 {
3445 	const struct arg_dev_net *adn = arg;
3446 	const struct net_device *dev = adn->dev;
3447 
3448 	if ((rt->dst.dev == dev || !dev) &&
3449 	    rt != adn->net->ipv6.ip6_null_entry &&
3450 	    (rt->rt6i_nsiblings == 0 ||
3451 	     (dev && netdev_unregistering(dev)) ||
3452 	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3453 		return -1;
3454 
3455 	return 0;
3456 }
3457 
3458 void rt6_ifdown(struct net *net, struct net_device *dev)
3459 {
3460 	struct arg_dev_net adn = {
3461 		.dev = dev,
3462 		.net = net,
3463 	};
3464 
3465 	fib6_clean_all(net, fib6_ifdown, &adn);
3466 	if (dev)
3467 		rt6_uncached_list_flush_dev(net, dev);
3468 }
3469 
3470 struct rt6_mtu_change_arg {
3471 	struct net_device *dev;
3472 	unsigned int mtu;
3473 };
3474 
3475 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3476 {
3477 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3478 	struct inet6_dev *idev;
3479 
3480 	/* In IPv6 pmtu discovery is not optional,
3481 	   so that RTAX_MTU lock cannot disable it.
3482 	   We still use this lock to block changes
3483 	   caused by addrconf/ndisc.
3484 	*/
3485 
3486 	idev = __in6_dev_get(arg->dev);
3487 	if (!idev)
3488 		return 0;
3489 
3490 	/* For administrative MTU increase, there is no way to discover
3491 	   IPv6 PMTU increase, so PMTU increase should be updated here.
3492 	   Since RFC 1981 doesn't include administrative MTU increase
3493 	   update PMTU increase is a MUST. (i.e. jumbo frame)
3494 	 */
3495 	/*
3496 	   If new MTU is less than route PMTU, this new MTU will be the
3497 	   lowest MTU in the path, update the route PMTU to reflect PMTU
3498 	   decreases; if new MTU is greater than route PMTU, and the
3499 	   old MTU is the lowest MTU in the path, update the route PMTU
3500 	   to reflect the increase. In this case if the other nodes' MTU
3501 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
3502 	   PMTU discovery.
3503 	 */
3504 	if (rt->dst.dev == arg->dev &&
3505 	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
3506 	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3507 		spin_lock_bh(&rt6_exception_lock);
3508 		if (dst_mtu(&rt->dst) >= arg->mtu ||
3509 		    (dst_mtu(&rt->dst) < arg->mtu &&
3510 		     dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3511 			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3512 		}
3513 		rt6_exceptions_update_pmtu(rt, arg->mtu);
3514 		spin_unlock_bh(&rt6_exception_lock);
3515 	}
3516 	return 0;
3517 }
3518 
3519 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3520 {
3521 	struct rt6_mtu_change_arg arg = {
3522 		.dev = dev,
3523 		.mtu = mtu,
3524 	};
3525 
3526 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3527 }
3528 
3529 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3530 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3531 	[RTA_OIF]               = { .type = NLA_U32 },
3532 	[RTA_IIF]		= { .type = NLA_U32 },
3533 	[RTA_PRIORITY]          = { .type = NLA_U32 },
3534 	[RTA_METRICS]           = { .type = NLA_NESTED },
3535 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
3536 	[RTA_PREF]              = { .type = NLA_U8 },
3537 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
3538 	[RTA_ENCAP]		= { .type = NLA_NESTED },
3539 	[RTA_EXPIRES]		= { .type = NLA_U32 },
3540 	[RTA_UID]		= { .type = NLA_U32 },
3541 	[RTA_MARK]		= { .type = NLA_U32 },
3542 };
3543 
3544 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3545 			      struct fib6_config *cfg,
3546 			      struct netlink_ext_ack *extack)
3547 {
3548 	struct rtmsg *rtm;
3549 	struct nlattr *tb[RTA_MAX+1];
3550 	unsigned int pref;
3551 	int err;
3552 
3553 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3554 			  NULL);
3555 	if (err < 0)
3556 		goto errout;
3557 
3558 	err = -EINVAL;
3559 	rtm = nlmsg_data(nlh);
3560 	memset(cfg, 0, sizeof(*cfg));
3561 
3562 	cfg->fc_table = rtm->rtm_table;
3563 	cfg->fc_dst_len = rtm->rtm_dst_len;
3564 	cfg->fc_src_len = rtm->rtm_src_len;
3565 	cfg->fc_flags = RTF_UP;
3566 	cfg->fc_protocol = rtm->rtm_protocol;
3567 	cfg->fc_type = rtm->rtm_type;
3568 
3569 	if (rtm->rtm_type == RTN_UNREACHABLE ||
3570 	    rtm->rtm_type == RTN_BLACKHOLE ||
3571 	    rtm->rtm_type == RTN_PROHIBIT ||
3572 	    rtm->rtm_type == RTN_THROW)
3573 		cfg->fc_flags |= RTF_REJECT;
3574 
3575 	if (rtm->rtm_type == RTN_LOCAL)
3576 		cfg->fc_flags |= RTF_LOCAL;
3577 
3578 	if (rtm->rtm_flags & RTM_F_CLONED)
3579 		cfg->fc_flags |= RTF_CACHE;
3580 
3581 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3582 	cfg->fc_nlinfo.nlh = nlh;
3583 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3584 
3585 	if (tb[RTA_GATEWAY]) {
3586 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3587 		cfg->fc_flags |= RTF_GATEWAY;
3588 	}
3589 
3590 	if (tb[RTA_DST]) {
3591 		int plen = (rtm->rtm_dst_len + 7) >> 3;
3592 
3593 		if (nla_len(tb[RTA_DST]) < plen)
3594 			goto errout;
3595 
3596 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3597 	}
3598 
3599 	if (tb[RTA_SRC]) {
3600 		int plen = (rtm->rtm_src_len + 7) >> 3;
3601 
3602 		if (nla_len(tb[RTA_SRC]) < plen)
3603 			goto errout;
3604 
3605 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3606 	}
3607 
3608 	if (tb[RTA_PREFSRC])
3609 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3610 
3611 	if (tb[RTA_OIF])
3612 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3613 
3614 	if (tb[RTA_PRIORITY])
3615 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3616 
3617 	if (tb[RTA_METRICS]) {
3618 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3619 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3620 	}
3621 
3622 	if (tb[RTA_TABLE])
3623 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3624 
3625 	if (tb[RTA_MULTIPATH]) {
3626 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3627 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3628 
3629 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3630 						     cfg->fc_mp_len, extack);
3631 		if (err < 0)
3632 			goto errout;
3633 	}
3634 
3635 	if (tb[RTA_PREF]) {
3636 		pref = nla_get_u8(tb[RTA_PREF]);
3637 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
3638 		    pref != ICMPV6_ROUTER_PREF_HIGH)
3639 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
3640 		cfg->fc_flags |= RTF_PREF(pref);
3641 	}
3642 
3643 	if (tb[RTA_ENCAP])
3644 		cfg->fc_encap = tb[RTA_ENCAP];
3645 
3646 	if (tb[RTA_ENCAP_TYPE]) {
3647 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3648 
3649 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3650 		if (err < 0)
3651 			goto errout;
3652 	}
3653 
3654 	if (tb[RTA_EXPIRES]) {
3655 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3656 
3657 		if (addrconf_finite_timeout(timeout)) {
3658 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3659 			cfg->fc_flags |= RTF_EXPIRES;
3660 		}
3661 	}
3662 
3663 	err = 0;
3664 errout:
3665 	return err;
3666 }
3667 
3668 struct rt6_nh {
3669 	struct rt6_info *rt6_info;
3670 	struct fib6_config r_cfg;
3671 	struct mx6_config mxc;
3672 	struct list_head next;
3673 };
3674 
3675 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3676 {
3677 	struct rt6_nh *nh;
3678 
3679 	list_for_each_entry(nh, rt6_nh_list, next) {
3680 		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3681 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3682 		        nh->r_cfg.fc_ifindex);
3683 	}
3684 }
3685 
3686 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3687 				 struct rt6_info *rt, struct fib6_config *r_cfg)
3688 {
3689 	struct rt6_nh *nh;
3690 	int err = -EEXIST;
3691 
3692 	list_for_each_entry(nh, rt6_nh_list, next) {
3693 		/* check if rt6_info already exists */
3694 		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3695 			return err;
3696 	}
3697 
3698 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3699 	if (!nh)
3700 		return -ENOMEM;
3701 	nh->rt6_info = rt;
3702 	err = ip6_convert_metrics(&nh->mxc, r_cfg);
3703 	if (err) {
3704 		kfree(nh);
3705 		return err;
3706 	}
3707 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3708 	list_add_tail(&nh->next, rt6_nh_list);
3709 
3710 	return 0;
3711 }
3712 
3713 static void ip6_route_mpath_notify(struct rt6_info *rt,
3714 				   struct rt6_info *rt_last,
3715 				   struct nl_info *info,
3716 				   __u16 nlflags)
3717 {
3718 	/* if this is an APPEND route, then rt points to the first route
3719 	 * inserted and rt_last points to last route inserted. Userspace
3720 	 * wants a consistent dump of the route which starts at the first
3721 	 * nexthop. Since sibling routes are always added at the end of
3722 	 * the list, find the first sibling of the last route appended
3723 	 */
3724 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3725 		rt = list_first_entry(&rt_last->rt6i_siblings,
3726 				      struct rt6_info,
3727 				      rt6i_siblings);
3728 	}
3729 
3730 	if (rt)
3731 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3732 }
3733 
3734 static int ip6_route_multipath_add(struct fib6_config *cfg,
3735 				   struct netlink_ext_ack *extack)
3736 {
3737 	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3738 	struct nl_info *info = &cfg->fc_nlinfo;
3739 	struct fib6_config r_cfg;
3740 	struct rtnexthop *rtnh;
3741 	struct rt6_info *rt;
3742 	struct rt6_nh *err_nh;
3743 	struct rt6_nh *nh, *nh_safe;
3744 	__u16 nlflags;
3745 	int remaining;
3746 	int attrlen;
3747 	int err = 1;
3748 	int nhn = 0;
3749 	int replace = (cfg->fc_nlinfo.nlh &&
3750 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3751 	LIST_HEAD(rt6_nh_list);
3752 
3753 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3754 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3755 		nlflags |= NLM_F_APPEND;
3756 
3757 	remaining = cfg->fc_mp_len;
3758 	rtnh = (struct rtnexthop *)cfg->fc_mp;
3759 
3760 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
3761 	 * rt6_info structs per nexthop
3762 	 */
3763 	while (rtnh_ok(rtnh, remaining)) {
3764 		memcpy(&r_cfg, cfg, sizeof(*cfg));
3765 		if (rtnh->rtnh_ifindex)
3766 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3767 
3768 		attrlen = rtnh_attrlen(rtnh);
3769 		if (attrlen > 0) {
3770 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3771 
3772 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3773 			if (nla) {
3774 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
3775 				r_cfg.fc_flags |= RTF_GATEWAY;
3776 			}
3777 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3778 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3779 			if (nla)
3780 				r_cfg.fc_encap_type = nla_get_u16(nla);
3781 		}
3782 
3783 		rt = ip6_route_info_create(&r_cfg, extack);
3784 		if (IS_ERR(rt)) {
3785 			err = PTR_ERR(rt);
3786 			rt = NULL;
3787 			goto cleanup;
3788 		}
3789 
3790 		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3791 		if (err) {
3792 			dst_release_immediate(&rt->dst);
3793 			goto cleanup;
3794 		}
3795 
3796 		rtnh = rtnh_next(rtnh, &remaining);
3797 	}
3798 
3799 	/* for add and replace send one notification with all nexthops.
3800 	 * Skip the notification in fib6_add_rt2node and send one with
3801 	 * the full route when done
3802 	 */
3803 	info->skip_notify = 1;
3804 
3805 	err_nh = NULL;
3806 	list_for_each_entry(nh, &rt6_nh_list, next) {
3807 		rt_last = nh->rt6_info;
3808 		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3809 		/* save reference to first route for notification */
3810 		if (!rt_notif && !err)
3811 			rt_notif = nh->rt6_info;
3812 
3813 		/* nh->rt6_info is used or freed at this point, reset to NULL*/
3814 		nh->rt6_info = NULL;
3815 		if (err) {
3816 			if (replace && nhn)
3817 				ip6_print_replace_route_err(&rt6_nh_list);
3818 			err_nh = nh;
3819 			goto add_errout;
3820 		}
3821 
3822 		/* Because each route is added like a single route we remove
3823 		 * these flags after the first nexthop: if there is a collision,
3824 		 * we have already failed to add the first nexthop:
3825 		 * fib6_add_rt2node() has rejected it; when replacing, old
3826 		 * nexthops have been replaced by first new, the rest should
3827 		 * be added to it.
3828 		 */
3829 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3830 						     NLM_F_REPLACE);
3831 		nhn++;
3832 	}
3833 
3834 	/* success ... tell user about new route */
3835 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3836 	goto cleanup;
3837 
3838 add_errout:
3839 	/* send notification for routes that were added so that
3840 	 * the delete notifications sent by ip6_route_del are
3841 	 * coherent
3842 	 */
3843 	if (rt_notif)
3844 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3845 
3846 	/* Delete routes that were already added */
3847 	list_for_each_entry(nh, &rt6_nh_list, next) {
3848 		if (err_nh == nh)
3849 			break;
3850 		ip6_route_del(&nh->r_cfg, extack);
3851 	}
3852 
3853 cleanup:
3854 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3855 		if (nh->rt6_info)
3856 			dst_release_immediate(&nh->rt6_info->dst);
3857 		kfree(nh->mxc.mx);
3858 		list_del(&nh->next);
3859 		kfree(nh);
3860 	}
3861 
3862 	return err;
3863 }
3864 
3865 static int ip6_route_multipath_del(struct fib6_config *cfg,
3866 				   struct netlink_ext_ack *extack)
3867 {
3868 	struct fib6_config r_cfg;
3869 	struct rtnexthop *rtnh;
3870 	int remaining;
3871 	int attrlen;
3872 	int err = 1, last_err = 0;
3873 
3874 	remaining = cfg->fc_mp_len;
3875 	rtnh = (struct rtnexthop *)cfg->fc_mp;
3876 
3877 	/* Parse a Multipath Entry */
3878 	while (rtnh_ok(rtnh, remaining)) {
3879 		memcpy(&r_cfg, cfg, sizeof(*cfg));
3880 		if (rtnh->rtnh_ifindex)
3881 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3882 
3883 		attrlen = rtnh_attrlen(rtnh);
3884 		if (attrlen > 0) {
3885 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3886 
3887 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3888 			if (nla) {
3889 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3890 				r_cfg.fc_flags |= RTF_GATEWAY;
3891 			}
3892 		}
3893 		err = ip6_route_del(&r_cfg, extack);
3894 		if (err)
3895 			last_err = err;
3896 
3897 		rtnh = rtnh_next(rtnh, &remaining);
3898 	}
3899 
3900 	return last_err;
3901 }
3902 
3903 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3904 			      struct netlink_ext_ack *extack)
3905 {
3906 	struct fib6_config cfg;
3907 	int err;
3908 
3909 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3910 	if (err < 0)
3911 		return err;
3912 
3913 	if (cfg.fc_mp)
3914 		return ip6_route_multipath_del(&cfg, extack);
3915 	else {
3916 		cfg.fc_delete_all_nh = 1;
3917 		return ip6_route_del(&cfg, extack);
3918 	}
3919 }
3920 
3921 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3922 			      struct netlink_ext_ack *extack)
3923 {
3924 	struct fib6_config cfg;
3925 	int err;
3926 
3927 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3928 	if (err < 0)
3929 		return err;
3930 
3931 	if (cfg.fc_mp)
3932 		return ip6_route_multipath_add(&cfg, extack);
3933 	else
3934 		return ip6_route_add(&cfg, extack);
3935 }
3936 
3937 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3938 {
3939 	int nexthop_len = 0;
3940 
3941 	if (rt->rt6i_nsiblings) {
3942 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
3943 			    + NLA_ALIGN(sizeof(struct rtnexthop))
3944 			    + nla_total_size(16) /* RTA_GATEWAY */
3945 			    + lwtunnel_get_encap_size(rt->dst.lwtstate);
3946 
3947 		nexthop_len *= rt->rt6i_nsiblings;
3948 	}
3949 
3950 	return NLMSG_ALIGN(sizeof(struct rtmsg))
3951 	       + nla_total_size(16) /* RTA_SRC */
3952 	       + nla_total_size(16) /* RTA_DST */
3953 	       + nla_total_size(16) /* RTA_GATEWAY */
3954 	       + nla_total_size(16) /* RTA_PREFSRC */
3955 	       + nla_total_size(4) /* RTA_TABLE */
3956 	       + nla_total_size(4) /* RTA_IIF */
3957 	       + nla_total_size(4) /* RTA_OIF */
3958 	       + nla_total_size(4) /* RTA_PRIORITY */
3959 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3960 	       + nla_total_size(sizeof(struct rta_cacheinfo))
3961 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3962 	       + nla_total_size(1) /* RTA_PREF */
3963 	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
3964 	       + nexthop_len;
3965 }
3966 
3967 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3968 			    unsigned int *flags, bool skip_oif)
3969 {
3970 	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3971 		*flags |= RTNH_F_LINKDOWN;
3972 		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3973 			*flags |= RTNH_F_DEAD;
3974 	}
3975 
3976 	if (rt->rt6i_flags & RTF_GATEWAY) {
3977 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3978 			goto nla_put_failure;
3979 	}
3980 
3981 	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
3982 		*flags |= RTNH_F_OFFLOAD;
3983 
3984 	/* not needed for multipath encoding b/c it has a rtnexthop struct */
3985 	if (!skip_oif && rt->dst.dev &&
3986 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3987 		goto nla_put_failure;
3988 
3989 	if (rt->dst.lwtstate &&
3990 	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3991 		goto nla_put_failure;
3992 
3993 	return 0;
3994 
3995 nla_put_failure:
3996 	return -EMSGSIZE;
3997 }
3998 
3999 /* add multipath next hop */
4000 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4001 {
4002 	struct rtnexthop *rtnh;
4003 	unsigned int flags = 0;
4004 
4005 	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4006 	if (!rtnh)
4007 		goto nla_put_failure;
4008 
4009 	rtnh->rtnh_hops = 0;
4010 	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4011 
4012 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4013 		goto nla_put_failure;
4014 
4015 	rtnh->rtnh_flags = flags;
4016 
4017 	/* length of rtnetlink header + attributes */
4018 	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4019 
4020 	return 0;
4021 
4022 nla_put_failure:
4023 	return -EMSGSIZE;
4024 }
4025 
4026 static int rt6_fill_node(struct net *net,
4027 			 struct sk_buff *skb, struct rt6_info *rt,
4028 			 struct in6_addr *dst, struct in6_addr *src,
4029 			 int iif, int type, u32 portid, u32 seq,
4030 			 unsigned int flags)
4031 {
4032 	u32 metrics[RTAX_MAX];
4033 	struct rtmsg *rtm;
4034 	struct nlmsghdr *nlh;
4035 	long expires;
4036 	u32 table;
4037 
4038 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4039 	if (!nlh)
4040 		return -EMSGSIZE;
4041 
4042 	rtm = nlmsg_data(nlh);
4043 	rtm->rtm_family = AF_INET6;
4044 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
4045 	rtm->rtm_src_len = rt->rt6i_src.plen;
4046 	rtm->rtm_tos = 0;
4047 	if (rt->rt6i_table)
4048 		table = rt->rt6i_table->tb6_id;
4049 	else
4050 		table = RT6_TABLE_UNSPEC;
4051 	rtm->rtm_table = table;
4052 	if (nla_put_u32(skb, RTA_TABLE, table))
4053 		goto nla_put_failure;
4054 	if (rt->rt6i_flags & RTF_REJECT) {
4055 		switch (rt->dst.error) {
4056 		case -EINVAL:
4057 			rtm->rtm_type = RTN_BLACKHOLE;
4058 			break;
4059 		case -EACCES:
4060 			rtm->rtm_type = RTN_PROHIBIT;
4061 			break;
4062 		case -EAGAIN:
4063 			rtm->rtm_type = RTN_THROW;
4064 			break;
4065 		default:
4066 			rtm->rtm_type = RTN_UNREACHABLE;
4067 			break;
4068 		}
4069 	}
4070 	else if (rt->rt6i_flags & RTF_LOCAL)
4071 		rtm->rtm_type = RTN_LOCAL;
4072 	else if (rt->rt6i_flags & RTF_ANYCAST)
4073 		rtm->rtm_type = RTN_ANYCAST;
4074 	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4075 		rtm->rtm_type = RTN_LOCAL;
4076 	else
4077 		rtm->rtm_type = RTN_UNICAST;
4078 	rtm->rtm_flags = 0;
4079 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4080 	rtm->rtm_protocol = rt->rt6i_protocol;
4081 
4082 	if (rt->rt6i_flags & RTF_CACHE)
4083 		rtm->rtm_flags |= RTM_F_CLONED;
4084 
4085 	if (dst) {
4086 		if (nla_put_in6_addr(skb, RTA_DST, dst))
4087 			goto nla_put_failure;
4088 		rtm->rtm_dst_len = 128;
4089 	} else if (rtm->rtm_dst_len)
4090 		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4091 			goto nla_put_failure;
4092 #ifdef CONFIG_IPV6_SUBTREES
4093 	if (src) {
4094 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4095 			goto nla_put_failure;
4096 		rtm->rtm_src_len = 128;
4097 	} else if (rtm->rtm_src_len &&
4098 		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4099 		goto nla_put_failure;
4100 #endif
4101 	if (iif) {
4102 #ifdef CONFIG_IPV6_MROUTE
4103 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4104 			int err = ip6mr_get_route(net, skb, rtm, portid);
4105 
4106 			if (err == 0)
4107 				return 0;
4108 			if (err < 0)
4109 				goto nla_put_failure;
4110 		} else
4111 #endif
4112 			if (nla_put_u32(skb, RTA_IIF, iif))
4113 				goto nla_put_failure;
4114 	} else if (dst) {
4115 		struct in6_addr saddr_buf;
4116 		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4117 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4118 			goto nla_put_failure;
4119 	}
4120 
4121 	if (rt->rt6i_prefsrc.plen) {
4122 		struct in6_addr saddr_buf;
4123 		saddr_buf = rt->rt6i_prefsrc.addr;
4124 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4125 			goto nla_put_failure;
4126 	}
4127 
4128 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4129 	if (rt->rt6i_pmtu)
4130 		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4131 	if (rtnetlink_put_metrics(skb, metrics) < 0)
4132 		goto nla_put_failure;
4133 
4134 	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4135 		goto nla_put_failure;
4136 
4137 	/* For multipath routes, walk the siblings list and add
4138 	 * each as a nexthop within RTA_MULTIPATH.
4139 	 */
4140 	if (rt->rt6i_nsiblings) {
4141 		struct rt6_info *sibling, *next_sibling;
4142 		struct nlattr *mp;
4143 
4144 		mp = nla_nest_start(skb, RTA_MULTIPATH);
4145 		if (!mp)
4146 			goto nla_put_failure;
4147 
4148 		if (rt6_add_nexthop(skb, rt) < 0)
4149 			goto nla_put_failure;
4150 
4151 		list_for_each_entry_safe(sibling, next_sibling,
4152 					 &rt->rt6i_siblings, rt6i_siblings) {
4153 			if (rt6_add_nexthop(skb, sibling) < 0)
4154 				goto nla_put_failure;
4155 		}
4156 
4157 		nla_nest_end(skb, mp);
4158 	} else {
4159 		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4160 			goto nla_put_failure;
4161 	}
4162 
4163 	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4164 
4165 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4166 		goto nla_put_failure;
4167 
4168 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4169 		goto nla_put_failure;
4170 
4171 
4172 	nlmsg_end(skb, nlh);
4173 	return 0;
4174 
4175 nla_put_failure:
4176 	nlmsg_cancel(skb, nlh);
4177 	return -EMSGSIZE;
4178 }
4179 
4180 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4181 {
4182 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4183 	struct net *net = arg->net;
4184 
4185 	if (rt == net->ipv6.ip6_null_entry)
4186 		return 0;
4187 
4188 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4189 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4190 
4191 		/* user wants prefix routes only */
4192 		if (rtm->rtm_flags & RTM_F_PREFIX &&
4193 		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4194 			/* success since this is not a prefix route */
4195 			return 1;
4196 		}
4197 	}
4198 
4199 	return rt6_fill_node(net,
4200 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4201 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4202 		     NLM_F_MULTI);
4203 }
4204 
4205 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4206 			      struct netlink_ext_ack *extack)
4207 {
4208 	struct net *net = sock_net(in_skb->sk);
4209 	struct nlattr *tb[RTA_MAX+1];
4210 	int err, iif = 0, oif = 0;
4211 	struct dst_entry *dst;
4212 	struct rt6_info *rt;
4213 	struct sk_buff *skb;
4214 	struct rtmsg *rtm;
4215 	struct flowi6 fl6;
4216 	bool fibmatch;
4217 
4218 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4219 			  extack);
4220 	if (err < 0)
4221 		goto errout;
4222 
4223 	err = -EINVAL;
4224 	memset(&fl6, 0, sizeof(fl6));
4225 	rtm = nlmsg_data(nlh);
4226 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4227 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4228 
4229 	if (tb[RTA_SRC]) {
4230 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4231 			goto errout;
4232 
4233 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4234 	}
4235 
4236 	if (tb[RTA_DST]) {
4237 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4238 			goto errout;
4239 
4240 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4241 	}
4242 
4243 	if (tb[RTA_IIF])
4244 		iif = nla_get_u32(tb[RTA_IIF]);
4245 
4246 	if (tb[RTA_OIF])
4247 		oif = nla_get_u32(tb[RTA_OIF]);
4248 
4249 	if (tb[RTA_MARK])
4250 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4251 
4252 	if (tb[RTA_UID])
4253 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4254 					   nla_get_u32(tb[RTA_UID]));
4255 	else
4256 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4257 
4258 	if (iif) {
4259 		struct net_device *dev;
4260 		int flags = 0;
4261 
4262 		rcu_read_lock();
4263 
4264 		dev = dev_get_by_index_rcu(net, iif);
4265 		if (!dev) {
4266 			rcu_read_unlock();
4267 			err = -ENODEV;
4268 			goto errout;
4269 		}
4270 
4271 		fl6.flowi6_iif = iif;
4272 
4273 		if (!ipv6_addr_any(&fl6.saddr))
4274 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4275 
4276 		if (!fibmatch)
4277 			dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4278 		else
4279 			dst = ip6_route_lookup(net, &fl6, 0);
4280 
4281 		rcu_read_unlock();
4282 	} else {
4283 		fl6.flowi6_oif = oif;
4284 
4285 		if (!fibmatch)
4286 			dst = ip6_route_output(net, NULL, &fl6);
4287 		else
4288 			dst = ip6_route_lookup(net, &fl6, 0);
4289 	}
4290 
4291 
4292 	rt = container_of(dst, struct rt6_info, dst);
4293 	if (rt->dst.error) {
4294 		err = rt->dst.error;
4295 		ip6_rt_put(rt);
4296 		goto errout;
4297 	}
4298 
4299 	if (rt == net->ipv6.ip6_null_entry) {
4300 		err = rt->dst.error;
4301 		ip6_rt_put(rt);
4302 		goto errout;
4303 	}
4304 
4305 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4306 	if (!skb) {
4307 		ip6_rt_put(rt);
4308 		err = -ENOBUFS;
4309 		goto errout;
4310 	}
4311 
4312 	skb_dst_set(skb, &rt->dst);
4313 	if (fibmatch)
4314 		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4315 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4316 				    nlh->nlmsg_seq, 0);
4317 	else
4318 		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4319 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4320 				    nlh->nlmsg_seq, 0);
4321 	if (err < 0) {
4322 		kfree_skb(skb);
4323 		goto errout;
4324 	}
4325 
4326 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4327 errout:
4328 	return err;
4329 }
4330 
4331 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4332 		     unsigned int nlm_flags)
4333 {
4334 	struct sk_buff *skb;
4335 	struct net *net = info->nl_net;
4336 	u32 seq;
4337 	int err;
4338 
4339 	err = -ENOBUFS;
4340 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4341 
4342 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4343 	if (!skb)
4344 		goto errout;
4345 
4346 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4347 				event, info->portid, seq, nlm_flags);
4348 	if (err < 0) {
4349 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4350 		WARN_ON(err == -EMSGSIZE);
4351 		kfree_skb(skb);
4352 		goto errout;
4353 	}
4354 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4355 		    info->nlh, gfp_any());
4356 	return;
4357 errout:
4358 	if (err < 0)
4359 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4360 }
4361 
4362 static int ip6_route_dev_notify(struct notifier_block *this,
4363 				unsigned long event, void *ptr)
4364 {
4365 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4366 	struct net *net = dev_net(dev);
4367 
4368 	if (!(dev->flags & IFF_LOOPBACK))
4369 		return NOTIFY_OK;
4370 
4371 	if (event == NETDEV_REGISTER) {
4372 		net->ipv6.ip6_null_entry->dst.dev = dev;
4373 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4374 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4375 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4376 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4377 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4378 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4379 #endif
4380 	 } else if (event == NETDEV_UNREGISTER &&
4381 		    dev->reg_state != NETREG_UNREGISTERED) {
4382 		/* NETDEV_UNREGISTER could be fired for multiple times by
4383 		 * netdev_wait_allrefs(). Make sure we only call this once.
4384 		 */
4385 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4386 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4387 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4388 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4389 #endif
4390 	}
4391 
4392 	return NOTIFY_OK;
4393 }
4394 
4395 /*
4396  *	/proc
4397  */
4398 
4399 #ifdef CONFIG_PROC_FS
4400 
4401 static const struct file_operations ipv6_route_proc_fops = {
4402 	.owner		= THIS_MODULE,
4403 	.open		= ipv6_route_open,
4404 	.read		= seq_read,
4405 	.llseek		= seq_lseek,
4406 	.release	= seq_release_net,
4407 };
4408 
4409 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4410 {
4411 	struct net *net = (struct net *)seq->private;
4412 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4413 		   net->ipv6.rt6_stats->fib_nodes,
4414 		   net->ipv6.rt6_stats->fib_route_nodes,
4415 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4416 		   net->ipv6.rt6_stats->fib_rt_entries,
4417 		   net->ipv6.rt6_stats->fib_rt_cache,
4418 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4419 		   net->ipv6.rt6_stats->fib_discarded_routes);
4420 
4421 	return 0;
4422 }
4423 
4424 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4425 {
4426 	return single_open_net(inode, file, rt6_stats_seq_show);
4427 }
4428 
4429 static const struct file_operations rt6_stats_seq_fops = {
4430 	.owner	 = THIS_MODULE,
4431 	.open	 = rt6_stats_seq_open,
4432 	.read	 = seq_read,
4433 	.llseek	 = seq_lseek,
4434 	.release = single_release_net,
4435 };
4436 #endif	/* CONFIG_PROC_FS */
4437 
4438 #ifdef CONFIG_SYSCTL
4439 
4440 static
4441 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4442 			      void __user *buffer, size_t *lenp, loff_t *ppos)
4443 {
4444 	struct net *net;
4445 	int delay;
4446 	if (!write)
4447 		return -EINVAL;
4448 
4449 	net = (struct net *)ctl->extra1;
4450 	delay = net->ipv6.sysctl.flush_delay;
4451 	proc_dointvec(ctl, write, buffer, lenp, ppos);
4452 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4453 	return 0;
4454 }
4455 
4456 struct ctl_table ipv6_route_table_template[] = {
4457 	{
4458 		.procname	=	"flush",
4459 		.data		=	&init_net.ipv6.sysctl.flush_delay,
4460 		.maxlen		=	sizeof(int),
4461 		.mode		=	0200,
4462 		.proc_handler	=	ipv6_sysctl_rtcache_flush
4463 	},
4464 	{
4465 		.procname	=	"gc_thresh",
4466 		.data		=	&ip6_dst_ops_template.gc_thresh,
4467 		.maxlen		=	sizeof(int),
4468 		.mode		=	0644,
4469 		.proc_handler	=	proc_dointvec,
4470 	},
4471 	{
4472 		.procname	=	"max_size",
4473 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
4474 		.maxlen		=	sizeof(int),
4475 		.mode		=	0644,
4476 		.proc_handler	=	proc_dointvec,
4477 	},
4478 	{
4479 		.procname	=	"gc_min_interval",
4480 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4481 		.maxlen		=	sizeof(int),
4482 		.mode		=	0644,
4483 		.proc_handler	=	proc_dointvec_jiffies,
4484 	},
4485 	{
4486 		.procname	=	"gc_timeout",
4487 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4488 		.maxlen		=	sizeof(int),
4489 		.mode		=	0644,
4490 		.proc_handler	=	proc_dointvec_jiffies,
4491 	},
4492 	{
4493 		.procname	=	"gc_interval",
4494 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
4495 		.maxlen		=	sizeof(int),
4496 		.mode		=	0644,
4497 		.proc_handler	=	proc_dointvec_jiffies,
4498 	},
4499 	{
4500 		.procname	=	"gc_elasticity",
4501 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4502 		.maxlen		=	sizeof(int),
4503 		.mode		=	0644,
4504 		.proc_handler	=	proc_dointvec,
4505 	},
4506 	{
4507 		.procname	=	"mtu_expires",
4508 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4509 		.maxlen		=	sizeof(int),
4510 		.mode		=	0644,
4511 		.proc_handler	=	proc_dointvec_jiffies,
4512 	},
4513 	{
4514 		.procname	=	"min_adv_mss",
4515 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
4516 		.maxlen		=	sizeof(int),
4517 		.mode		=	0644,
4518 		.proc_handler	=	proc_dointvec,
4519 	},
4520 	{
4521 		.procname	=	"gc_min_interval_ms",
4522 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4523 		.maxlen		=	sizeof(int),
4524 		.mode		=	0644,
4525 		.proc_handler	=	proc_dointvec_ms_jiffies,
4526 	},
4527 	{ }
4528 };
4529 
4530 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4531 {
4532 	struct ctl_table *table;
4533 
4534 	table = kmemdup(ipv6_route_table_template,
4535 			sizeof(ipv6_route_table_template),
4536 			GFP_KERNEL);
4537 
4538 	if (table) {
4539 		table[0].data = &net->ipv6.sysctl.flush_delay;
4540 		table[0].extra1 = net;
4541 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4542 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4543 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4544 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4545 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4546 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4547 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4548 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4549 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4550 
4551 		/* Don't export sysctls to unprivileged users */
4552 		if (net->user_ns != &init_user_ns)
4553 			table[0].procname = NULL;
4554 	}
4555 
4556 	return table;
4557 }
4558 #endif
4559 
4560 static int __net_init ip6_route_net_init(struct net *net)
4561 {
4562 	int ret = -ENOMEM;
4563 
4564 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4565 	       sizeof(net->ipv6.ip6_dst_ops));
4566 
4567 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4568 		goto out_ip6_dst_ops;
4569 
4570 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4571 					   sizeof(*net->ipv6.ip6_null_entry),
4572 					   GFP_KERNEL);
4573 	if (!net->ipv6.ip6_null_entry)
4574 		goto out_ip6_dst_entries;
4575 	net->ipv6.ip6_null_entry->dst.path =
4576 		(struct dst_entry *)net->ipv6.ip6_null_entry;
4577 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4578 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4579 			 ip6_template_metrics, true);
4580 
4581 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4582 	net->ipv6.fib6_has_custom_rules = false;
4583 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4584 					       sizeof(*net->ipv6.ip6_prohibit_entry),
4585 					       GFP_KERNEL);
4586 	if (!net->ipv6.ip6_prohibit_entry)
4587 		goto out_ip6_null_entry;
4588 	net->ipv6.ip6_prohibit_entry->dst.path =
4589 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
4590 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4591 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4592 			 ip6_template_metrics, true);
4593 
4594 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4595 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
4596 					       GFP_KERNEL);
4597 	if (!net->ipv6.ip6_blk_hole_entry)
4598 		goto out_ip6_prohibit_entry;
4599 	net->ipv6.ip6_blk_hole_entry->dst.path =
4600 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
4601 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4602 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4603 			 ip6_template_metrics, true);
4604 #endif
4605 
4606 	net->ipv6.sysctl.flush_delay = 0;
4607 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
4608 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4609 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4610 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4611 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4612 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4613 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4614 
4615 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
4616 
4617 	ret = 0;
4618 out:
4619 	return ret;
4620 
4621 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4622 out_ip6_prohibit_entry:
4623 	kfree(net->ipv6.ip6_prohibit_entry);
4624 out_ip6_null_entry:
4625 	kfree(net->ipv6.ip6_null_entry);
4626 #endif
4627 out_ip6_dst_entries:
4628 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4629 out_ip6_dst_ops:
4630 	goto out;
4631 }
4632 
4633 static void __net_exit ip6_route_net_exit(struct net *net)
4634 {
4635 	kfree(net->ipv6.ip6_null_entry);
4636 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4637 	kfree(net->ipv6.ip6_prohibit_entry);
4638 	kfree(net->ipv6.ip6_blk_hole_entry);
4639 #endif
4640 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4641 }
4642 
4643 static int __net_init ip6_route_net_init_late(struct net *net)
4644 {
4645 #ifdef CONFIG_PROC_FS
4646 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4647 	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4648 #endif
4649 	return 0;
4650 }
4651 
4652 static void __net_exit ip6_route_net_exit_late(struct net *net)
4653 {
4654 #ifdef CONFIG_PROC_FS
4655 	remove_proc_entry("ipv6_route", net->proc_net);
4656 	remove_proc_entry("rt6_stats", net->proc_net);
4657 #endif
4658 }
4659 
4660 static struct pernet_operations ip6_route_net_ops = {
4661 	.init = ip6_route_net_init,
4662 	.exit = ip6_route_net_exit,
4663 };
4664 
4665 static int __net_init ipv6_inetpeer_init(struct net *net)
4666 {
4667 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4668 
4669 	if (!bp)
4670 		return -ENOMEM;
4671 	inet_peer_base_init(bp);
4672 	net->ipv6.peers = bp;
4673 	return 0;
4674 }
4675 
4676 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4677 {
4678 	struct inet_peer_base *bp = net->ipv6.peers;
4679 
4680 	net->ipv6.peers = NULL;
4681 	inetpeer_invalidate_tree(bp);
4682 	kfree(bp);
4683 }
4684 
4685 static struct pernet_operations ipv6_inetpeer_ops = {
4686 	.init	=	ipv6_inetpeer_init,
4687 	.exit	=	ipv6_inetpeer_exit,
4688 };
4689 
4690 static struct pernet_operations ip6_route_net_late_ops = {
4691 	.init = ip6_route_net_init_late,
4692 	.exit = ip6_route_net_exit_late,
4693 };
4694 
4695 static struct notifier_block ip6_route_dev_notifier = {
4696 	.notifier_call = ip6_route_dev_notify,
4697 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4698 };
4699 
4700 void __init ip6_route_init_special_entries(void)
4701 {
4702 	/* Registering of the loopback is done before this portion of code,
4703 	 * the loopback reference in rt6_info will not be taken, do it
4704 	 * manually for init_net */
4705 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4706 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4707   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4708 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4709 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4710 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4711 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4712   #endif
4713 }
4714 
4715 int __init ip6_route_init(void)
4716 {
4717 	int ret;
4718 	int cpu;
4719 
4720 	ret = -ENOMEM;
4721 	ip6_dst_ops_template.kmem_cachep =
4722 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4723 				  SLAB_HWCACHE_ALIGN, NULL);
4724 	if (!ip6_dst_ops_template.kmem_cachep)
4725 		goto out;
4726 
4727 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
4728 	if (ret)
4729 		goto out_kmem_cache;
4730 
4731 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4732 	if (ret)
4733 		goto out_dst_entries;
4734 
4735 	ret = register_pernet_subsys(&ip6_route_net_ops);
4736 	if (ret)
4737 		goto out_register_inetpeer;
4738 
4739 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4740 
4741 	ret = fib6_init();
4742 	if (ret)
4743 		goto out_register_subsys;
4744 
4745 	ret = xfrm6_init();
4746 	if (ret)
4747 		goto out_fib6_init;
4748 
4749 	ret = fib6_rules_init();
4750 	if (ret)
4751 		goto xfrm6_init;
4752 
4753 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
4754 	if (ret)
4755 		goto fib6_rules_init;
4756 
4757 	ret = -ENOBUFS;
4758 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
4759 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
4760 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
4761 			    RTNL_FLAG_DOIT_UNLOCKED))
4762 		goto out_register_late_subsys;
4763 
4764 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4765 	if (ret)
4766 		goto out_register_late_subsys;
4767 
4768 	for_each_possible_cpu(cpu) {
4769 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4770 
4771 		INIT_LIST_HEAD(&ul->head);
4772 		spin_lock_init(&ul->lock);
4773 	}
4774 
4775 out:
4776 	return ret;
4777 
4778 out_register_late_subsys:
4779 	unregister_pernet_subsys(&ip6_route_net_late_ops);
4780 fib6_rules_init:
4781 	fib6_rules_cleanup();
4782 xfrm6_init:
4783 	xfrm6_fini();
4784 out_fib6_init:
4785 	fib6_gc_cleanup();
4786 out_register_subsys:
4787 	unregister_pernet_subsys(&ip6_route_net_ops);
4788 out_register_inetpeer:
4789 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
4790 out_dst_entries:
4791 	dst_entries_destroy(&ip6_dst_blackhole_ops);
4792 out_kmem_cache:
4793 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4794 	goto out;
4795 }
4796 
4797 void ip6_route_cleanup(void)
4798 {
4799 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
4800 	unregister_pernet_subsys(&ip6_route_net_late_ops);
4801 	fib6_rules_cleanup();
4802 	xfrm6_fini();
4803 	fib6_gc_cleanup();
4804 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
4805 	unregister_pernet_subsys(&ip6_route_net_ops);
4806 	dst_entries_destroy(&ip6_dst_blackhole_ops);
4807 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4808 }
4809