/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

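/* Results of neighbour reachability checks, produced by rt6_check_neigh()
 * and consumed by rt6_score_route()/find_match() below: RT6_NUD_FAIL_HARD
 * rejects the route outright, RT6_NUD_FAIL_PROBE ranks it below every
 * valid candidate while reachability is still being probed, and
 * RT6_NUD_FAIL_DO_RR asks rt6_select() to round-robin to the next router
 * of the same metric.
 */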
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * a case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
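
/* Hash-threshold selection, illustrated with made-up numbers: three
 * equal-weight siblings get nh_upper_bound values of roughly 0x2aaaaaaa,
 * 0x55555555 and 0x7fffffff (the 31-bit hash space split in thirds).
 * An mp_hash of 0x40000000 is above the first bound, so the first
 * nexthop is skipped, and at or below the second, so the second sibling
 * is chosen; each nexthop thus receives about a third of the flows.
 */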

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
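
/* The round-robin promised in the changelog at the top of this file works
 * by advancing fn->rr_ptr: e.g. with default routers A and B at the same
 * metric and rr_ptr at A, an RT6_NUD_FAIL_DO_RR score for A moves rr_ptr
 * to B, so subsequent lookups try B first until it fails in turn.
 */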

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe: it reads only prefix_len bits */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}

static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}

static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;

	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);

	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
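
/* Typical use (an illustrative sketch, not taken from this file): resolve
 * a destination and drop the reference when done. rt6_lookup() returns
 * NULL on failure; on success the caller owns a reference on the dst:
 *
 *	struct rt6_info *rt = rt6_lookup(net, &daddr, NULL, 0, NULL, 0);
 *	if (rt) {
 *		... use rt->dst.dev, rt->rt6i_gateway, etc. ...
 *		ip6_rt_put(rt);
 *	}
 */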

/* ip6_ins_rt is called without holding table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}

/* exception hash table implementation */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
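
/* The hash selects one of FIB6_EXCEPTION_BUCKET_SIZE chains (hash_32()
 * folds the jhash value down to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits);
 * the per-(daddr, saddr) chain is then searched linearly by
 * __rt6_find_exception_rcu() or __rt6_find_exception_spinlock() below.
 */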

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
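
/* Example (illustrative numbers): a route with no stored PMTU on a
 * 1500-byte link whose nexthop carries an lwtunnel encap needing 8 bytes
 * of headroom reports an MTU of 1492; without an encap,
 * lwtunnel_headroom() contributes 0 and the device MTU (capped at
 * IP6_MAX_MTU) is returned as-is.
 */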

static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
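
/* A bucket chain is capped at FIB6_MAX_DEPTH entries: once an insertion
 * pushes the depth past the limit, rt6_exception_remove_oldest() evicts
 * the entry with the oldest stamp, so e.g. a burst of redirects for many
 * destinations hashing to one bucket recycles the least recently
 * refreshed exception rather than growing without bound.
 */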

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
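
/* For instance, lowering a device MTU from 1500 to 1280 may update a
 * cached entry whose PMTU is 1400 (1400 >= 1280, a genuine decrease),
 * while raising the MTU back to 1500 updates the entry only if its PMTU
 * equals the local MTU, i.e. only when this node was the path's
 * bottleneck; anything lower was learned from another hop and is left
 * for PMTU discovery to refresh.
 */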

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* We are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still hold references to them, so that on the next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently of their aging, as per RFC 8201 section 4.
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbour lookup is
		 * different from the fl6->daddr used to look up the route
		 * here.
		 */
1876 		struct rt6_info *uncached_rt;
1877 
1878 		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1879 
1880 		rcu_read_unlock();
1881 
1882 		if (uncached_rt) {
1883 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1884 			 * No need for another dst_hold()
1885 			 */
1886 			rt6_uncached_list_add(uncached_rt);
1887 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1888 		} else {
1889 			uncached_rt = net->ipv6.ip6_null_entry;
1890 			dst_hold(&uncached_rt->dst);
1891 		}
1892 
1893 		return uncached_rt;
1894 	} else {
1895 		/* Get a percpu copy */
1896 
1897 		struct rt6_info *pcpu_rt;
1898 
1899 		local_bh_disable();
1900 		pcpu_rt = rt6_get_pcpu_route(f6i);
1901 
1902 		if (!pcpu_rt)
1903 			pcpu_rt = rt6_make_pcpu_route(net, f6i);
1904 
1905 		local_bh_enable();
1906 		rcu_read_unlock();
1907 
1908 		return pcpu_rt;
1909 	}
1910 }
1911 EXPORT_SYMBOL_GPL(ip6_pol_route);
1912 
1913 static struct rt6_info *ip6_pol_route_input(struct net *net,
1914 					    struct fib6_table *table,
1915 					    struct flowi6 *fl6,
1916 					    const struct sk_buff *skb,
1917 					    int flags)
1918 {
1919 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1920 }
1921 
1922 struct dst_entry *ip6_route_input_lookup(struct net *net,
1923 					 struct net_device *dev,
1924 					 struct flowi6 *fl6,
1925 					 const struct sk_buff *skb,
1926 					 int flags)
1927 {
1928 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1929 		flags |= RT6_LOOKUP_F_IFACE;
1930 
1931 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1932 }
1933 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1934 
1935 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1936 				  struct flow_keys *keys,
1937 				  struct flow_keys *flkeys)
1938 {
1939 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1940 	const struct ipv6hdr *key_iph = outer_iph;
1941 	struct flow_keys *_flkeys = flkeys;
1942 	const struct ipv6hdr *inner_iph;
1943 	const struct icmp6hdr *icmph;
1944 	struct ipv6hdr _inner_iph;
1945 	struct icmp6hdr _icmph;
1946 
1947 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1948 		goto out;
1949 
1950 	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1951 				   sizeof(_icmph), &_icmph);
1952 	if (!icmph)
1953 		goto out;
1954 
1955 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1956 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1957 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1958 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1959 		goto out;
1960 
1961 	inner_iph = skb_header_pointer(skb,
1962 				       skb_transport_offset(skb) + sizeof(*icmph),
1963 				       sizeof(_inner_iph), &_inner_iph);
1964 	if (!inner_iph)
1965 		goto out;
1966 
1967 	key_iph = inner_iph;
1968 	_flkeys = NULL;
1969 out:
1970 	if (_flkeys) {
1971 		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1972 		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1973 		keys->tags.flow_label = _flkeys->tags.flow_label;
1974 		keys->basic.ip_proto = _flkeys->basic.ip_proto;
1975 	} else {
1976 		keys->addrs.v6addrs.src = key_iph->saddr;
1977 		keys->addrs.v6addrs.dst = key_iph->daddr;
1978 		keys->tags.flow_label = ip6_flowinfo(key_iph);
1979 		keys->basic.ip_proto = key_iph->nexthdr;
1980 	}
1981 }
1982 
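/* Compute the 31-bit multipath hash for a flow. Policy 0 (the default)
 * hashes on L3 data only: source/destination addresses, flow label and
 * protocol. Policy 1 hashes on L4 data: addresses, protocol and
 * transport ports, dissecting the skb when no flow keys were provided.
 * The result is shifted right by one bit so it can be compared against
 * the 31-bit per-nexthop upper bounds set by rt6_upper_bound_set().
 */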
1983 /* if skb is set it will be used and fl6 can be NULL */
1984 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1985 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1986 {
1987 	struct flow_keys hash_keys;
1988 	u32 mhash;
1989 
1990 	switch (ip6_multipath_hash_policy(net)) {
1991 	case 0:
1992 		memset(&hash_keys, 0, sizeof(hash_keys));
1993 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1994 		if (skb) {
1995 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1996 		} else {
1997 			hash_keys.addrs.v6addrs.src = fl6->saddr;
1998 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
1999 			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
2000 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2001 		}
2002 		break;
2003 	case 1:
2004 		if (skb) {
2005 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2006 			struct flow_keys keys;
2007 
2008 			/* short-circuit if we already have L4 hash present */
2009 			if (skb->l4_hash)
2010 				return skb_get_hash_raw(skb) >> 1;
2011 
2012 			memset(&hash_keys, 0, sizeof(hash_keys));
2013 
2014 			if (!flkeys) {
2015 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2016 				flkeys = &keys;
2017 			}
2018 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2019 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2020 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2021 			hash_keys.ports.src = flkeys->ports.src;
2022 			hash_keys.ports.dst = flkeys->ports.dst;
2023 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2024 		} else {
2025 			memset(&hash_keys, 0, sizeof(hash_keys));
2026 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2027 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2028 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2029 			hash_keys.ports.src = fl6->fl6_sport;
2030 			hash_keys.ports.dst = fl6->fl6_dport;
2031 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2032 		}
2033 		break;
2034 	}
2035 	mhash = flow_hash_from_keys(&hash_keys);
2036 
2037 	return mhash >> 1;
2038 }
2039 
2040 void ip6_route_input(struct sk_buff *skb)
2041 {
2042 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2043 	struct net *net = dev_net(skb->dev);
2044 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2045 	struct ip_tunnel_info *tun_info;
2046 	struct flowi6 fl6 = {
2047 		.flowi6_iif = skb->dev->ifindex,
2048 		.daddr = iph->daddr,
2049 		.saddr = iph->saddr,
2050 		.flowlabel = ip6_flowinfo(iph),
2051 		.flowi6_mark = skb->mark,
2052 		.flowi6_proto = iph->nexthdr,
2053 	};
2054 	struct flow_keys *flkeys = NULL, _flkeys;
2055 
2056 	tun_info = skb_tunnel_info(skb);
2057 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2058 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2059 
2060 	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2061 		flkeys = &_flkeys;
2062 
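	/* For ICMPv6, hash on the embedded (inner) packet headers via
	 * rt6_multipath_hash() so that error messages follow the same
	 * multipath route as the flow that triggered them.
	 */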
2063 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2064 		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2065 	skb_dst_drop(skb);
2066 	skb_dst_set(skb,
2067 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2068 }
2069 
2070 static struct rt6_info *ip6_pol_route_output(struct net *net,
2071 					     struct fib6_table *table,
2072 					     struct flowi6 *fl6,
2073 					     const struct sk_buff *skb,
2074 					     int flags)
2075 {
2076 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2077 }
2078 
2079 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2080 					 struct flowi6 *fl6, int flags)
2081 {
2082 	bool any_src;
2083 
2084 	if (rt6_need_strict(&fl6->daddr)) {
2085 		struct dst_entry *dst;
2086 
2087 		dst = l3mdev_link_scope_lookup(net, fl6);
2088 		if (dst)
2089 			return dst;
2090 	}
2091 
2092 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2093 
2094 	any_src = ipv6_addr_any(&fl6->saddr);
2095 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2096 	    (fl6->flowi6_oif && any_src))
2097 		flags |= RT6_LOOKUP_F_IFACE;
2098 
2099 	if (!any_src)
2100 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2101 	else if (sk)
2102 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2103 
2104 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2105 }
2106 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2107 
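/* Clone dst_orig into a "blackhole" dst on the loopback device: the
 * metrics, gateway and destination keys are copied, but input/output
 * are set to discard packets. The reference on the original dst is
 * dropped before returning.
 */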
2108 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2109 {
2110 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2111 	struct net_device *loopback_dev = net->loopback_dev;
2112 	struct dst_entry *new = NULL;
2113 
2114 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2115 		       DST_OBSOLETE_DEAD, 0);
2116 	if (rt) {
2117 		rt6_info_init(rt);
2118 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2119 
2120 		new = &rt->dst;
2121 		new->__use = 1;
2122 		new->input = dst_discard;
2123 		new->output = dst_discard_out;
2124 
2125 		dst_copy_metrics(new, &ort->dst);
2126 
2127 		rt->rt6i_idev = in6_dev_get(loopback_dev);
2128 		rt->rt6i_gateway = ort->rt6i_gateway;
2129 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2130 
2131 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2132 #ifdef CONFIG_IPV6_SUBTREES
2133 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2134 #endif
2135 	}
2136 
2137 	dst_release(dst_orig);
2138 	return new ? new : ERR_PTR(-ENOMEM);
2139 }
2140 
2141 /*
2142  *	Destination cache support functions
2143  */
2144 
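/* Validate a fib6_info against a dst cookie: the entry is still usable
 * only if its tree sernum matches the cookie and it has not expired.
 */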
2145 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2146 {
2147 	u32 rt_cookie = 0;
2148 
2149 	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2150 		return false;
2151 
2152 	if (fib6_check_expired(f6i))
2153 		return false;
2154 
2155 	return true;
2156 }
2157 
2158 static struct dst_entry *rt6_check(struct rt6_info *rt,
2159 				   struct fib6_info *from,
2160 				   u32 cookie)
2161 {
2162 	u32 rt_cookie = 0;
2163 
2164 	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2165 	    rt_cookie != cookie)
2166 		return NULL;
2167 
2168 	if (rt6_check_expired(rt))
2169 		return NULL;
2170 
2171 	return &rt->dst;
2172 }
2173 
2174 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2175 					    struct fib6_info *from,
2176 					    u32 cookie)
2177 {
2178 	if (!__rt6_check_expired(rt) &&
2179 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2180 	    fib6_check(from, cookie))
2181 		return &rt->dst;
2182 	else
2183 		return NULL;
2184 }
2185 
2186 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2187 {
2188 	struct dst_entry *dst_ret;
2189 	struct fib6_info *from;
2190 	struct rt6_info *rt;
2191 
2192 	rt = container_of(dst, struct rt6_info, dst);
2193 
2194 	rcu_read_lock();
2195 
2196 	/* All IPv6 dsts are created with ->obsolete set to
2197 	 * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
2198 	 * down into this function.
2199 	 */
2200 
2201 	from = rcu_dereference(rt->from);
2202 
2203 	if (from && (rt->rt6i_flags & RTF_PCPU ||
2204 	    unlikely(!list_empty(&rt->rt6i_uncached))))
2205 		dst_ret = rt6_dst_from_check(rt, from, cookie);
2206 	else
2207 		dst_ret = rt6_check(rt, from, cookie);
2208 
2209 	rcu_read_unlock();
2210 
2211 	return dst_ret;
2212 }
2213 
2214 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2215 {
2216 	struct rt6_info *rt = (struct rt6_info *) dst;
2217 
2218 	if (rt) {
2219 		if (rt->rt6i_flags & RTF_CACHE) {
2220 			rcu_read_lock();
2221 			if (rt6_check_expired(rt)) {
2222 				rt6_remove_exception_rt(rt);
2223 				dst = NULL;
2224 			}
2225 			rcu_read_unlock();
2226 		} else {
2227 			dst_release(dst);
2228 			dst = NULL;
2229 		}
2230 	}
2231 	return dst;
2232 }
2233 
2234 static void ip6_link_failure(struct sk_buff *skb)
2235 {
2236 	struct rt6_info *rt;
2237 
2238 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2239 
2240 	rt = (struct rt6_info *) skb_dst(skb);
2241 	if (rt) {
2242 		rcu_read_lock();
2243 		if (rt->rt6i_flags & RTF_CACHE) {
2244 			if (dst_hold_safe(&rt->dst))
2245 				rt6_remove_exception_rt(rt);
2246 		} else {
2247 			struct fib6_info *from;
2248 			struct fib6_node *fn;
2249 
2250 			from = rcu_dereference(rt->from);
2251 			if (from) {
2252 				fn = rcu_dereference(from->fib6_node);
2253 				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2254 					fn->fn_sernum = -1;
2255 			}
2256 		}
2257 		rcu_read_unlock();
2258 	}
2259 }
2260 
2261 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2262 {
2263 	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2264 		struct fib6_info *from;
2265 
2266 		rcu_read_lock();
2267 		from = rcu_dereference(rt0->from);
2268 		if (from)
2269 			rt0->dst.expires = from->expires;
2270 		rcu_read_unlock();
2271 	}
2272 
2273 	dst_set_expires(&rt0->dst, timeout);
2274 	rt0->rt6i_flags |= RTF_EXPIRES;
2275 }
2276 
2277 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2278 {
2279 	struct net *net = dev_net(rt->dst.dev);
2280 
2281 	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2282 	rt->rt6i_flags |= RTF_MODIFIED;
2283 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2284 }
2285 
2286 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2287 {
2288 	bool from_set;
2289 
2290 	rcu_read_lock();
2291 	from_set = !!rcu_dereference(rt->from);
2292 	rcu_read_unlock();
2293 
2294 	return !(rt->rt6i_flags & RTF_CACHE) &&
2295 		(rt->rt6i_flags & RTF_PCPU || from_set);
2296 }
2297 
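/* Core PMTU update: confirm the neighbour entry, clamp the new MTU to
 * at least IPV6_MIN_MTU and ignore anything that would not shrink the
 * current path MTU. Routes that may not be cached (including RTF_CACHE
 * entries themselves) are updated in place; otherwise an RTF_CACHE
 * clone carrying the reduced MTU is inserted into the exception table.
 */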
2298 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2299 				 const struct ipv6hdr *iph, u32 mtu)
2300 {
2301 	const struct in6_addr *daddr, *saddr;
2302 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2303 
2304 	if (rt6->rt6i_flags & RTF_LOCAL)
2305 		return;
2306 
2307 	if (dst_metric_locked(dst, RTAX_MTU))
2308 		return;
2309 
2310 	if (iph) {
2311 		daddr = &iph->daddr;
2312 		saddr = &iph->saddr;
2313 	} else if (sk) {
2314 		daddr = &sk->sk_v6_daddr;
2315 		saddr = &inet6_sk(sk)->saddr;
2316 	} else {
2317 		daddr = NULL;
2318 		saddr = NULL;
2319 	}
2320 	dst_confirm_neigh(dst, daddr);
2321 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2322 	if (mtu >= dst_mtu(dst))
2323 		return;
2324 
2325 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2326 		rt6_do_update_pmtu(rt6, mtu);
2327 		/* update rt6_ex->stamp for cache */
2328 		if (rt6->rt6i_flags & RTF_CACHE)
2329 			rt6_update_exception_stamp_rt(rt6);
2330 	} else if (daddr) {
2331 		struct fib6_info *from;
2332 		struct rt6_info *nrt6;
2333 
2334 		rcu_read_lock();
2335 		from = rcu_dereference(rt6->from);
2336 		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2337 		if (nrt6) {
2338 			rt6_do_update_pmtu(nrt6, mtu);
2339 			if (rt6_insert_exception(nrt6, from))
2340 				dst_release_immediate(&nrt6->dst);
2341 		}
2342 		rcu_read_unlock();
2343 	}
2344 }
2345 
2346 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2347 			       struct sk_buff *skb, u32 mtu)
2348 {
2349 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2350 }
2351 
2352 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2353 		     int oif, u32 mark, kuid_t uid)
2354 {
2355 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2356 	struct dst_entry *dst;
2357 	struct flowi6 fl6;
2358 
2359 	memset(&fl6, 0, sizeof(fl6));
2360 	fl6.flowi6_oif = oif;
2361 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2362 	fl6.daddr = iph->daddr;
2363 	fl6.saddr = iph->saddr;
2364 	fl6.flowlabel = ip6_flowinfo(iph);
2365 	fl6.flowi6_uid = uid;
2366 
2367 	dst = ip6_route_output(net, NULL, &fl6);
2368 	if (!dst->error)
2369 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2370 	dst_release(dst);
2371 }
2372 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2373 
2374 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2375 {
2376 	struct dst_entry *dst;
2377 
2378 	ip6_update_pmtu(skb, sock_net(sk), mtu,
2379 			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2380 
2381 	dst = __sk_dst_get(sk);
2382 	if (!dst || !dst->obsolete ||
2383 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2384 		return;
2385 
2386 	bh_lock_sock(sk);
2387 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2388 		ip6_datagram_dst_update(sk, false);
2389 	bh_unlock_sock(sk);
2390 }
2391 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2392 
2393 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2394 			   const struct flowi6 *fl6)
2395 {
2396 #ifdef CONFIG_IPV6_SUBTREES
2397 	struct ipv6_pinfo *np = inet6_sk(sk);
2398 #endif
2399 
2400 	ip6_dst_store(sk, dst,
2401 		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2402 		      &sk->sk_v6_daddr : NULL,
2403 #ifdef CONFIG_IPV6_SUBTREES
2404 		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2405 		      &np->saddr :
2406 #endif
2407 		      NULL);
2408 }
2409 
2410 /* Handle redirects */
2411 struct ip6rd_flowi {
2412 	struct flowi6 fl6;
2413 	struct in6_addr gateway;
2414 };
2415 
2416 static struct rt6_info *__ip6_route_redirect(struct net *net,
2417 					     struct fib6_table *table,
2418 					     struct flowi6 *fl6,
2419 					     const struct sk_buff *skb,
2420 					     int flags)
2421 {
2422 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2423 	struct rt6_info *ret = NULL, *rt_cache;
2424 	struct fib6_info *rt;
2425 	struct fib6_node *fn;
2426 
2427 	/* Get the "current" route for this destination and
2428 	 * check if the redirect has come from an appropriate router.
2429 	 *
2430 	 * RFC 4861 specifies that redirects should only be
2431 	 * accepted if they come from the nexthop to the target.
2432 	 * Due to the way the routes are chosen, this notion
2433 	 * is a bit fuzzy and one might need to check all possible
2434 	 * routes.
2435 	 */
2436 
2437 	rcu_read_lock();
2438 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2439 restart:
2440 	for_each_fib6_node_rt_rcu(fn) {
2441 		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2442 			continue;
2443 		if (fib6_check_expired(rt))
2444 			continue;
2445 		if (rt->fib6_flags & RTF_REJECT)
2446 			break;
2447 		if (!(rt->fib6_flags & RTF_GATEWAY))
2448 			continue;
2449 		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2450 			continue;
2451 		/* rt_cache's gateway might be different from its 'parent'
2452 		 * in the case of an ip redirect.
2453 		 * So we keep searching in the exception table if the gateway
2454 		 * is different.
2455 		 */
2456 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2457 			rt_cache = rt6_find_cached_rt(rt,
2458 						      &fl6->daddr,
2459 						      &fl6->saddr);
2460 			if (rt_cache &&
2461 			    ipv6_addr_equal(&rdfl->gateway,
2462 					    &rt_cache->rt6i_gateway)) {
2463 				ret = rt_cache;
2464 				break;
2465 			}
2466 			continue;
2467 		}
2468 		break;
2469 	}
2470 
2471 	if (!rt)
2472 		rt = net->ipv6.fib6_null_entry;
2473 	else if (rt->fib6_flags & RTF_REJECT) {
2474 		ret = net->ipv6.ip6_null_entry;
2475 		goto out;
2476 	}
2477 
2478 	if (rt == net->ipv6.fib6_null_entry) {
2479 		fn = fib6_backtrack(fn, &fl6->saddr);
2480 		if (fn)
2481 			goto restart;
2482 	}
2483 
2484 out:
2485 	if (ret)
2486 		dst_hold(&ret->dst);
2487 	else
2488 		ret = ip6_create_rt_rcu(rt);
2489 
2490 	rcu_read_unlock();
2491 
2492 	trace_fib6_table_lookup(net, rt, table, fl6);
2493 	return ret;
2494 }
2495 
2496 static struct dst_entry *ip6_route_redirect(struct net *net,
2497 					    const struct flowi6 *fl6,
2498 					    const struct sk_buff *skb,
2499 					    const struct in6_addr *gateway)
2500 {
2501 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2502 	struct ip6rd_flowi rdfl;
2503 
2504 	rdfl.fl6 = *fl6;
2505 	rdfl.gateway = *gateway;
2506 
2507 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2508 				flags, __ip6_route_redirect);
2509 }
2510 
2511 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2512 		  kuid_t uid)
2513 {
2514 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2515 	struct dst_entry *dst;
2516 	struct flowi6 fl6;
2517 
2518 	memset(&fl6, 0, sizeof(fl6));
2519 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2520 	fl6.flowi6_oif = oif;
2521 	fl6.flowi6_mark = mark;
2522 	fl6.daddr = iph->daddr;
2523 	fl6.saddr = iph->saddr;
2524 	fl6.flowlabel = ip6_flowinfo(iph);
2525 	fl6.flowi6_uid = uid;
2526 
2527 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2528 	rt6_do_redirect(dst, NULL, skb);
2529 	dst_release(dst);
2530 }
2531 EXPORT_SYMBOL_GPL(ip6_redirect);
2532 
2533 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2534 			    u32 mark)
2535 {
2536 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2537 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2538 	struct dst_entry *dst;
2539 	struct flowi6 fl6;
2540 
2541 	memset(&fl6, 0, sizeof(fl6));
2542 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2543 	fl6.flowi6_oif = oif;
2544 	fl6.flowi6_mark = mark;
2545 	fl6.daddr = msg->dest;
2546 	fl6.saddr = iph->daddr;
2547 	fl6.flowi6_uid = sock_net_uid(net, NULL);
2548 
2549 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2550 	rt6_do_redirect(dst, NULL, skb);
2551 	dst_release(dst);
2552 }
2553 
2554 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2555 {
2556 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2557 		     sk->sk_uid);
2558 }
2559 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2560 
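/* Default advertised MSS: the path MTU minus the IPv6 and TCP header
 * sizes, clamped from below by the ip6_rt_min_advmss sysctl. For a
 * standard 1500-byte Ethernet MTU this yields 1500 - 40 - 20 = 1440.
 * Anything above IPV6_MAXPLEN - sizeof(struct tcphdr) is reported as
 * IPV6_MAXPLEN, meaning "any MSS, rely on PMTU discovery alone".
 */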
2561 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2562 {
2563 	struct net_device *dev = dst->dev;
2564 	unsigned int mtu = dst_mtu(dst);
2565 	struct net *net = dev_net(dev);
2566 
2567 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2568 
2569 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2570 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2571 
2572 	/*
2573 	 * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and the
2574 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2575 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2576 	 * rely only on pmtu discovery".
2577 	 */
2578 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2579 		mtu = IPV6_MAXPLEN;
2580 	return mtu;
2581 }
2582 
2583 static unsigned int ip6_mtu(const struct dst_entry *dst)
2584 {
2585 	struct inet6_dev *idev;
2586 	unsigned int mtu;
2587 
2588 	mtu = dst_metric_raw(dst, RTAX_MTU);
2589 	if (mtu)
2590 		goto out;
2591 
2592 	mtu = IPV6_MIN_MTU;
2593 
2594 	rcu_read_lock();
2595 	idev = __in6_dev_get(dst->dev);
2596 	if (idev)
2597 		mtu = idev->cnf.mtu6;
2598 	rcu_read_unlock();
2599 
2600 out:
2601 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2602 
2603 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2604 }
2605 
2606 /* MTU selection:
2607  * 1. mtu on route is locked - use it
2608  * 2. mtu from nexthop exception
2609  * 3. mtu from egress device
2610  *
2611  * based on ip6_dst_mtu_forward and exception logic of
2612  * rt6_find_cached_rt; called with rcu_read_lock
2613  */
2614 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2615 		      struct in6_addr *saddr)
2616 {
2617 	struct rt6_exception_bucket *bucket;
2618 	struct rt6_exception *rt6_ex;
2619 	struct in6_addr *src_key;
2620 	struct inet6_dev *idev;
2621 	u32 mtu = 0;
2622 
2623 	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2624 		mtu = f6i->fib6_pmtu;
2625 		if (mtu)
2626 			goto out;
2627 	}
2628 
2629 	src_key = NULL;
2630 #ifdef CONFIG_IPV6_SUBTREES
2631 	if (f6i->fib6_src.plen)
2632 		src_key = saddr;
2633 #endif
2634 
2635 	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2636 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2637 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2638 		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2639 
2640 	if (likely(!mtu)) {
2641 		struct net_device *dev = fib6_info_nh_dev(f6i);
2642 
2643 		mtu = IPV6_MIN_MTU;
2644 		idev = __in6_dev_get(dev);
2645 		if (idev && idev->cnf.mtu6 > mtu)
2646 			mtu = idev->cnf.mtu6;
2647 	}
2648 
2649 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2650 out:
2651 	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2652 }
2653 
2654 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2655 				  struct flowi6 *fl6)
2656 {
2657 	struct dst_entry *dst;
2658 	struct rt6_info *rt;
2659 	struct inet6_dev *idev = in6_dev_get(dev);
2660 	struct net *net = dev_net(dev);
2661 
2662 	if (unlikely(!idev))
2663 		return ERR_PTR(-ENODEV);
2664 
2665 	rt = ip6_dst_alloc(net, dev, 0);
2666 	if (unlikely(!rt)) {
2667 		in6_dev_put(idev);
2668 		dst = ERR_PTR(-ENOMEM);
2669 		goto out;
2670 	}
2671 
2672 	rt->dst.flags |= DST_HOST;
2673 	rt->dst.input = ip6_input;
2674 	rt->dst.output  = ip6_output;
2675 	rt->rt6i_gateway  = fl6->daddr;
2676 	rt->rt6i_dst.addr = fl6->daddr;
2677 	rt->rt6i_dst.plen = 128;
2678 	rt->rt6i_idev     = idev;
2679 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2680 
2681 	/* Add this dst to uncached_list so that rt6_disable_ip() can
2682 	 * properly release the net_device.
2683 	 */
2684 	rt6_uncached_list_add(rt);
2685 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2686 
2687 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2688 
2689 out:
2690 	return dst;
2691 }
2692 
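/* dst garbage collection for IPv6, called from the dst core when the
 * number of entries exceeds gc_thresh. GC is skipped while the entry
 * count is at most ip6_rt_max_size and the minimum interval since the
 * last run has not yet elapsed; otherwise fib6_run_gc() is invoked
 * with an expiry horizon (ip6_rt_gc_expire) that grows on every pass
 * and decays geometrically afterwards. Returns nonzero if the table is
 * still over ip6_rt_max_size, which the dst layer treats as an
 * allocation failure.
 */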
2693 static int ip6_dst_gc(struct dst_ops *ops)
2694 {
2695 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2696 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2697 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2698 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2699 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2700 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2701 	int entries;
2702 
2703 	entries = dst_entries_get_fast(ops);
2704 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2705 	    entries <= rt_max_size)
2706 		goto out;
2707 
2708 	net->ipv6.ip6_rt_gc_expire++;
2709 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2710 	entries = dst_entries_get_slow(ops);
2711 	if (entries < ops->gc_thresh)
2712 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2713 out:
2714 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2715 	return entries > rt_max_size;
2716 }
2717 
2718 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2719 			       struct fib6_config *cfg)
2720 {
2721 	struct dst_metrics *p;
2722 
2723 	if (!cfg->fc_mx)
2724 		return 0;
2725 
2726 	p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2727 	if (unlikely(!p))
2728 		return -ENOMEM;
2729 
2730 	refcount_set(&p->refcnt, 1);
2731 	rt->fib6_metrics = p;
2732 
2733 	return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2734 }
2735 
2736 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2737 					    struct fib6_config *cfg,
2738 					    const struct in6_addr *gw_addr,
2739 					    u32 tbid, int flags)
2740 {
2741 	struct flowi6 fl6 = {
2742 		.flowi6_oif = cfg->fc_ifindex,
2743 		.daddr = *gw_addr,
2744 		.saddr = cfg->fc_prefsrc,
2745 	};
2746 	struct fib6_table *table;
2747 	struct rt6_info *rt;
2748 
2749 	table = fib6_get_table(net, tbid);
2750 	if (!table)
2751 		return NULL;
2752 
2753 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2754 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2755 
2756 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2757 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2758 
2759 	/* if table lookup failed, fall back to full lookup */
2760 	if (rt == net->ipv6.ip6_null_entry) {
2761 		ip6_rt_put(rt);
2762 		rt = NULL;
2763 	}
2764 
2765 	return rt;
2766 }
2767 
2768 static int ip6_route_check_nh_onlink(struct net *net,
2769 				     struct fib6_config *cfg,
2770 				     const struct net_device *dev,
2771 				     struct netlink_ext_ack *extack)
2772 {
2773 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2774 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2775 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2776 	struct rt6_info *grt;
2777 	int err;
2778 
2779 	err = 0;
2780 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2781 	if (grt) {
2782 		if (!grt->dst.error &&
2783 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2784 			NL_SET_ERR_MSG(extack,
2785 				       "Nexthop has invalid gateway or device mismatch");
2786 			err = -EINVAL;
2787 		}
2788 
2789 		ip6_rt_put(grt);
2790 	}
2791 
2792 	return err;
2793 }
2794 
2795 static int ip6_route_check_nh(struct net *net,
2796 			      struct fib6_config *cfg,
2797 			      struct net_device **_dev,
2798 			      struct inet6_dev **idev)
2799 {
2800 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2801 	struct net_device *dev = _dev ? *_dev : NULL;
2802 	struct rt6_info *grt = NULL;
2803 	int err = -EHOSTUNREACH;
2804 
2805 	if (cfg->fc_table) {
2806 		int flags = RT6_LOOKUP_F_IFACE;
2807 
2808 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2809 					  cfg->fc_table, flags);
2810 		if (grt) {
2811 			if (grt->rt6i_flags & RTF_GATEWAY ||
2812 			    (dev && dev != grt->dst.dev)) {
2813 				ip6_rt_put(grt);
2814 				grt = NULL;
2815 			}
2816 		}
2817 	}
2818 
2819 	if (!grt)
2820 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2821 
2822 	if (!grt)
2823 		goto out;
2824 
2825 	if (dev) {
2826 		if (dev != grt->dst.dev) {
2827 			ip6_rt_put(grt);
2828 			goto out;
2829 		}
2830 	} else {
2831 		*_dev = dev = grt->dst.dev;
2832 		*idev = grt->rt6i_idev;
2833 		dev_hold(dev);
2834 		in6_dev_hold(grt->rt6i_idev);
2835 	}
2836 
2837 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2838 		err = 0;
2839 
2840 	ip6_rt_put(grt);
2841 
2842 out:
2843 	return err;
2844 }
2845 
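/* Validate the gateway of a new route: the gateway must not be a local
 * address, and a non-link-local gateway must be a unicast (or
 * IPv4-mapped, for RFC 4798) address that passes the nexthop
 * reachability check. The egress device is resolved from the gateway
 * when none was given, and may not be the loopback device.
 */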
2846 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2847 			   struct net_device **_dev, struct inet6_dev **idev,
2848 			   struct netlink_ext_ack *extack)
2849 {
2850 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2851 	int gwa_type = ipv6_addr_type(gw_addr);
2852 	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2853 	const struct net_device *dev = *_dev;
2854 	bool need_addr_check = !dev;
2855 	int err = -EINVAL;
2856 
2857 	/* If gw_addr is local we may fail to detect this here when the
2858 	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2859 	 * will return the already-added prefix route via the interface
2860 	 * the prefix route was assigned to, which might be non-loopback.
2861 	 */
2862 	if (dev &&
2863 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2864 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2865 		goto out;
2866 	}
2867 
2868 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2869 		/* IPv6 strictly forbids using non-link-local
2870 		 * addresses as a nexthop address.
2871 		 * Otherwise, the router will not be able to send redirects.
2872 		 * It is very good, but in some (rare!) circumstances
2873 		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2874 		 * some exceptions. --ANK
2875 		 * We allow IPv4-mapped nexthops to support RFC4798-type
2876 		 * addressing
2877 		 */
2878 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2879 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2880 			goto out;
2881 		}
2882 
2883 		if (cfg->fc_flags & RTNH_F_ONLINK)
2884 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2885 		else
2886 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2887 
2888 		if (err)
2889 			goto out;
2890 	}
2891 
2892 	/* reload in case device was changed */
2893 	dev = *_dev;
2894 
2895 	err = -EINVAL;
2896 	if (!dev) {
2897 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2898 		goto out;
2899 	} else if (dev->flags & IFF_LOOPBACK) {
2900 		NL_SET_ERR_MSG(extack,
2901 			       "Egress device can not be loopback device for this route");
2902 		goto out;
2903 	}
2904 
2905 	/* if we did not check gw_addr above, do so now that the
2906 	 * egress device has been resolved.
2907 	 */
2908 	if (need_addr_check &&
2909 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2910 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2911 		goto out;
2912 	}
2913 
2914 	err = 0;
2915 out:
2916 	return err;
2917 }
2918 
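/* Validate a fib6_config and build the corresponding fib6_info. The
 * returned entry is not yet linked into a FIB tree; the caller is
 * expected to insert it (see ip6_route_add()) and to release the
 * reference with fib6_info_release(). Returns an ERR_PTR on failure.
 */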
2919 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2920 					      gfp_t gfp_flags,
2921 					      struct netlink_ext_ack *extack)
2922 {
2923 	struct net *net = cfg->fc_nlinfo.nl_net;
2924 	struct fib6_info *rt = NULL;
2925 	struct net_device *dev = NULL;
2926 	struct inet6_dev *idev = NULL;
2927 	struct fib6_table *table;
2928 	int addr_type;
2929 	int err = -EINVAL;
2930 
2931 	/* RTF_PCPU is an internal flag; it cannot be set by userspace */
2932 	if (cfg->fc_flags & RTF_PCPU) {
2933 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2934 		goto out;
2935 	}
2936 
2937 	/* RTF_CACHE is an internal flag; it cannot be set by userspace */
2938 	if (cfg->fc_flags & RTF_CACHE) {
2939 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2940 		goto out;
2941 	}
2942 
2943 	if (cfg->fc_type > RTN_MAX) {
2944 		NL_SET_ERR_MSG(extack, "Invalid route type");
2945 		goto out;
2946 	}
2947 
2948 	if (cfg->fc_dst_len > 128) {
2949 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
2950 		goto out;
2951 	}
2952 	if (cfg->fc_src_len > 128) {
2953 		NL_SET_ERR_MSG(extack, "Invalid source address length");
2954 		goto out;
2955 	}
2956 #ifndef CONFIG_IPV6_SUBTREES
2957 	if (cfg->fc_src_len) {
2958 		NL_SET_ERR_MSG(extack,
2959 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
2960 		goto out;
2961 	}
2962 #endif
2963 	if (cfg->fc_ifindex) {
2964 		err = -ENODEV;
2965 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2966 		if (!dev)
2967 			goto out;
2968 		idev = in6_dev_get(dev);
2969 		if (!idev)
2970 			goto out;
2971 	}
2972 
2973 	if (cfg->fc_metric == 0)
2974 		cfg->fc_metric = IP6_RT_PRIO_USER;
2975 
2976 	if (cfg->fc_flags & RTNH_F_ONLINK) {
2977 		if (!dev) {
2978 			NL_SET_ERR_MSG(extack,
2979 				       "Nexthop device required for onlink");
2980 			err = -ENODEV;
2981 			goto out;
2982 		}
2983 
2984 		if (!(dev->flags & IFF_UP)) {
2985 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2986 			err = -ENETDOWN;
2987 			goto out;
2988 		}
2989 	}
2990 
2991 	err = -ENOBUFS;
2992 	if (cfg->fc_nlinfo.nlh &&
2993 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2994 		table = fib6_get_table(net, cfg->fc_table);
2995 		if (!table) {
2996 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2997 			table = fib6_new_table(net, cfg->fc_table);
2998 		}
2999 	} else {
3000 		table = fib6_new_table(net, cfg->fc_table);
3001 	}
3002 
3003 	if (!table)
3004 		goto out;
3005 
3006 	err = -ENOMEM;
3007 	rt = fib6_info_alloc(gfp_flags);
3008 	if (!rt)
3009 		goto out;
3010 
3011 	if (cfg->fc_flags & RTF_ADDRCONF)
3012 		rt->dst_nocount = true;
3013 
3014 	err = ip6_convert_metrics(net, rt, cfg);
3015 	if (err < 0)
3016 		goto out;
3017 
3018 	if (cfg->fc_flags & RTF_EXPIRES)
3019 		fib6_set_expires(rt, jiffies +
3020 				clock_t_to_jiffies(cfg->fc_expires));
3021 	else
3022 		fib6_clean_expires(rt);
3023 
3024 	if (cfg->fc_protocol == RTPROT_UNSPEC)
3025 		cfg->fc_protocol = RTPROT_BOOT;
3026 	rt->fib6_protocol = cfg->fc_protocol;
3027 
3028 	addr_type = ipv6_addr_type(&cfg->fc_dst);
3029 
3030 	if (cfg->fc_encap) {
3031 		struct lwtunnel_state *lwtstate;
3032 
3033 		err = lwtunnel_build_state(cfg->fc_encap_type,
3034 					   cfg->fc_encap, AF_INET6, cfg,
3035 					   &lwtstate, extack);
3036 		if (err)
3037 			goto out;
3038 		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3039 	}
3040 
3041 	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3042 	rt->fib6_dst.plen = cfg->fc_dst_len;
3043 	if (rt->fib6_dst.plen == 128)
3044 		rt->dst_host = true;
3045 
3046 #ifdef CONFIG_IPV6_SUBTREES
3047 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3048 	rt->fib6_src.plen = cfg->fc_src_len;
3049 #endif
3050 
3051 	rt->fib6_metric = cfg->fc_metric;
3052 	rt->fib6_nh.nh_weight = 1;
3053 
3054 	rt->fib6_type = cfg->fc_type;
3055 
3056 	/* We cannot add true routes via loopback here;
3057 	 * they would result in kernel looping. Promote them to reject routes.
3058 	 */
3059 	if ((cfg->fc_flags & RTF_REJECT) ||
3060 	    (dev && (dev->flags & IFF_LOOPBACK) &&
3061 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
3062 	     !(cfg->fc_flags & RTF_LOCAL))) {
3063 		/* hold loopback dev/idev if we haven't done so. */
3064 		if (dev != net->loopback_dev) {
3065 			if (dev) {
3066 				dev_put(dev);
3067 				in6_dev_put(idev);
3068 			}
3069 			dev = net->loopback_dev;
3070 			dev_hold(dev);
3071 			idev = in6_dev_get(dev);
3072 			if (!idev) {
3073 				err = -ENODEV;
3074 				goto out;
3075 			}
3076 		}
3077 		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3078 		goto install_route;
3079 	}
3080 
3081 	if (cfg->fc_flags & RTF_GATEWAY) {
3082 		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3083 		if (err)
3084 			goto out;
3085 
3086 		rt->fib6_nh.nh_gw = cfg->fc_gateway;
3087 	}
3088 
3089 	err = -ENODEV;
3090 	if (!dev)
3091 		goto out;
3092 
3093 	if (idev->cnf.disable_ipv6) {
3094 		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3095 		err = -EACCES;
3096 		goto out;
3097 	}
3098 
3099 	if (!(dev->flags & IFF_UP)) {
3100 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3101 		err = -ENETDOWN;
3102 		goto out;
3103 	}
3104 
3105 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3106 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3107 			NL_SET_ERR_MSG(extack, "Invalid source address");
3108 			err = -EINVAL;
3109 			goto out;
3110 		}
3111 		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3112 		rt->fib6_prefsrc.plen = 128;
3113 	} else
3114 		rt->fib6_prefsrc.plen = 0;
3115 
3116 	rt->fib6_flags = cfg->fc_flags;
3117 
3118 install_route:
3119 	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3120 	    !netif_carrier_ok(dev))
3121 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3122 	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3123 	rt->fib6_nh.nh_dev = dev;
3124 	rt->fib6_table = table;
3125 
3126 	cfg->fc_nlinfo.nl_net = dev_net(dev);
3127 
3128 	if (idev)
3129 		in6_dev_put(idev);
3130 
3131 	return rt;
3132 out:
3133 	if (dev)
3134 		dev_put(dev);
3135 	if (idev)
3136 		in6_dev_put(idev);
3137 
3138 	fib6_info_release(rt);
3139 	return ERR_PTR(err);
3140 }
3141 
3142 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3143 		  struct netlink_ext_ack *extack)
3144 {
3145 	struct fib6_info *rt;
3146 	int err;
3147 
3148 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3149 	if (IS_ERR(rt))
3150 		return PTR_ERR(rt);
3151 
3152 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3153 	fib6_info_release(rt);
3154 
3155 	return err;
3156 }
3157 
3158 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3159 {
3160 	struct net *net = info->nl_net;
3161 	struct fib6_table *table;
3162 	int err;
3163 
3164 	if (rt == net->ipv6.fib6_null_entry) {
3165 		err = -ENOENT;
3166 		goto out;
3167 	}
3168 
3169 	table = rt->fib6_table;
3170 	spin_lock_bh(&table->tb6_lock);
3171 	err = fib6_del(rt, info);
3172 	spin_unlock_bh(&table->tb6_lock);
3173 
3174 out:
3175 	fib6_info_release(rt);
3176 	return err;
3177 }
3178 
3179 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3180 {
3181 	struct nl_info info = { .nl_net = net };
3182 
3183 	return __ip6_del_rt(rt, &info);
3184 }
3185 
3186 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3187 {
3188 	struct nl_info *info = &cfg->fc_nlinfo;
3189 	struct net *net = info->nl_net;
3190 	struct sk_buff *skb = NULL;
3191 	struct fib6_table *table;
3192 	int err = -ENOENT;
3193 
3194 	if (rt == net->ipv6.fib6_null_entry)
3195 		goto out_put;
3196 	table = rt->fib6_table;
3197 	spin_lock_bh(&table->tb6_lock);
3198 
3199 	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3200 		struct fib6_info *sibling, *next_sibling;
3201 
3202 		/* prefer to send a single notification with all hops */
3203 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3204 		if (skb) {
3205 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3206 
3207 			if (rt6_fill_node(net, skb, rt, NULL,
3208 					  NULL, NULL, 0, RTM_DELROUTE,
3209 					  info->portid, seq, 0) < 0) {
3210 				kfree_skb(skb);
3211 				skb = NULL;
3212 			} else
3213 				info->skip_notify = 1;
3214 		}
3215 
3216 		list_for_each_entry_safe(sibling, next_sibling,
3217 					 &rt->fib6_siblings,
3218 					 fib6_siblings) {
3219 			err = fib6_del(sibling, info);
3220 			if (err)
3221 				goto out_unlock;
3222 		}
3223 	}
3224 
3225 	err = fib6_del(rt, info);
3226 out_unlock:
3227 	spin_unlock_bh(&table->tb6_lock);
3228 out_put:
3229 	fib6_info_release(rt);
3230 
3231 	if (skb) {
3232 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3233 			    info->nlh, gfp_any());
3234 	}
3235 	return err;
3236 }
3237 
3238 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3239 {
3240 	int rc = -ESRCH;
3241 
3242 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3243 		goto out;
3244 
3245 	if (cfg->fc_flags & RTF_GATEWAY &&
3246 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3247 		goto out;
3248 	if (dst_hold_safe(&rt->dst))
3249 		rc = rt6_remove_exception_rt(rt);
3250 out:
3251 	return rc;
3252 }
3253 
3254 static int ip6_route_del(struct fib6_config *cfg,
3255 			 struct netlink_ext_ack *extack)
3256 {
3257 	struct rt6_info *rt_cache;
3258 	struct fib6_table *table;
3259 	struct fib6_info *rt;
3260 	struct fib6_node *fn;
3261 	int err = -ESRCH;
3262 
3263 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3264 	if (!table) {
3265 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3266 		return err;
3267 	}
3268 
3269 	rcu_read_lock();
3270 
3271 	fn = fib6_locate(&table->tb6_root,
3272 			 &cfg->fc_dst, cfg->fc_dst_len,
3273 			 &cfg->fc_src, cfg->fc_src_len,
3274 			 !(cfg->fc_flags & RTF_CACHE));
3275 
3276 	if (fn) {
3277 		for_each_fib6_node_rt_rcu(fn) {
3278 			if (cfg->fc_flags & RTF_CACHE) {
3279 				int rc;
3280 
3281 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3282 							      &cfg->fc_src);
3283 				if (rt_cache) {
3284 					rc = ip6_del_cached_rt(rt_cache, cfg);
3285 					if (rc != -ESRCH) {
3286 						rcu_read_unlock();
3287 						return rc;
3288 					}
3289 				}
3290 				continue;
3291 			}
3292 			if (cfg->fc_ifindex &&
3293 			    (!rt->fib6_nh.nh_dev ||
3294 			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3295 				continue;
3296 			if (cfg->fc_flags & RTF_GATEWAY &&
3297 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3298 				continue;
3299 			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3300 				continue;
3301 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3302 				continue;
3303 			fib6_info_hold(rt);
3304 			rcu_read_unlock();
3305 
3306 			/* if a gateway was specified, delete only that one hop */
3307 			if (cfg->fc_flags & RTF_GATEWAY)
3308 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3309 
3310 			return __ip6_del_rt_siblings(rt, cfg);
3311 		}
3312 	}
3313 	rcu_read_unlock();
3314 
3315 	return err;
3316 }
3317 
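/* Process a received ICMPv6 redirect (RFC 4861, section 8): validate
 * the message and its ND options, confirm and update the neighbour
 * cache entry for the new first hop, then install an RTF_CACHE
 * exception clone of the affected route pointing at the new gateway
 * and fire a NETEVENT_REDIRECT notification.
 */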
3318 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3319 {
3320 	struct netevent_redirect netevent;
3321 	struct rt6_info *rt, *nrt = NULL;
3322 	struct ndisc_options ndopts;
3323 	struct inet6_dev *in6_dev;
3324 	struct neighbour *neigh;
3325 	struct fib6_info *from;
3326 	struct rd_msg *msg;
3327 	int optlen, on_link;
3328 	u8 *lladdr;
3329 
3330 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3331 	optlen -= sizeof(*msg);
3332 
3333 	if (optlen < 0) {
3334 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3335 		return;
3336 	}
3337 
3338 	msg = (struct rd_msg *)icmp6_hdr(skb);
3339 
3340 	if (ipv6_addr_is_multicast(&msg->dest)) {
3341 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3342 		return;
3343 	}
3344 
3345 	on_link = 0;
3346 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3347 		on_link = 1;
3348 	} else if (ipv6_addr_type(&msg->target) !=
3349 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3350 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3351 		return;
3352 	}
3353 
3354 	in6_dev = __in6_dev_get(skb->dev);
3355 	if (!in6_dev)
3356 		return;
3357 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3358 		return;
3359 
3360 	/* RFC2461 8.1:
3361 	 *	The IP source address of the Redirect MUST be the same as the current
3362 	 *	first-hop router for the specified ICMP Destination Address.
3363 	 */
3364 
3365 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3366 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3367 		return;
3368 	}
3369 
3370 	lladdr = NULL;
3371 	if (ndopts.nd_opts_tgt_lladdr) {
3372 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3373 					     skb->dev);
3374 		if (!lladdr) {
3375 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3376 			return;
3377 		}
3378 	}
3379 
3380 	rt = (struct rt6_info *) dst;
3381 	if (rt->rt6i_flags & RTF_REJECT) {
3382 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3383 		return;
3384 	}
3385 
3386 	/* Redirect received -> path was valid.
3387 	 * Look, redirects are sent only in response to data packets,
3388 	 * so that this nexthop apparently is reachable. --ANK
3389 	 */
3390 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3391 
3392 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3393 	if (!neigh)
3394 		return;
3395 
3396 	/*
3397 	 *	We have finally decided to accept it.
3398 	 */
3399 
3400 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3401 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3402 		     NEIGH_UPDATE_F_OVERRIDE|
3403 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3404 				     NEIGH_UPDATE_F_ISROUTER)),
3405 		     NDISC_REDIRECT, &ndopts);
3406 
3407 	rcu_read_lock();
3408 	from = rcu_dereference(rt->from);
	/* rt->from may have been cleared if the route was unlinked
	 * concurrently; bail out instead of dereferencing a NULL pointer.
	 */
	if (!from) {
		rcu_read_unlock();
		goto out;
	}
3409 	fib6_info_hold(from);
3410 	rcu_read_unlock();
3411 
3412 	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3413 	if (!nrt)
3414 		goto out;
3415 
3416 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3417 	if (on_link)
3418 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3419 
3420 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3421 
3422 	/* No need to remove rt from the exception table if rt is
3423 	 * a cached route because rt6_insert_exception() will
3424 	 * take care of it.
3425 	 */
3426 	if (rt6_insert_exception(nrt, from)) {
3427 		dst_release_immediate(&nrt->dst);
3428 		goto out;
3429 	}
3430 
3431 	netevent.old = &rt->dst;
3432 	netevent.new = &nrt->dst;
3433 	netevent.daddr = &msg->dest;
3434 	netevent.neigh = neigh;
3435 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3436 
3437 out:
3438 	fib6_info_release(from);
3439 	neigh_release(neigh);
3440 }
3441 
3442 #ifdef CONFIG_IPV6_ROUTE_INFO
3443 static struct fib6_info *rt6_get_route_info(struct net *net,
3444 					   const struct in6_addr *prefix, int prefixlen,
3445 					   const struct in6_addr *gwaddr,
3446 					   struct net_device *dev)
3447 {
3448 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3449 	int ifindex = dev->ifindex;
3450 	struct fib6_node *fn;
3451 	struct fib6_info *rt = NULL;
3452 	struct fib6_table *table;
3453 
3454 	table = fib6_get_table(net, tb_id);
3455 	if (!table)
3456 		return NULL;
3457 
3458 	rcu_read_lock();
3459 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3460 	if (!fn)
3461 		goto out;
3462 
3463 	for_each_fib6_node_rt_rcu(fn) {
3464 		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3465 			continue;
3466 		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3467 			continue;
3468 		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3469 			continue;
3470 		fib6_info_hold(rt);
3471 		break;
3472 	}
3473 out:
3474 	rcu_read_unlock();
3475 	return rt;
3476 }
3477 
3478 static struct fib6_info *rt6_add_route_info(struct net *net,
3479 					   const struct in6_addr *prefix, int prefixlen,
3480 					   const struct in6_addr *gwaddr,
3481 					   struct net_device *dev,
3482 					   unsigned int pref)
3483 {
3484 	struct fib6_config cfg = {
3485 		.fc_metric	= IP6_RT_PRIO_USER,
3486 		.fc_ifindex	= dev->ifindex,
3487 		.fc_dst_len	= prefixlen,
3488 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3489 				  RTF_UP | RTF_PREF(pref),
3490 		.fc_protocol = RTPROT_RA,
3491 		.fc_type = RTN_UNICAST,
3492 		.fc_nlinfo.portid = 0,
3493 		.fc_nlinfo.nlh = NULL,
3494 		.fc_nlinfo.nl_net = net,
3495 	};
3496 
3497 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3498 	cfg.fc_dst = *prefix;
3499 	cfg.fc_gateway = *gwaddr;
3500 
3501 	/* We should treat it as a default route if prefix length is 0. */
3502 	if (!prefixlen)
3503 		cfg.fc_flags |= RTF_DEFAULT;
3504 
3505 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3506 
3507 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3508 }
3509 #endif
3510 
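/* Look up a default router entry learned from a Router Advertisement
 * (RTF_ADDRCONF | RTF_DEFAULT) with the given gateway address and
 * device. A reference is taken on the returned entry.
 */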
3511 struct fib6_info *rt6_get_dflt_router(struct net *net,
3512 				     const struct in6_addr *addr,
3513 				     struct net_device *dev)
3514 {
3515 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3516 	struct fib6_info *rt;
3517 	struct fib6_table *table;
3518 
3519 	table = fib6_get_table(net, tb_id);
3520 	if (!table)
3521 		return NULL;
3522 
3523 	rcu_read_lock();
3524 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3525 		if (dev == rt->fib6_nh.nh_dev &&
3526 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3527 		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3528 			break;
3529 	}
3530 	if (rt)
3531 		fib6_info_hold(rt);
3532 	rcu_read_unlock();
3533 	return rt;
3534 }
3535 
3536 struct fib6_info *rt6_add_dflt_router(struct net *net,
3537 				     const struct in6_addr *gwaddr,
3538 				     struct net_device *dev,
3539 				     unsigned int pref)
3540 {
3541 	struct fib6_config cfg = {
3542 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3543 		.fc_metric	= IP6_RT_PRIO_USER,
3544 		.fc_ifindex	= dev->ifindex,
3545 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3546 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3547 		.fc_protocol = RTPROT_RA,
3548 		.fc_type = RTN_UNICAST,
3549 		.fc_nlinfo.portid = 0,
3550 		.fc_nlinfo.nlh = NULL,
3551 		.fc_nlinfo.nl_net = net,
3552 	};
3553 
3554 	cfg.fc_gateway = *gwaddr;
3555 
3556 	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3557 		struct fib6_table *table;
3558 
3559 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3560 		if (table)
3561 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3562 	}
3563 
3564 	return rt6_get_dflt_router(net, gwaddr, dev);
3565 }
3566 
3567 static void __rt6_purge_dflt_routers(struct net *net,
3568 				     struct fib6_table *table)
3569 {
3570 	struct fib6_info *rt;
3571 
3572 restart:
3573 	rcu_read_lock();
3574 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3575 		struct net_device *dev = fib6_info_nh_dev(rt);
3576 		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3577 
3578 		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3579 		    (!idev || idev->cnf.accept_ra != 2)) {
3580 			fib6_info_hold(rt);
3581 			rcu_read_unlock();
3582 			ip6_del_rt(net, rt);
3583 			goto restart;
3584 		}
3585 	}
3586 	rcu_read_unlock();
3587 
3588 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3589 }
3590 
3591 void rt6_purge_dflt_routers(struct net *net)
3592 {
3593 	struct fib6_table *table;
3594 	struct hlist_head *head;
3595 	unsigned int h;
3596 
3597 	rcu_read_lock();
3598 
3599 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3600 		head = &net->ipv6.fib_table_hash[h];
3601 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3602 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3603 				__rt6_purge_dflt_routers(net, table);
3604 		}
3605 	}
3606 
3607 	rcu_read_unlock();
3608 }
3609 
3610 static void rtmsg_to_fib6_config(struct net *net,
3611 				 struct in6_rtmsg *rtmsg,
3612 				 struct fib6_config *cfg)
3613 {
3614 	memset(cfg, 0, sizeof(*cfg));
3615 
3616 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3617 			 : RT6_TABLE_MAIN;
3618 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3619 	cfg->fc_metric = rtmsg->rtmsg_metric;
3620 	cfg->fc_expires = rtmsg->rtmsg_info;
3621 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3622 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
3623 	cfg->fc_flags = rtmsg->rtmsg_flags;
3624 	cfg->fc_type = rtmsg->rtmsg_type;
3625 
3626 	cfg->fc_nlinfo.nl_net = net;
3627 
3628 	cfg->fc_dst = rtmsg->rtmsg_dst;
3629 	cfg->fc_src = rtmsg->rtmsg_src;
3630 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
3631 }
3632 
3633 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3634 {
3635 	struct fib6_config cfg;
3636 	struct in6_rtmsg rtmsg;
3637 	int err;
3638 
3639 	switch (cmd) {
3640 	case SIOCADDRT:		/* Add a route */
3641 	case SIOCDELRT:		/* Delete a route */
3642 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3643 			return -EPERM;
3644 		err = copy_from_user(&rtmsg, arg,
3645 				     sizeof(struct in6_rtmsg));
3646 		if (err)
3647 			return -EFAULT;
3648 
3649 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3650 
3651 		rtnl_lock();
3652 		switch (cmd) {
3653 		case SIOCADDRT:
3654 			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3655 			break;
3656 		case SIOCDELRT:
3657 			err = ip6_route_del(&cfg, NULL);
3658 			break;
3659 		default:
3660 			err = -EINVAL;
3661 		}
3662 		rtnl_unlock();
3663 
3664 		return err;
3665 	}
3666 
3667 	return -EINVAL;
3668 }
3669 
3670 /*
3671  *	Drop the packet on the floor
3672  */
3673 
3674 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3675 {
3676 	int type;
3677 	struct dst_entry *dst = skb_dst(skb);
3678 	switch (ipstats_mib_noroutes) {
3679 	case IPSTATS_MIB_INNOROUTES:
3680 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3681 		if (type == IPV6_ADDR_ANY) {
3682 			IP6_INC_STATS(dev_net(dst->dev),
3683 				      __in6_dev_get_safely(skb->dev),
3684 				      IPSTATS_MIB_INADDRERRORS);
3685 			break;
3686 		}
3687 		/* FALLTHROUGH */
3688 	case IPSTATS_MIB_OUTNOROUTES:
3689 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3690 			      ipstats_mib_noroutes);
3691 		break;
3692 	}
3693 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3694 	kfree_skb(skb);
3695 	return 0;
3696 }
3697 
3698 static int ip6_pkt_discard(struct sk_buff *skb)
3699 {
3700 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3701 }
3702 
3703 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3704 {
3705 	skb->dev = skb_dst(skb)->dev;
3706 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3707 }
3708 
3709 static int ip6_pkt_prohibit(struct sk_buff *skb)
3710 {
3711 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3712 }
3713 
3714 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3715 {
3716 	skb->dev = skb_dst(skb)->dev;
3717 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3718 }
3719 
3720 /*
3721  *	Allocate a dst for local (unicast / anycast) address.
3722  */
3723 
3724 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3725 				     struct inet6_dev *idev,
3726 				     const struct in6_addr *addr,
3727 				     bool anycast, gfp_t gfp_flags)
3728 {
3729 	u32 tb_id;
3730 	struct net_device *dev = idev->dev;
3731 	struct fib6_info *f6i;
3732 
3733 	f6i = fib6_info_alloc(gfp_flags);
3734 	if (!f6i)
3735 		return ERR_PTR(-ENOMEM);
3736 
3737 	f6i->dst_nocount = true;
3738 	f6i->dst_host = true;
3739 	f6i->fib6_protocol = RTPROT_KERNEL;
3740 	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3741 	if (anycast) {
3742 		f6i->fib6_type = RTN_ANYCAST;
3743 		f6i->fib6_flags |= RTF_ANYCAST;
3744 	} else {
3745 		f6i->fib6_type = RTN_LOCAL;
3746 		f6i->fib6_flags |= RTF_LOCAL;
3747 	}
3748 
3749 	f6i->fib6_nh.nh_gw = *addr;
3750 	dev_hold(dev);
3751 	f6i->fib6_nh.nh_dev = dev;
3752 	f6i->fib6_dst.addr = *addr;
3753 	f6i->fib6_dst.plen = 128;
3754 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3755 	f6i->fib6_table = fib6_get_table(net, tb_id);
3756 
3757 	return f6i;
3758 }
3759 
3760 /* remove deleted ip from prefsrc entries */
3761 struct arg_dev_net_ip {
3762 	struct net_device *dev;
3763 	struct net *net;
3764 	struct in6_addr *addr;
3765 };
3766 
3767 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3768 {
3769 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3770 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3771 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3772 
3773 	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3774 	    rt != net->ipv6.fib6_null_entry &&
3775 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3776 		spin_lock_bh(&rt6_exception_lock);
3777 		/* remove prefsrc entry */
3778 		rt->fib6_prefsrc.plen = 0;
3779 		/* need to update cache as well */
3780 		rt6_exceptions_remove_prefsrc(rt);
3781 		spin_unlock_bh(&rt6_exception_lock);
3782 	}
3783 	return 0;
3784 }
3785 
3786 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3787 {
3788 	struct net *net = dev_net(ifp->idev->dev);
3789 	struct arg_dev_net_ip adni = {
3790 		.dev = ifp->idev->dev,
3791 		.net = net,
3792 		.addr = &ifp->addr,
3793 	};
3794 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3795 }
3796 
3797 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3798 
3799 /* Remove routers and update dst entries when a gateway turns into a host. */
3800 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3801 {
3802 	struct in6_addr *gateway = (struct in6_addr *)arg;
3803 
3804 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3805 	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3806 		return -1;
3807 	}
3808 
3809 	/* Further clean up cached routes in the exception table.
3810 	 * This is needed because a cached route may have a different
3811 	 * gateway than its 'parent' in the case of an ip redirect.
3812 	 */
3813 	rt6_exceptions_clean_tohost(rt, gateway);
3814 
3815 	return 0;
3816 }
3817 
3818 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3819 {
3820 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3821 }
3822 
3823 struct arg_netdev_event {
3824 	const struct net_device *dev;
3825 	union {
3826 		unsigned int nh_flags;
3827 		unsigned long event;
3828 	};
3829 };
3830 
3831 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3832 {
3833 	struct fib6_info *iter;
3834 	struct fib6_node *fn;
3835 
3836 	fn = rcu_dereference_protected(rt->fib6_node,
3837 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3838 	iter = rcu_dereference_protected(fn->leaf,
3839 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3840 	while (iter) {
3841 		if (iter->fib6_metric == rt->fib6_metric &&
3842 		    rt6_qualify_for_ecmp(iter))
3843 			return iter;
3844 		iter = rcu_dereference_protected(iter->fib6_next,
3845 				lockdep_is_held(&rt->fib6_table->tb6_lock));
3846 	}
3847 
3848 	return NULL;
3849 }
3850 
3851 static bool rt6_is_dead(const struct fib6_info *rt)
3852 {
3853 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3854 	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3855 	     fib6_ignore_linkdown(rt)))
3856 		return true;
3857 
3858 	return false;
3859 }
3860 
3861 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3862 {
3863 	struct fib6_info *iter;
3864 	int total = 0;
3865 
3866 	if (!rt6_is_dead(rt))
3867 		total += rt->fib6_nh.nh_weight;
3868 
3869 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3870 		if (!rt6_is_dead(iter))
3871 			total += iter->fib6_nh.nh_weight;
3872 	}
3873 
3874 	return total;
3875 }
3876 
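/* Hash-threshold multipath (RFC 2992): each live nexthop is assigned
 * an upper bound equal to its cumulative weight scaled into the 31-bit
 * hash space; a flow is matched to the first nexthop whose bound is
 * not below the flow's hash. Dead nexthops get a bound of -1 so they
 * are never selected.
 */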
3877 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3878 {
3879 	int upper_bound = -1;
3880 
3881 	if (!rt6_is_dead(rt)) {
3882 		*weight += rt->fib6_nh.nh_weight;
3883 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3884 						    total) - 1;
3885 	}
3886 	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3887 }
3888 
3889 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3890 {
3891 	struct fib6_info *iter;
3892 	int weight = 0;
3893 
3894 	rt6_upper_bound_set(rt, &weight, total);
3895 
3896 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3897 		rt6_upper_bound_set(iter, &weight, total);
3898 }
3899 
3900 void rt6_multipath_rebalance(struct fib6_info *rt)
3901 {
3902 	struct fib6_info *first;
3903 	int total;
3904 
3905 	/* If the entire multipath route was marked for flushing, there
3906 	 * is no need to rebalance upon the removal of each individual
3907 	 * sibling route.
3908 	 */
3909 	if (!rt->fib6_nsiblings || rt->should_flush)
3910 		return;
3911 
3912 	/* During lookup, routes are evaluated in order, so we need to
3913 	 * make sure upper bounds are assigned from the first sibling
3914 	 * onwards.
3915 	 */
3916 	first = rt6_multipath_first_sibling(rt);
3917 	if (WARN_ON_ONCE(!first))
3918 		return;
3919 
3920 	total = rt6_multipath_total_weight(first);
3921 	rt6_multipath_upper_bound_set(first, total);
3922 }
3923 
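/*
 * Editorial sketch (not part of route.c): the upper-bound arithmetic used by
 * rt6_upper_bound_set() above, reproduced as a stand-alone user-space
 * program. The weights 1/2/1 and the total of 4 are made-up placeholders.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int weights[] = { 1, 2, 1 };	/* hypothetical nexthop weights */
	int total = 4;			/* what rt6_multipath_total_weight() returns */
	int64_t cum = 0;

	for (int i = 0; i < 3; i++) {
		cum += weights[i];
		/* mirrors DIV_ROUND_CLOSEST_ULL((u64)cum << 31, total) - 1 */
		int64_t bound = ((cum << 31) + total / 2) / total - 1;

		printf("nexthop %d: upper_bound = 0x%08llx\n",
		       i, (long long)bound);
	}
	/* Prints 0x1fffffff, 0x5fffffff, 0x7fffffff: a flow hash in
	 * [0, 2^31) selects the first live nexthop whose bound is >= the
	 * hash, giving the nexthops 1/4, 2/4 and 1/4 of the hash space.
	 * Dead nexthops keep -1 and can never match.
	 */
	return 0;
}
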
3924 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3925 {
3926 	const struct arg_netdev_event *arg = p_arg;
3927 	struct net *net = dev_net(arg->dev);
3928 
3929 	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3930 		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3931 		fib6_update_sernum_upto_root(net, rt);
3932 		rt6_multipath_rebalance(rt);
3933 	}
3934 
3935 	return 0;
3936 }
3937 
3938 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3939 {
3940 	struct arg_netdev_event arg = {
3941 		.dev = dev,
3942 		{
3943 			.nh_flags = nh_flags,
3944 		},
3945 	};
3946 
3947 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3948 		arg.nh_flags |= RTNH_F_LINKDOWN;
3949 
3950 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3951 }
3952 
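/*
 * Editorial usage sketch (hedged; the real call sites live outside this
 * file, in the netdev/addrconf notifier path): a handler reacting to
 * device events could clear the nexthop flags along these lines:
 *
 *	case NETDEV_UP:
 *		rt6_sync_up(dev, RTNH_F_DEAD);
 *		break;
 *	case NETDEV_CHANGE:
 *		if (netif_carrier_ok(dev))
 *			rt6_sync_up(dev, RTNH_F_LINKDOWN);
 *		break;
 *
 * fib6_ifup() then strips the given flags from every nexthop on that
 * device and rebalances any multipath route the nexthop belongs to.
 */
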
3953 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3954 				   const struct net_device *dev)
3955 {
3956 	struct fib6_info *iter;
3957 
3958 	if (rt->fib6_nh.nh_dev == dev)
3959 		return true;
3960 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3961 		if (iter->fib6_nh.nh_dev == dev)
3962 			return true;
3963 
3964 	return false;
3965 }
3966 
3967 static void rt6_multipath_flush(struct fib6_info *rt)
3968 {
3969 	struct fib6_info *iter;
3970 
3971 	rt->should_flush = 1;
3972 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3973 		iter->should_flush = 1;
3974 }
3975 
3976 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3977 					     const struct net_device *down_dev)
3978 {
3979 	struct fib6_info *iter;
3980 	unsigned int dead = 0;
3981 
3982 	if (rt->fib6_nh.nh_dev == down_dev ||
3983 	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3984 		dead++;
3985 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3986 		if (iter->fib6_nh.nh_dev == down_dev ||
3987 		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3988 			dead++;
3989 
3990 	return dead;
3991 }
3992 
3993 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3994 				       const struct net_device *dev,
3995 				       unsigned int nh_flags)
3996 {
3997 	struct fib6_info *iter;
3998 
3999 	if (rt->fib6_nh.nh_dev == dev)
4000 		rt->fib6_nh.nh_flags |= nh_flags;
4001 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4002 		if (iter->fib6_nh.nh_dev == dev)
4003 			iter->fib6_nh.nh_flags |= nh_flags;
4004 }
4005 
4006 /* called with the write lock held for the table containing rt */
4007 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4008 {
4009 	const struct arg_netdev_event *arg = p_arg;
4010 	const struct net_device *dev = arg->dev;
4011 	struct net *net = dev_net(dev);
4012 
4013 	if (rt == net->ipv6.fib6_null_entry)
4014 		return 0;
4015 
4016 	switch (arg->event) {
4017 	case NETDEV_UNREGISTER:
4018 		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4019 	case NETDEV_DOWN:
4020 		if (rt->should_flush)
4021 			return -1;
4022 		if (!rt->fib6_nsiblings)
4023 			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4024 		if (rt6_multipath_uses_dev(rt, dev)) {
4025 			unsigned int count;
4026 
4027 			count = rt6_multipath_dead_count(rt, dev);
4028 			if (rt->fib6_nsiblings + 1 == count) {
4029 				rt6_multipath_flush(rt);
4030 				return -1;
4031 			}
4032 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4033 						   RTNH_F_LINKDOWN);
4034 			fib6_update_sernum(net, rt);
4035 			rt6_multipath_rebalance(rt);
4036 		}
4037 		return -2;
4038 	case NETDEV_CHANGE:
4039 		if (rt->fib6_nh.nh_dev != dev ||
4040 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4041 			break;
4042 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4043 		rt6_multipath_rebalance(rt);
4044 		break;
4045 	}
4046 
4047 	return 0;
4048 }
4049 
4050 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4051 {
4052 	struct arg_netdev_event arg = {
4053 		.dev = dev,
4054 		{
4055 			.event = event,
4056 		},
4057 	};
4058 
4059 	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4060 }
4061 
4062 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4063 {
4064 	rt6_sync_down_dev(dev, event);
4065 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
4066 	neigh_ifdown(&nd_tbl, dev);
4067 }
4068 
4069 struct rt6_mtu_change_arg {
4070 	struct net_device *dev;
4071 	unsigned int mtu;
4072 };
4073 
4074 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4075 {
4076 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4077 	struct inet6_dev *idev;
4078 
4079 	/* In IPv6, PMTU discovery is not optional,
4080 	   so the RTAX_MTU lock cannot disable it.
4081 	   We still use this lock to block changes
4082 	   caused by addrconf/ndisc.
4083 	*/
4084 
4085 	idev = __in6_dev_get(arg->dev);
4086 	if (!idev)
4087 		return 0;
4088 
4089 	/* For an administrative MTU increase there is no way to discover
4090 	   an IPv6 PMTU increase, so the PMTU has to be updated here.
4091 	   Since RFC 1981 doesn't cover administrative MTU increases,
4092 	   updating the PMTU on increase is a MUST (e.g. for jumbo frames).
4093 	 */
4094 	if (rt->fib6_nh.nh_dev == arg->dev &&
4095 	    !fib6_metric_locked(rt, RTAX_MTU)) {
4096 		u32 mtu = rt->fib6_pmtu;
4097 
4098 		if (mtu >= arg->mtu ||
4099 		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4100 			fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4101 
4102 		spin_lock_bh(&rt6_exception_lock);
4103 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4104 		spin_unlock_bh(&rt6_exception_lock);
4105 	}
4106 	return 0;
4107 }
4108 
4109 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4110 {
4111 	struct rt6_mtu_change_arg arg = {
4112 		.dev = dev,
4113 		.mtu = mtu,
4114 	};
4115 
4116 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4117 }
4118 
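/*
 * Editorial usage sketch (hedged; the actual caller sits in the address
 * configuration code, not in this file): on an administrative MTU change
 * a NETDEV_CHANGEMTU handler would simply do
 *
 *	rt6_mtu_change(dev, dev->mtu);
 *
 * which walks the whole FIB once and lets rt6_mtu_change_route() patch
 * RTAX_MTU on the affected, unlocked entries.
 */
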
4119 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4120 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4121 	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
4122 	[RTA_OIF]               = { .type = NLA_U32 },
4123 	[RTA_IIF]		= { .type = NLA_U32 },
4124 	[RTA_PRIORITY]          = { .type = NLA_U32 },
4125 	[RTA_METRICS]           = { .type = NLA_NESTED },
4126 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
4127 	[RTA_PREF]              = { .type = NLA_U8 },
4128 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
4129 	[RTA_ENCAP]		= { .type = NLA_NESTED },
4130 	[RTA_EXPIRES]		= { .type = NLA_U32 },
4131 	[RTA_UID]		= { .type = NLA_U32 },
4132 	[RTA_MARK]		= { .type = NLA_U32 },
4133 	[RTA_TABLE]		= { .type = NLA_U32 },
4134 };
4135 
4136 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4137 			      struct fib6_config *cfg,
4138 			      struct netlink_ext_ack *extack)
4139 {
4140 	struct rtmsg *rtm;
4141 	struct nlattr *tb[RTA_MAX+1];
4142 	unsigned int pref;
4143 	int err;
4144 
4145 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4146 			  NULL);
4147 	if (err < 0)
4148 		goto errout;
4149 
4150 	err = -EINVAL;
4151 	rtm = nlmsg_data(nlh);
4152 	memset(cfg, 0, sizeof(*cfg));
4153 
4154 	cfg->fc_table = rtm->rtm_table;
4155 	cfg->fc_dst_len = rtm->rtm_dst_len;
4156 	cfg->fc_src_len = rtm->rtm_src_len;
4157 	cfg->fc_flags = RTF_UP;
4158 	cfg->fc_protocol = rtm->rtm_protocol;
4159 	cfg->fc_type = rtm->rtm_type;
4160 
4161 	if (rtm->rtm_type == RTN_UNREACHABLE ||
4162 	    rtm->rtm_type == RTN_BLACKHOLE ||
4163 	    rtm->rtm_type == RTN_PROHIBIT ||
4164 	    rtm->rtm_type == RTN_THROW)
4165 		cfg->fc_flags |= RTF_REJECT;
4166 
4167 	if (rtm->rtm_type == RTN_LOCAL)
4168 		cfg->fc_flags |= RTF_LOCAL;
4169 
4170 	if (rtm->rtm_flags & RTM_F_CLONED)
4171 		cfg->fc_flags |= RTF_CACHE;
4172 
4173 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4174 
4175 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4176 	cfg->fc_nlinfo.nlh = nlh;
4177 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4178 
4179 	if (tb[RTA_GATEWAY]) {
4180 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4181 		cfg->fc_flags |= RTF_GATEWAY;
4182 	}
4183 
4184 	if (tb[RTA_DST]) {
4185 		int plen = (rtm->rtm_dst_len + 7) >> 3;
4186 
4187 		if (nla_len(tb[RTA_DST]) < plen)
4188 			goto errout;
4189 
4190 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4191 	}
4192 
4193 	if (tb[RTA_SRC]) {
4194 		int plen = (rtm->rtm_src_len + 7) >> 3;
4195 
4196 		if (nla_len(tb[RTA_SRC]) < plen)
4197 			goto errout;
4198 
4199 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4200 	}
4201 
4202 	if (tb[RTA_PREFSRC])
4203 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4204 
4205 	if (tb[RTA_OIF])
4206 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4207 
4208 	if (tb[RTA_PRIORITY])
4209 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4210 
4211 	if (tb[RTA_METRICS]) {
4212 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4213 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4214 	}
4215 
4216 	if (tb[RTA_TABLE])
4217 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4218 
4219 	if (tb[RTA_MULTIPATH]) {
4220 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4221 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4222 
4223 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4224 						     cfg->fc_mp_len, extack);
4225 		if (err < 0)
4226 			goto errout;
4227 	}
4228 
4229 	if (tb[RTA_PREF]) {
4230 		pref = nla_get_u8(tb[RTA_PREF]);
4231 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
4232 		    pref != ICMPV6_ROUTER_PREF_HIGH)
4233 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
4234 		cfg->fc_flags |= RTF_PREF(pref);
4235 	}
4236 
4237 	if (tb[RTA_ENCAP])
4238 		cfg->fc_encap = tb[RTA_ENCAP];
4239 
4240 	if (tb[RTA_ENCAP_TYPE]) {
4241 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4242 
4243 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4244 		if (err < 0)
4245 			goto errout;
4246 	}
4247 
4248 	if (tb[RTA_EXPIRES]) {
4249 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4250 
4251 		if (addrconf_finite_timeout(timeout)) {
4252 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4253 			cfg->fc_flags |= RTF_EXPIRES;
4254 		}
4255 	}
4256 
4257 	err = 0;
4258 errout:
4259 	return err;
4260 }
4261 
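/*
 * Editorial example (not part of route.c): a minimal user-space sketch of
 * the RTM_NEWROUTE request that rtm_to_fib6_config() parses above. The
 * prefix, gateway and ifindex are made-up placeholders; a real caller needs
 * CAP_NET_ADMIN and would typically also set NLM_F_ACK and read the reply.
 */
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

/* append one attribute behind the current end of the message */
static void add_attr(struct nlmsghdr *nlh, unsigned short type,
		     const void *data, unsigned short len)
{
	struct rtattr *rta = (struct rtattr *)((char *)nlh +
					       NLMSG_ALIGN(nlh->nlmsg_len));

	rta->rta_type = type;
	rta->rta_len = RTA_LENGTH(len);
	memcpy(RTA_DATA(rta), data, len);
	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		char attrs[128];
	} req;
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	struct in6_addr dst, gw;
	unsigned int oif = 2;			/* hypothetical ifindex */
	int fd;

	memset(&req, 0, sizeof(req));
	inet_pton(AF_INET6, "2001:db8::", &dst);	/* documentation prefix */
	inet_pton(AF_INET6, "fe80::1", &gw);

	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_NEWROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;

	req.rtm.rtm_family = AF_INET6;
	req.rtm.rtm_dst_len = 64;		/* parsed into cfg->fc_dst_len */
	req.rtm.rtm_table = RT_TABLE_MAIN;
	req.rtm.rtm_protocol = RTPROT_STATIC;
	req.rtm.rtm_scope = RT_SCOPE_UNIVERSE;
	req.rtm.rtm_type = RTN_UNICAST;

	add_attr(&req.nlh, RTA_DST, &dst, sizeof(dst));	  /* -> cfg->fc_dst */
	add_attr(&req.nlh, RTA_GATEWAY, &gw, sizeof(gw)); /* sets RTF_GATEWAY */
	add_attr(&req.nlh, RTA_OIF, &oif, sizeof(oif));	  /* -> cfg->fc_ifindex */

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&sa, sizeof(sa)) < 0)
		perror("sendto");
	close(fd);
	return 0;
}
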
4262 struct rt6_nh {
4263 	struct fib6_info *fib6_info;
4264 	struct fib6_config r_cfg;
4265 	struct list_head next;
4266 };
4267 
4268 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4269 {
4270 	struct rt6_nh *nh;
4271 
4272 	list_for_each_entry(nh, rt6_nh_list, next) {
4273 		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4274 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4275 		        nh->r_cfg.fc_ifindex);
4276 	}
4277 }
4278 
4279 static int ip6_route_info_append(struct net *net,
4280 				 struct list_head *rt6_nh_list,
4281 				 struct fib6_info *rt,
4282 				 struct fib6_config *r_cfg)
4283 {
4284 	struct rt6_nh *nh;
4285 	int err = -EEXIST;
4286 
4287 	list_for_each_entry(nh, rt6_nh_list, next) {
4288 		/* check if fib6_info already exists */
4289 		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4290 			return err;
4291 	}
4292 
4293 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4294 	if (!nh)
4295 		return -ENOMEM;
4296 	nh->fib6_info = rt;
4297 	err = ip6_convert_metrics(net, rt, r_cfg);
4298 	if (err) {
4299 		kfree(nh);
4300 		return err;
4301 	}
4302 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4303 	list_add_tail(&nh->next, rt6_nh_list);
4304 
4305 	return 0;
4306 }
4307 
4308 static void ip6_route_mpath_notify(struct fib6_info *rt,
4309 				   struct fib6_info *rt_last,
4310 				   struct nl_info *info,
4311 				   __u16 nlflags)
4312 {
4313 	/* If this is an APPEND route, then rt points to the first route
4314 	 * inserted and rt_last points to the last route inserted. Userspace
4315 	 * wants a consistent dump of the route which starts at the first
4316 	 * nexthop. Since sibling routes are always added at the end of
4317 	 * the list, find the first sibling of the last route appended.
4318 	 */
4319 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4320 		rt = list_first_entry(&rt_last->fib6_siblings,
4321 				      struct fib6_info,
4322 				      fib6_siblings);
4323 	}
4324 
4325 	if (rt)
4326 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4327 }
4328 
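/*
 * Editorial note on the wire format parsed below: the RTA_MULTIPATH
 * payload is a flat sequence of variable-length records,
 *
 *	struct rtnexthop  (rtnh_len, rtnh_flags, rtnh_hops, rtnh_ifindex)
 *	  per-nexthop attrs: RTA_GATEWAY, RTA_ENCAP, RTA_ENCAP_TYPE
 *	struct rtnexthop  ...
 *
 * where rtnh_len covers the header plus its trailing attributes; that is
 * what lets rtnh_ok()/rtnh_next() walk the records, and rtnh_hops carries
 * the nexthop weight minus one (see the nh_weight assignment below).
 */
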
4329 static int ip6_route_multipath_add(struct fib6_config *cfg,
4330 				   struct netlink_ext_ack *extack)
4331 {
4332 	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4333 	struct nl_info *info = &cfg->fc_nlinfo;
4334 	struct fib6_config r_cfg;
4335 	struct rtnexthop *rtnh;
4336 	struct fib6_info *rt;
4337 	struct rt6_nh *err_nh;
4338 	struct rt6_nh *nh, *nh_safe;
4339 	__u16 nlflags;
4340 	int remaining;
4341 	int attrlen;
4342 	int err = 1;
4343 	int nhn = 0;
4344 	int replace = (cfg->fc_nlinfo.nlh &&
4345 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4346 	LIST_HEAD(rt6_nh_list);
4347 
4348 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4349 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4350 		nlflags |= NLM_F_APPEND;
4351 
4352 	remaining = cfg->fc_mp_len;
4353 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4354 
4355 	/* Parse the multipath entries and build a list (rt6_nh_list) of
4356 	 * fib6_info structs, one per nexthop.
4357 	 */
4358 	while (rtnh_ok(rtnh, remaining)) {
4359 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4360 		if (rtnh->rtnh_ifindex)
4361 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4362 
4363 		attrlen = rtnh_attrlen(rtnh);
4364 		if (attrlen > 0) {
4365 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4366 
4367 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4368 			if (nla) {
4369 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4370 				r_cfg.fc_flags |= RTF_GATEWAY;
4371 			}
4372 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4373 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4374 			if (nla)
4375 				r_cfg.fc_encap_type = nla_get_u16(nla);
4376 		}
4377 
4378 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4379 		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4380 		if (IS_ERR(rt)) {
4381 			err = PTR_ERR(rt);
4382 			rt = NULL;
4383 			goto cleanup;
4384 		}
4385 
4386 		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4387 
4388 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4389 					    rt, &r_cfg);
4390 		if (err) {
4391 			fib6_info_release(rt);
4392 			goto cleanup;
4393 		}
4394 
4395 		rtnh = rtnh_next(rtnh, &remaining);
4396 	}
4397 
4398 	/* For add and replace, send one notification with all nexthops.
4399 	 * Skip the notification in fib6_add_rt2node and send one with
4400 	 * the full route when done.
4401 	 */
4402 	info->skip_notify = 1;
4403 
4404 	err_nh = NULL;
4405 	list_for_each_entry(nh, &rt6_nh_list, next) {
4406 		rt_last = nh->fib6_info;
4407 		err = __ip6_ins_rt(nh->fib6_info, info, extack);
4408 		fib6_info_release(nh->fib6_info);
4409 
4410 		/* save reference to first route for notification */
4411 		if (!rt_notif && !err)
4412 			rt_notif = nh->fib6_info;
4413 
4414 		/* nh->fib6_info is used or freed at this point, reset to NULL */
4415 		nh->fib6_info = NULL;
4416 		if (err) {
4417 			if (replace && nhn)
4418 				ip6_print_replace_route_err(&rt6_nh_list);
4419 			err_nh = nh;
4420 			goto add_errout;
4421 		}
4422 
4423 		/* Because each route is added as if it were a single route, we
4424 		 * remove these flags after the first nexthop: if there is a
4425 		 * collision, we have already failed to add the first nexthop
4426 		 * (fib6_add_rt2node() has rejected it); when replacing, the old
4427 		 * nexthops have been replaced by the first new one, and the
4428 		 * rest should be appended to it.
4429 		 */
4430 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4431 						     NLM_F_REPLACE);
4432 		nhn++;
4433 	}
4434 
4435 	/* success ... tell user about new route */
4436 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4437 	goto cleanup;
4438 
4439 add_errout:
4440 	/* Send a notification for the routes that were added, so that
4441 	 * the delete notifications sent by ip6_route_del are
4442 	 * coherent.
4443 	 */
4444 	if (rt_notif)
4445 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4446 
4447 	/* Delete routes that were already added */
4448 	list_for_each_entry(nh, &rt6_nh_list, next) {
4449 		if (err_nh == nh)
4450 			break;
4451 		ip6_route_del(&nh->r_cfg, extack);
4452 	}
4453 
4454 cleanup:
4455 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4456 		if (nh->fib6_info)
4457 			fib6_info_release(nh->fib6_info);
4458 		list_del(&nh->next);
4459 		kfree(nh);
4460 	}
4461 
4462 	return err;
4463 }
4464 
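/*
 * Editorial usage note: multipath requests of the kind handled by
 * ip6_route_multipath_add()/_del() are what e.g. iproute2 emits for
 *
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 weight 1 \
 *		nexthop via fe80::2 dev eth1 weight 2
 *
 * (prefix, gateways and device names are made-up placeholders); each
 * "nexthop" clause becomes one rtnexthop record in RTA_MULTIPATH.
 */
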
4465 static int ip6_route_multipath_del(struct fib6_config *cfg,
4466 				   struct netlink_ext_ack *extack)
4467 {
4468 	struct fib6_config r_cfg;
4469 	struct rtnexthop *rtnh;
4470 	int remaining;
4471 	int attrlen;
4472 	int err = 1, last_err = 0;
4473 
4474 	remaining = cfg->fc_mp_len;
4475 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4476 
4477 	/* Parse a Multipath Entry */
4478 	while (rtnh_ok(rtnh, remaining)) {
4479 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4480 		if (rtnh->rtnh_ifindex)
4481 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4482 
4483 		attrlen = rtnh_attrlen(rtnh);
4484 		if (attrlen > 0) {
4485 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4486 
4487 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4488 			if (nla) {
4489 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4490 				r_cfg.fc_flags |= RTF_GATEWAY;
4491 			}
4492 		}
4493 		err = ip6_route_del(&r_cfg, extack);
4494 		if (err)
4495 			last_err = err;
4496 
4497 		rtnh = rtnh_next(rtnh, &remaining);
4498 	}
4499 
4500 	return last_err;
4501 }
4502 
4503 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4504 			      struct netlink_ext_ack *extack)
4505 {
4506 	struct fib6_config cfg;
4507 	int err;
4508 
4509 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4510 	if (err < 0)
4511 		return err;
4512 
4513 	if (cfg.fc_mp) {
4514 		return ip6_route_multipath_del(&cfg, extack);
4515 	} else {
4516 		cfg.fc_delete_all_nh = 1;
4517 		return ip6_route_del(&cfg, extack);
4518 	}
4519 }
4520 
4521 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4522 			      struct netlink_ext_ack *extack)
4523 {
4524 	struct fib6_config cfg;
4525 	int err;
4526 
4527 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4528 	if (err < 0)
4529 		return err;
4530 
4531 	if (cfg.fc_mp)
4532 		return ip6_route_multipath_add(&cfg, extack);
4533 	else
4534 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
4535 }
4536 
4537 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4538 {
4539 	int nexthop_len = 0;
4540 
4541 	if (rt->fib6_nsiblings) {
4542 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4543 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4544 			    + nla_total_size(16) /* RTA_GATEWAY */
4545 			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4546 
4547 		nexthop_len *= rt->fib6_nsiblings;
4548 	}
4549 
4550 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4551 	       + nla_total_size(16) /* RTA_SRC */
4552 	       + nla_total_size(16) /* RTA_DST */
4553 	       + nla_total_size(16) /* RTA_GATEWAY */
4554 	       + nla_total_size(16) /* RTA_PREFSRC */
4555 	       + nla_total_size(4) /* RTA_TABLE */
4556 	       + nla_total_size(4) /* RTA_IIF */
4557 	       + nla_total_size(4) /* RTA_OIF */
4558 	       + nla_total_size(4) /* RTA_PRIORITY */
4559 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4560 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4561 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4562 	       + nla_total_size(1) /* RTA_PREF */
4563 	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4564 	       + nexthop_len;
4565 }
4566 
4567 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4568 			    unsigned int *flags, bool skip_oif)
4569 {
4570 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4571 		*flags |= RTNH_F_DEAD;
4572 
4573 	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4574 		*flags |= RTNH_F_LINKDOWN;
4575 
4576 		rcu_read_lock();
4577 		if (fib6_ignore_linkdown(rt))
4578 			*flags |= RTNH_F_DEAD;
4579 		rcu_read_unlock();
4580 	}
4581 
4582 	if (rt->fib6_flags & RTF_GATEWAY) {
4583 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4584 			goto nla_put_failure;
4585 	}
4586 
4587 	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4588 	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4589 		*flags |= RTNH_F_OFFLOAD;
4590 
4591 	/* not needed for multipath encoding because it has a rtnexthop struct */
4592 	if (!skip_oif && rt->fib6_nh.nh_dev &&
4593 	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4594 		goto nla_put_failure;
4595 
4596 	if (rt->fib6_nh.nh_lwtstate &&
4597 	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4598 		goto nla_put_failure;
4599 
4600 	return 0;
4601 
4602 nla_put_failure:
4603 	return -EMSGSIZE;
4604 }
4605 
4606 /* add multipath next hop */
4607 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4608 {
4609 	const struct net_device *dev = rt->fib6_nh.nh_dev;
4610 	struct rtnexthop *rtnh;
4611 	unsigned int flags = 0;
4612 
4613 	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4614 	if (!rtnh)
4615 		goto nla_put_failure;
4616 
4617 	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4618 	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4619 
4620 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4621 		goto nla_put_failure;
4622 
4623 	rtnh->rtnh_flags = flags;
4624 
4625 	/* length of rtnetlink header + attributes */
4626 	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4627 
4628 	return 0;
4629 
4630 nla_put_failure:
4631 	return -EMSGSIZE;
4632 }
4633 
4634 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4635 			 struct fib6_info *rt, struct dst_entry *dst,
4636 			 struct in6_addr *dest, struct in6_addr *src,
4637 			 int iif, int type, u32 portid, u32 seq,
4638 			 unsigned int flags)
4639 {
4640 	struct rtmsg *rtm;
4641 	struct nlmsghdr *nlh;
4642 	long expires = 0;
4643 	u32 *pmetrics;
4644 	u32 table;
4645 
4646 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4647 	if (!nlh)
4648 		return -EMSGSIZE;
4649 
4650 	rtm = nlmsg_data(nlh);
4651 	rtm->rtm_family = AF_INET6;
4652 	rtm->rtm_dst_len = rt->fib6_dst.plen;
4653 	rtm->rtm_src_len = rt->fib6_src.plen;
4654 	rtm->rtm_tos = 0;
4655 	if (rt->fib6_table)
4656 		table = rt->fib6_table->tb6_id;
4657 	else
4658 		table = RT6_TABLE_UNSPEC;
4659 	rtm->rtm_table = table;
4660 	if (nla_put_u32(skb, RTA_TABLE, table))
4661 		goto nla_put_failure;
4662 
4663 	rtm->rtm_type = rt->fib6_type;
4664 	rtm->rtm_flags = 0;
4665 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4666 	rtm->rtm_protocol = rt->fib6_protocol;
4667 
4668 	if (rt->fib6_flags & RTF_CACHE)
4669 		rtm->rtm_flags |= RTM_F_CLONED;
4670 
4671 	if (dest) {
4672 		if (nla_put_in6_addr(skb, RTA_DST, dest))
4673 			goto nla_put_failure;
4674 		rtm->rtm_dst_len = 128;
4675 	} else if (rtm->rtm_dst_len)
4676 		if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
4677 			goto nla_put_failure;
4678 #ifdef CONFIG_IPV6_SUBTREES
4679 	if (src) {
4680 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4681 			goto nla_put_failure;
4682 		rtm->rtm_src_len = 128;
4683 	} else if (rtm->rtm_src_len &&
4684 		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
4685 		goto nla_put_failure;
4686 #endif
4687 	if (iif) {
4688 #ifdef CONFIG_IPV6_MROUTE
4689 		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
4690 			int err = ip6mr_get_route(net, skb, rtm, portid);
4691 
4692 			if (err == 0)
4693 				return 0;
4694 			if (err < 0)
4695 				goto nla_put_failure;
4696 		} else
4697 #endif
4698 			if (nla_put_u32(skb, RTA_IIF, iif))
4699 				goto nla_put_failure;
4700 	} else if (dest) {
4701 		struct in6_addr saddr_buf;
4702 		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4703 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4704 			goto nla_put_failure;
4705 	}
4706 
4707 	if (rt->fib6_prefsrc.plen) {
4708 		struct in6_addr saddr_buf;
4709 		saddr_buf = rt->fib6_prefsrc.addr;
4710 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4711 			goto nla_put_failure;
4712 	}
4713 
4714 	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4715 	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4716 		goto nla_put_failure;
4717 
4718 	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4719 		goto nla_put_failure;
4720 
4721 	/* For multipath routes, walk the siblings list and add
4722 	 * each as a nexthop within RTA_MULTIPATH.
4723 	 */
4724 	if (rt->fib6_nsiblings) {
4725 		struct fib6_info *sibling, *next_sibling;
4726 		struct nlattr *mp;
4727 
4728 		mp = nla_nest_start(skb, RTA_MULTIPATH);
4729 		if (!mp)
4730 			goto nla_put_failure;
4731 
4732 		if (rt6_add_nexthop(skb, rt) < 0)
4733 			goto nla_put_failure;
4734 
4735 		list_for_each_entry_safe(sibling, next_sibling,
4736 					 &rt->fib6_siblings, fib6_siblings) {
4737 			if (rt6_add_nexthop(skb, sibling) < 0)
4738 				goto nla_put_failure;
4739 		}
4740 
4741 		nla_nest_end(skb, mp);
4742 	} else {
4743 		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4744 			goto nla_put_failure;
4745 	}
4746 
4747 	if (rt->fib6_flags & RTF_EXPIRES) {
4748 		expires = dst ? dst->expires : rt->expires;
4749 		expires -= jiffies;
4750 	}
4751 
4752 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4753 		goto nla_put_failure;
4754 
4755 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
4756 		goto nla_put_failure;
4757 
4758 
4759 	nlmsg_end(skb, nlh);
4760 	return 0;
4761 
4762 nla_put_failure:
4763 	nlmsg_cancel(skb, nlh);
4764 	return -EMSGSIZE;
4765 }
4766 
4767 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4768 {
4769 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4770 	struct net *net = arg->net;
4771 
4772 	if (rt == net->ipv6.fib6_null_entry)
4773 		return 0;
4774 
4775 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4776 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4777 
4778 		/* user wants prefix routes only */
4779 		if (rtm->rtm_flags & RTM_F_PREFIX &&
4780 		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
4781 			/* success since this is not a prefix route */
4782 			return 1;
4783 		}
4784 	}
4785 
4786 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4787 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4788 			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4789 }
4790 
4791 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4792 			      struct netlink_ext_ack *extack)
4793 {
4794 	struct net *net = sock_net(in_skb->sk);
4795 	struct nlattr *tb[RTA_MAX+1];
4796 	int err, iif = 0, oif = 0;
4797 	struct fib6_info *from;
4798 	struct dst_entry *dst;
4799 	struct rt6_info *rt;
4800 	struct sk_buff *skb;
4801 	struct rtmsg *rtm;
4802 	struct flowi6 fl6;
4803 	bool fibmatch;
4804 
4805 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4806 			  extack);
4807 	if (err < 0)
4808 		goto errout;
4809 
4810 	err = -EINVAL;
4811 	memset(&fl6, 0, sizeof(fl6));
4812 	rtm = nlmsg_data(nlh);
4813 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4814 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4815 
4816 	if (tb[RTA_SRC]) {
4817 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4818 			goto errout;
4819 
4820 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4821 	}
4822 
4823 	if (tb[RTA_DST]) {
4824 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4825 			goto errout;
4826 
4827 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4828 	}
4829 
4830 	if (tb[RTA_IIF])
4831 		iif = nla_get_u32(tb[RTA_IIF]);
4832 
4833 	if (tb[RTA_OIF])
4834 		oif = nla_get_u32(tb[RTA_OIF]);
4835 
4836 	if (tb[RTA_MARK])
4837 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4838 
4839 	if (tb[RTA_UID])
4840 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4841 					   nla_get_u32(tb[RTA_UID]));
4842 	else
4843 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4844 
4845 	if (iif) {
4846 		struct net_device *dev;
4847 		int flags = 0;
4848 
4849 		rcu_read_lock();
4850 
4851 		dev = dev_get_by_index_rcu(net, iif);
4852 		if (!dev) {
4853 			rcu_read_unlock();
4854 			err = -ENODEV;
4855 			goto errout;
4856 		}
4857 
4858 		fl6.flowi6_iif = iif;
4859 
4860 		if (!ipv6_addr_any(&fl6.saddr))
4861 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4862 
4863 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4864 
4865 		rcu_read_unlock();
4866 	} else {
4867 		fl6.flowi6_oif = oif;
4868 
4869 		dst = ip6_route_output(net, NULL, &fl6);
4870 	}
4871 
4872 
4873 	rt = container_of(dst, struct rt6_info, dst);
4874 	if (rt->dst.error) {
4875 		err = rt->dst.error;
4876 		ip6_rt_put(rt);
4877 		goto errout;
4878 	}
4879 
4880 	if (rt == net->ipv6.ip6_null_entry) {
4881 		err = rt->dst.error;
4882 		ip6_rt_put(rt);
4883 		goto errout;
4884 	}
4885 
4886 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4887 	if (!skb) {
4888 		ip6_rt_put(rt);
4889 		err = -ENOBUFS;
4890 		goto errout;
4891 	}
4892 
4893 	skb_dst_set(skb, &rt->dst);
4894 
4895 	rcu_read_lock();
4896 	from = rcu_dereference(rt->from);
4897 
4898 	if (fibmatch)
4899 		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4900 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4901 				    nlh->nlmsg_seq, 0);
4902 	else
4903 		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4904 				    &fl6.saddr, iif, RTM_NEWROUTE,
4905 				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4906 				    0);
4907 	rcu_read_unlock();
4908 
4909 	if (err < 0) {
4910 		kfree_skb(skb);
4911 		goto errout;
4912 	}
4913 
4914 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4915 errout:
4916 	return err;
4917 }
4918 
4919 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4920 		     unsigned int nlm_flags)
4921 {
4922 	struct sk_buff *skb;
4923 	struct net *net = info->nl_net;
4924 	u32 seq;
4925 	int err;
4926 
4927 	err = -ENOBUFS;
4928 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4929 
4930 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4931 	if (!skb)
4932 		goto errout;
4933 
4934 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4935 			    event, info->portid, seq, nlm_flags);
4936 	if (err < 0) {
4937 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4938 		WARN_ON(err == -EMSGSIZE);
4939 		kfree_skb(skb);
4940 		goto errout;
4941 	}
4942 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4943 		    info->nlh, gfp_any());
4944 	return;
4945 errout:
4946 	if (err < 0)
4947 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4948 }
4949 
4950 static int ip6_route_dev_notify(struct notifier_block *this,
4951 				unsigned long event, void *ptr)
4952 {
4953 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4954 	struct net *net = dev_net(dev);
4955 
4956 	if (!(dev->flags & IFF_LOOPBACK))
4957 		return NOTIFY_OK;
4958 
4959 	if (event == NETDEV_REGISTER) {
4960 		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
4961 		net->ipv6.ip6_null_entry->dst.dev = dev;
4962 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4963 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4964 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4965 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4966 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4967 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4968 #endif
4969 	} else if (event == NETDEV_UNREGISTER &&
4970 		    dev->reg_state != NETREG_UNREGISTERED) {
4971 		/* NETDEV_UNREGISTER could be fired multiple times by
4972 		 * netdev_wait_allrefs(). Make sure we only call this once.
4973 		 */
4974 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4975 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4976 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4977 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4978 #endif
4979 	}
4980 
4981 	return NOTIFY_OK;
4982 }
4983 
4984 /*
4985  *	/proc
4986  */
4987 
4988 #ifdef CONFIG_PROC_FS
4989 
4990 static const struct file_operations ipv6_route_proc_fops = {
4991 	.open		= ipv6_route_open,
4992 	.read		= seq_read,
4993 	.llseek		= seq_lseek,
4994 	.release	= seq_release_net,
4995 };
4996 
4997 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4998 {
4999 	struct net *net = (struct net *)seq->private;
5000 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5001 		   net->ipv6.rt6_stats->fib_nodes,
5002 		   net->ipv6.rt6_stats->fib_route_nodes,
5003 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5004 		   net->ipv6.rt6_stats->fib_rt_entries,
5005 		   net->ipv6.rt6_stats->fib_rt_cache,
5006 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5007 		   net->ipv6.rt6_stats->fib_discarded_routes);
5008 
5009 	return 0;
5010 }
5011 
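/*
 * Editorial example: the seven hex fields above are, in order, fib nodes,
 * route nodes, rt allocs, route entries, cached routes, dst entries and
 * discarded routes, so a (hypothetical) line of /proc/net/rt6_stats could
 * read:
 *
 *	0043 0027 05f4 02d1 0000 0014 0000
 */
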
5012 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
5013 {
5014 	return single_open_net(inode, file, rt6_stats_seq_show);
5015 }
5016 
5017 static const struct file_operations rt6_stats_seq_fops = {
5018 	.open	 = rt6_stats_seq_open,
5019 	.read	 = seq_read,
5020 	.llseek	 = seq_lseek,
5021 	.release = single_release_net,
5022 };
5023 #endif	/* CONFIG_PROC_FS */
5024 
5025 #ifdef CONFIG_SYSCTL
5026 
5027 static
5028 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5029 			      void __user *buffer, size_t *lenp, loff_t *ppos)
5030 {
5031 	struct net *net;
5032 	int delay;
5033 	if (!write)
5034 		return -EINVAL;
5035 
5036 	net = (struct net *)ctl->extra1;
5037 	delay = net->ipv6.sysctl.flush_delay;
5038 	proc_dointvec(ctl, write, buffer, lenp, ppos);
5039 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5040 	return 0;
5041 }
5042 
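/*
 * Editorial usage note: the handler above backs the write-only (mode 0200)
 * file /proc/sys/net/ipv6/route/flush, so e.g.
 *
 *	echo 0 > /proc/sys/net/ipv6/route/flush
 *
 * (as root) lands there and triggers fib6_run_gc(). With a non-positive
 * stored flush_delay the GC runs immediately and unconditionally; note
 * that "delay" is sampled before proc_dointvec() stores the new value,
 * so the previously stored delay is the one actually used.
 */
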
5043 struct ctl_table ipv6_route_table_template[] = {
5044 	{
5045 		.procname	=	"flush",
5046 		.data		=	&init_net.ipv6.sysctl.flush_delay,
5047 		.maxlen		=	sizeof(int),
5048 		.mode		=	0200,
5049 		.proc_handler	=	ipv6_sysctl_rtcache_flush
5050 	},
5051 	{
5052 		.procname	=	"gc_thresh",
5053 		.data		=	&ip6_dst_ops_template.gc_thresh,
5054 		.maxlen		=	sizeof(int),
5055 		.mode		=	0644,
5056 		.proc_handler	=	proc_dointvec,
5057 	},
5058 	{
5059 		.procname	=	"max_size",
5060 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
5061 		.maxlen		=	sizeof(int),
5062 		.mode		=	0644,
5063 		.proc_handler	=	proc_dointvec,
5064 	},
5065 	{
5066 		.procname	=	"gc_min_interval",
5067 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5068 		.maxlen		=	sizeof(int),
5069 		.mode		=	0644,
5070 		.proc_handler	=	proc_dointvec_jiffies,
5071 	},
5072 	{
5073 		.procname	=	"gc_timeout",
5074 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5075 		.maxlen		=	sizeof(int),
5076 		.mode		=	0644,
5077 		.proc_handler	=	proc_dointvec_jiffies,
5078 	},
5079 	{
5080 		.procname	=	"gc_interval",
5081 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
5082 		.maxlen		=	sizeof(int),
5083 		.mode		=	0644,
5084 		.proc_handler	=	proc_dointvec_jiffies,
5085 	},
5086 	{
5087 		.procname	=	"gc_elasticity",
5088 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5089 		.maxlen		=	sizeof(int),
5090 		.mode		=	0644,
5091 		.proc_handler	=	proc_dointvec,
5092 	},
5093 	{
5094 		.procname	=	"mtu_expires",
5095 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5096 		.maxlen		=	sizeof(int),
5097 		.mode		=	0644,
5098 		.proc_handler	=	proc_dointvec_jiffies,
5099 	},
5100 	{
5101 		.procname	=	"min_adv_mss",
5102 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
5103 		.maxlen		=	sizeof(int),
5104 		.mode		=	0644,
5105 		.proc_handler	=	proc_dointvec,
5106 	},
5107 	{
5108 		.procname	=	"gc_min_interval_ms",
5109 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5110 		.maxlen		=	sizeof(int),
5111 		.mode		=	0644,
5112 		.proc_handler	=	proc_dointvec_ms_jiffies,
5113 	},
5114 	{ }
5115 };
5116 
5117 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5118 {
5119 	struct ctl_table *table;
5120 
5121 	table = kmemdup(ipv6_route_table_template,
5122 			sizeof(ipv6_route_table_template),
5123 			GFP_KERNEL);
5124 
5125 	if (table) {
5126 		table[0].data = &net->ipv6.sysctl.flush_delay;
5127 		table[0].extra1 = net;
5128 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5129 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5130 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5131 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5132 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5133 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5134 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5135 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5136 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5137 
5138 		/* Don't export sysctls to unprivileged users */
5139 		if (net->user_ns != &init_user_ns)
5140 			table[0].procname = NULL;
5141 	}
5142 
5143 	return table;
5144 }
5145 #endif
5146 
5147 static int __net_init ip6_route_net_init(struct net *net)
5148 {
5149 	int ret = -ENOMEM;
5150 
5151 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5152 	       sizeof(net->ipv6.ip6_dst_ops));
5153 
5154 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5155 		goto out_ip6_dst_ops;
5156 
5157 	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5158 					    sizeof(*net->ipv6.fib6_null_entry),
5159 					    GFP_KERNEL);
5160 	if (!net->ipv6.fib6_null_entry)
5161 		goto out_ip6_dst_entries;
5162 
5163 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5164 					   sizeof(*net->ipv6.ip6_null_entry),
5165 					   GFP_KERNEL);
5166 	if (!net->ipv6.ip6_null_entry)
5167 		goto out_fib6_null_entry;
5168 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5169 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5170 			 ip6_template_metrics, true);
5171 
5172 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5173 	net->ipv6.fib6_has_custom_rules = false;
5174 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5175 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5176 					       GFP_KERNEL);
5177 	if (!net->ipv6.ip6_prohibit_entry)
5178 		goto out_ip6_null_entry;
5179 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5180 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5181 			 ip6_template_metrics, true);
5182 
5183 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5184 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5185 					       GFP_KERNEL);
5186 	if (!net->ipv6.ip6_blk_hole_entry)
5187 		goto out_ip6_prohibit_entry;
5188 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5189 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5190 			 ip6_template_metrics, true);
5191 #endif
5192 
5193 	net->ipv6.sysctl.flush_delay = 0;
5194 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5195 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5196 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5197 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5198 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5199 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5200 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5201 
5202 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5203 
5204 	ret = 0;
5205 out:
5206 	return ret;
5207 
5208 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5209 out_ip6_prohibit_entry:
5210 	kfree(net->ipv6.ip6_prohibit_entry);
5211 out_ip6_null_entry:
5212 	kfree(net->ipv6.ip6_null_entry);
5213 #endif
5214 out_fib6_null_entry:
5215 	kfree(net->ipv6.fib6_null_entry);
5216 out_ip6_dst_entries:
5217 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5218 out_ip6_dst_ops:
5219 	goto out;
5220 }
5221 
5222 static void __net_exit ip6_route_net_exit(struct net *net)
5223 {
5224 	kfree(net->ipv6.fib6_null_entry);
5225 	kfree(net->ipv6.ip6_null_entry);
5226 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5227 	kfree(net->ipv6.ip6_prohibit_entry);
5228 	kfree(net->ipv6.ip6_blk_hole_entry);
5229 #endif
5230 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5231 }
5232 
5233 static int __net_init ip6_route_net_init_late(struct net *net)
5234 {
5235 #ifdef CONFIG_PROC_FS
5236 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5237 	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5238 #endif
5239 	return 0;
5240 }
5241 
5242 static void __net_exit ip6_route_net_exit_late(struct net *net)
5243 {
5244 #ifdef CONFIG_PROC_FS
5245 	remove_proc_entry("ipv6_route", net->proc_net);
5246 	remove_proc_entry("rt6_stats", net->proc_net);
5247 #endif
5248 }
5249 
5250 static struct pernet_operations ip6_route_net_ops = {
5251 	.init = ip6_route_net_init,
5252 	.exit = ip6_route_net_exit,
5253 };
5254 
5255 static int __net_init ipv6_inetpeer_init(struct net *net)
5256 {
5257 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5258 
5259 	if (!bp)
5260 		return -ENOMEM;
5261 	inet_peer_base_init(bp);
5262 	net->ipv6.peers = bp;
5263 	return 0;
5264 }
5265 
5266 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5267 {
5268 	struct inet_peer_base *bp = net->ipv6.peers;
5269 
5270 	net->ipv6.peers = NULL;
5271 	inetpeer_invalidate_tree(bp);
5272 	kfree(bp);
5273 }
5274 
5275 static struct pernet_operations ipv6_inetpeer_ops = {
5276 	.init	=	ipv6_inetpeer_init,
5277 	.exit	=	ipv6_inetpeer_exit,
5278 };
5279 
5280 static struct pernet_operations ip6_route_net_late_ops = {
5281 	.init = ip6_route_net_init_late,
5282 	.exit = ip6_route_net_exit_late,
5283 };
5284 
5285 static struct notifier_block ip6_route_dev_notifier = {
5286 	.notifier_call = ip6_route_dev_notify,
5287 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5288 };
5289 
5290 void __init ip6_route_init_special_entries(void)
5291 {
5292 	/* The loopback device is registered before this portion of code
5293 	 * runs, so the loopback reference in rt6_info is not taken there;
5294 	 * take it manually for init_net. */
5295 	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5296 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5297 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5298 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5299 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5300 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5301 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5302 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5303 #endif
5304 }
5305 
5306 int __init ip6_route_init(void)
5307 {
5308 	int ret;
5309 	int cpu;
5310 
5311 	ret = -ENOMEM;
5312 	ip6_dst_ops_template.kmem_cachep =
5313 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5314 				  SLAB_HWCACHE_ALIGN, NULL);
5315 	if (!ip6_dst_ops_template.kmem_cachep)
5316 		goto out;
5317 
5318 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5319 	if (ret)
5320 		goto out_kmem_cache;
5321 
5322 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5323 	if (ret)
5324 		goto out_dst_entries;
5325 
5326 	ret = register_pernet_subsys(&ip6_route_net_ops);
5327 	if (ret)
5328 		goto out_register_inetpeer;
5329 
5330 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5331 
5332 	ret = fib6_init();
5333 	if (ret)
5334 		goto out_register_subsys;
5335 
5336 	ret = xfrm6_init();
5337 	if (ret)
5338 		goto out_fib6_init;
5339 
5340 	ret = fib6_rules_init();
5341 	if (ret)
5342 		goto xfrm6_init;
5343 
5344 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5345 	if (ret)
5346 		goto fib6_rules_init;
5347 
5348 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5349 				   inet6_rtm_newroute, NULL, 0);
5350 	if (ret < 0)
5351 		goto out_register_late_subsys;
5352 
5353 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5354 				   inet6_rtm_delroute, NULL, 0);
5355 	if (ret < 0)
5356 		goto out_register_late_subsys;
5357 
5358 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5359 				   inet6_rtm_getroute, NULL,
5360 				   RTNL_FLAG_DOIT_UNLOCKED);
5361 	if (ret < 0)
5362 		goto out_register_late_subsys;
5363 
5364 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5365 	if (ret)
5366 		goto out_register_late_subsys;
5367 
5368 	for_each_possible_cpu(cpu) {
5369 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5370 
5371 		INIT_LIST_HEAD(&ul->head);
5372 		spin_lock_init(&ul->lock);
5373 	}
5374 
5375 out:
5376 	return ret;
5377 
5378 out_register_late_subsys:
5379 	rtnl_unregister_all(PF_INET6);
5380 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5381 fib6_rules_init:
5382 	fib6_rules_cleanup();
5383 xfrm6_init:
5384 	xfrm6_fini();
5385 out_fib6_init:
5386 	fib6_gc_cleanup();
5387 out_register_subsys:
5388 	unregister_pernet_subsys(&ip6_route_net_ops);
5389 out_register_inetpeer:
5390 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5391 out_dst_entries:
5392 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5393 out_kmem_cache:
5394 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5395 	goto out;
5396 }
5397 
5398 void ip6_route_cleanup(void)
5399 {
5400 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5401 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5402 	fib6_rules_cleanup();
5403 	xfrm6_fini();
5404 	fib6_gc_cleanup();
5405 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5406 	unregister_pernet_subsys(&ip6_route_net_ops);
5407 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5408 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5409 }
5410