xref: /linux/net/ipv6/route.c (revision 22d55f02b8922a097cd4be1e2f131dfa7ef65901)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	Linux INET6 implementation
4  *	FIB front-end.
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  */
9 
10 /*	Changes:
11  *
12  *	YOSHIFUJI Hideaki @USAGI
13  *		reworked default router selection.
14  *		- respect outgoing interface
15  *		- select from (probably) reachable routers (i.e.
16  *		routers in REACHABLE, STALE, DELAY or PROBE states).
17  *		- always select the same router if it is (probably)
18  *		reachable.  otherwise, round-robin the list.
19  *	Ville Nuorvala
20  *		Fixed routing subtrees.
21  */
22 
23 #define pr_fmt(fmt) "IPv6: " fmt
24 
25 #include <linux/capability.h>
26 #include <linux/errno.h>
27 #include <linux/export.h>
28 #include <linux/types.h>
29 #include <linux/times.h>
30 #include <linux/socket.h>
31 #include <linux/sockios.h>
32 #include <linux/net.h>
33 #include <linux/route.h>
34 #include <linux/netdevice.h>
35 #include <linux/in6.h>
36 #include <linux/mroute6.h>
37 #include <linux/init.h>
38 #include <linux/if_arp.h>
39 #include <linux/proc_fs.h>
40 #include <linux/seq_file.h>
41 #include <linux/nsproxy.h>
42 #include <linux/slab.h>
43 #include <linux/jhash.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/dst_metadata.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58 #include <net/rtnh.h>
59 #include <net/lwtunnel.h>
60 #include <net/ip_tunnels.h>
61 #include <net/l3mdev.h>
62 #include <net/ip.h>
63 #include <linux/uaccess.h>
64 
65 #ifdef CONFIG_SYSCTL
66 #include <linux/sysctl.h>
67 #endif
68 
69 static int ip6_rt_type_to_error(u8 fib6_type);
70 
71 #define CREATE_TRACE_POINTS
72 #include <trace/events/fib6.h>
73 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
74 #undef CREATE_TRACE_POINTS
75 
76 enum rt6_nud_state {
77 	RT6_NUD_FAIL_HARD = -3,
78 	RT6_NUD_FAIL_PROBE = -2,
79 	RT6_NUD_FAIL_DO_RR = -1,
80 	RT6_NUD_SUCCEED = 1
81 };
82 
83 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
84 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
85 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
86 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
87 static void		ip6_dst_destroy(struct dst_entry *);
88 static void		ip6_dst_ifdown(struct dst_entry *,
89 				       struct net_device *dev, int how);
90 static int		 ip6_dst_gc(struct dst_ops *ops);
91 
92 static int		ip6_pkt_discard(struct sk_buff *skb);
93 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static int		ip6_pkt_prohibit(struct sk_buff *skb);
95 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
96 static void		ip6_link_failure(struct sk_buff *skb);
97 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
98 					   struct sk_buff *skb, u32 mtu);
99 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
100 					struct sk_buff *skb);
101 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
102 			   int strict);
103 static size_t rt6_nlmsg_size(struct fib6_info *rt);
104 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
105 			 struct fib6_info *rt, struct dst_entry *dst,
106 			 struct in6_addr *dest, struct in6_addr *src,
107 			 int iif, int type, u32 portid, u32 seq,
108 			 unsigned int flags);
109 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
110 					   const struct in6_addr *daddr,
111 					   const struct in6_addr *saddr);
112 
113 #ifdef CONFIG_IPV6_ROUTE_INFO
114 static struct fib6_info *rt6_add_route_info(struct net *net,
115 					   const struct in6_addr *prefix, int prefixlen,
116 					   const struct in6_addr *gwaddr,
117 					   struct net_device *dev,
118 					   unsigned int pref);
119 static struct fib6_info *rt6_get_route_info(struct net *net,
120 					   const struct in6_addr *prefix, int prefixlen,
121 					   const struct in6_addr *gwaddr,
122 					   struct net_device *dev);
123 #endif
124 
125 struct uncached_list {
126 	spinlock_t		lock;
127 	struct list_head	head;
128 };
129 
130 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
131 
132 void rt6_uncached_list_add(struct rt6_info *rt)
133 {
134 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
135 
136 	rt->rt6i_uncached_list = ul;
137 
138 	spin_lock_bh(&ul->lock);
139 	list_add_tail(&rt->rt6i_uncached, &ul->head);
140 	spin_unlock_bh(&ul->lock);
141 }
142 
143 void rt6_uncached_list_del(struct rt6_info *rt)
144 {
145 	if (!list_empty(&rt->rt6i_uncached)) {
146 		struct uncached_list *ul = rt->rt6i_uncached_list;
147 		struct net *net = dev_net(rt->dst.dev);
148 
149 		spin_lock_bh(&ul->lock);
150 		list_del(&rt->rt6i_uncached);
151 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
152 		spin_unlock_bh(&ul->lock);
153 	}
154 }
155 
156 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
157 {
158 	struct net_device *loopback_dev = net->loopback_dev;
159 	int cpu;
160 
161 	if (dev == loopback_dev)
162 		return;
163 
164 	for_each_possible_cpu(cpu) {
165 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
166 		struct rt6_info *rt;
167 
168 		spin_lock_bh(&ul->lock);
169 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
170 			struct inet6_dev *rt_idev = rt->rt6i_idev;
171 			struct net_device *rt_dev = rt->dst.dev;
172 
173 			if (rt_idev->dev == dev) {
174 				rt->rt6i_idev = in6_dev_get(loopback_dev);
175 				in6_dev_put(rt_idev);
176 			}
177 
178 			if (rt_dev == dev) {
179 				rt->dst.dev = loopback_dev;
180 				dev_hold(rt->dst.dev);
181 				dev_put(rt_dev);
182 			}
183 		}
184 		spin_unlock_bh(&ul->lock);
185 	}
186 }
187 
188 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
189 					     struct sk_buff *skb,
190 					     const void *daddr)
191 {
192 	if (!ipv6_addr_any(p))
193 		return (const void *) p;
194 	else if (skb)
195 		return &ipv6_hdr(skb)->daddr;
196 	return daddr;
197 }
198 
199 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
200 				   struct net_device *dev,
201 				   struct sk_buff *skb,
202 				   const void *daddr)
203 {
204 	struct neighbour *n;
205 
206 	daddr = choose_neigh_daddr(gw, skb, daddr);
207 	n = __ipv6_neigh_lookup(dev, daddr);
208 	if (n)
209 		return n;
210 
211 	n = neigh_create(&nd_tbl, daddr, dev);
212 	return IS_ERR(n) ? NULL : n;
213 }
214 
215 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
216 					      struct sk_buff *skb,
217 					      const void *daddr)
218 {
219 	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
220 
221 	return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
222 				dst->dev, skb, daddr);
223 }
224 
225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
226 {
227 	struct net_device *dev = dst->dev;
228 	struct rt6_info *rt = (struct rt6_info *)dst;
229 
230 	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
231 	if (!daddr)
232 		return;
233 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
234 		return;
235 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
236 		return;
237 	__ipv6_confirm_neigh(dev, daddr);
238 }
239 
240 static struct dst_ops ip6_dst_ops_template = {
241 	.family			=	AF_INET6,
242 	.gc			=	ip6_dst_gc,
243 	.gc_thresh		=	1024,
244 	.check			=	ip6_dst_check,
245 	.default_advmss		=	ip6_default_advmss,
246 	.mtu			=	ip6_mtu,
247 	.cow_metrics		=	dst_cow_metrics_generic,
248 	.destroy		=	ip6_dst_destroy,
249 	.ifdown			=	ip6_dst_ifdown,
250 	.negative_advice	=	ip6_negative_advice,
251 	.link_failure		=	ip6_link_failure,
252 	.update_pmtu		=	ip6_rt_update_pmtu,
253 	.redirect		=	rt6_do_redirect,
254 	.local_out		=	__ip6_local_out,
255 	.neigh_lookup		=	ip6_dst_neigh_lookup,
256 	.confirm_neigh		=	ip6_confirm_neigh,
257 };
258 
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
260 {
261 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
262 
263 	return mtu ? : dst->dev->mtu;
264 }
265 
266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
267 					 struct sk_buff *skb, u32 mtu)
268 {
269 }
270 
271 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
272 				      struct sk_buff *skb)
273 {
274 }
275 
276 static struct dst_ops ip6_dst_blackhole_ops = {
277 	.family			=	AF_INET6,
278 	.destroy		=	ip6_dst_destroy,
279 	.check			=	ip6_dst_check,
280 	.mtu			=	ip6_blackhole_mtu,
281 	.default_advmss		=	ip6_default_advmss,
282 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
283 	.redirect		=	ip6_rt_blackhole_redirect,
284 	.cow_metrics		=	dst_cow_metrics_generic,
285 	.neigh_lookup		=	ip6_dst_neigh_lookup,
286 };
287 
288 static const u32 ip6_template_metrics[RTAX_MAX] = {
289 	[RTAX_HOPLIMIT - 1] = 0,
290 };
291 
292 static const struct fib6_info fib6_null_entry_template = {
293 	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
294 	.fib6_protocol  = RTPROT_KERNEL,
295 	.fib6_metric	= ~(u32)0,
296 	.fib6_ref	= REFCOUNT_INIT(1),
297 	.fib6_type	= RTN_UNREACHABLE,
298 	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
299 };
300 
301 static const struct rt6_info ip6_null_entry_template = {
302 	.dst = {
303 		.__refcnt	= ATOMIC_INIT(1),
304 		.__use		= 1,
305 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
306 		.error		= -ENETUNREACH,
307 		.input		= ip6_pkt_discard,
308 		.output		= ip6_pkt_discard_out,
309 	},
310 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
311 };
312 
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314 
315 static const struct rt6_info ip6_prohibit_entry_template = {
316 	.dst = {
317 		.__refcnt	= ATOMIC_INIT(1),
318 		.__use		= 1,
319 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
320 		.error		= -EACCES,
321 		.input		= ip6_pkt_prohibit,
322 		.output		= ip6_pkt_prohibit_out,
323 	},
324 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
325 };
326 
327 static const struct rt6_info ip6_blk_hole_entry_template = {
328 	.dst = {
329 		.__refcnt	= ATOMIC_INIT(1),
330 		.__use		= 1,
331 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
332 		.error		= -EINVAL,
333 		.input		= dst_discard,
334 		.output		= dst_discard_out,
335 	},
336 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
337 };
338 
339 #endif
340 
341 static void rt6_info_init(struct rt6_info *rt)
342 {
343 	struct dst_entry *dst = &rt->dst;
344 
345 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
346 	INIT_LIST_HEAD(&rt->rt6i_uncached);
347 }
348 
349 /* allocate dst with ip6_dst_ops */
350 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
351 			       int flags)
352 {
353 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
354 					1, DST_OBSOLETE_FORCE_CHK, flags);
355 
356 	if (rt) {
357 		rt6_info_init(rt);
358 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
359 	}
360 
361 	return rt;
362 }
363 EXPORT_SYMBOL(ip6_dst_alloc);
364 
365 static void ip6_dst_destroy(struct dst_entry *dst)
366 {
367 	struct rt6_info *rt = (struct rt6_info *)dst;
368 	struct fib6_info *from;
369 	struct inet6_dev *idev;
370 
371 	ip_dst_metrics_put(dst);
372 	rt6_uncached_list_del(rt);
373 
374 	idev = rt->rt6i_idev;
375 	if (idev) {
376 		rt->rt6i_idev = NULL;
377 		in6_dev_put(idev);
378 	}
379 
380 	from = xchg((__force struct fib6_info **)&rt->from, NULL);
381 	fib6_info_release(from);
382 }
383 
384 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
385 			   int how)
386 {
387 	struct rt6_info *rt = (struct rt6_info *)dst;
388 	struct inet6_dev *idev = rt->rt6i_idev;
389 	struct net_device *loopback_dev =
390 		dev_net(dev)->loopback_dev;
391 
392 	if (idev && idev->dev != loopback_dev) {
393 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
394 		if (loopback_idev) {
395 			rt->rt6i_idev = loopback_idev;
396 			in6_dev_put(idev);
397 		}
398 	}
399 }
400 
401 static bool __rt6_check_expired(const struct rt6_info *rt)
402 {
403 	if (rt->rt6i_flags & RTF_EXPIRES)
404 		return time_after(jiffies, rt->dst.expires);
405 	else
406 		return false;
407 }
408 
409 static bool rt6_check_expired(const struct rt6_info *rt)
410 {
411 	struct fib6_info *from;
412 
413 	from = rcu_dereference(rt->from);
414 
415 	if (rt->rt6i_flags & RTF_EXPIRES) {
416 		if (time_after(jiffies, rt->dst.expires))
417 			return true;
418 	} else if (from) {
419 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
420 			fib6_check_expired(from);
421 	}
422 	return false;
423 }
424 
425 void fib6_select_path(const struct net *net, struct fib6_result *res,
426 		      struct flowi6 *fl6, int oif, bool have_oif_match,
427 		      const struct sk_buff *skb, int strict)
428 {
429 	struct fib6_info *sibling, *next_sibling;
430 	struct fib6_info *match = res->f6i;
431 
432 	if (!match->fib6_nsiblings || have_oif_match)
433 		goto out;
434 
435 	/* We might have already computed the hash for ICMPv6 errors. In such
436 	 * case it will always be non-zero. Otherwise now is the time to do it.
437 	 */
438 	if (!fl6->mp_hash)
439 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
440 
441 	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
442 		goto out;
443 
444 	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
445 				 fib6_siblings) {
446 		const struct fib6_nh *nh = &sibling->fib6_nh;
447 		int nh_upper_bound;
448 
449 		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
450 		if (fl6->mp_hash > nh_upper_bound)
451 			continue;
452 		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
453 			break;
454 		match = sibling;
455 		break;
456 	}
457 
458 out:
459 	res->f6i = match;
460 	res->nh = &match->fib6_nh;
461 }
462 
463 /*
464  *	Route lookup. rcu_read_lock() should be held.
465  */
466 
467 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
468 			       const struct in6_addr *saddr, int oif, int flags)
469 {
470 	const struct net_device *dev;
471 
472 	if (nh->fib_nh_flags & RTNH_F_DEAD)
473 		return false;
474 
475 	dev = nh->fib_nh_dev;
476 	if (oif) {
477 		if (dev->ifindex == oif)
478 			return true;
479 	} else {
480 		if (ipv6_chk_addr(net, saddr, dev,
481 				  flags & RT6_LOOKUP_F_IFACE))
482 			return true;
483 	}
484 
485 	return false;
486 }
487 
488 static void rt6_device_match(struct net *net, struct fib6_result *res,
489 			     const struct in6_addr *saddr, int oif, int flags)
490 {
491 	struct fib6_info *f6i = res->f6i;
492 	struct fib6_info *spf6i;
493 	struct fib6_nh *nh;
494 
495 	if (!oif && ipv6_addr_any(saddr)) {
496 		nh = &f6i->fib6_nh;
497 		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
498 			goto out;
499 	}
500 
501 	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
502 		nh = &spf6i->fib6_nh;
503 		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
504 			res->f6i = spf6i;
505 			goto out;
506 		}
507 	}
508 
509 	if (oif && flags & RT6_LOOKUP_F_IFACE) {
510 		res->f6i = net->ipv6.fib6_null_entry;
511 		nh = &res->f6i->fib6_nh;
512 		goto out;
513 	}
514 
515 	nh = &f6i->fib6_nh;
516 	if (nh->fib_nh_flags & RTNH_F_DEAD) {
517 		res->f6i = net->ipv6.fib6_null_entry;
518 		nh = &res->f6i->fib6_nh;
519 	}
520 out:
521 	res->nh = nh;
522 	res->fib6_type = res->f6i->fib6_type;
523 	res->fib6_flags = res->f6i->fib6_flags;
524 }
525 
526 #ifdef CONFIG_IPV6_ROUTER_PREF
527 struct __rt6_probe_work {
528 	struct work_struct work;
529 	struct in6_addr target;
530 	struct net_device *dev;
531 };
532 
533 static void rt6_probe_deferred(struct work_struct *w)
534 {
535 	struct in6_addr mcaddr;
536 	struct __rt6_probe_work *work =
537 		container_of(w, struct __rt6_probe_work, work);
538 
539 	addrconf_addr_solict_mult(&work->target, &mcaddr);
540 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
541 	dev_put(work->dev);
542 	kfree(work);
543 }
544 
545 static void rt6_probe(struct fib6_nh *fib6_nh)
546 {
547 	struct __rt6_probe_work *work = NULL;
548 	const struct in6_addr *nh_gw;
549 	struct neighbour *neigh;
550 	struct net_device *dev;
551 	struct inet6_dev *idev;
552 
553 	/*
554 	 * Okay, this does not seem to be appropriate
555 	 * for now, however, we need to check if it
556 	 * is really so; aka Router Reachability Probing.
557 	 *
558 	 * Router Reachability Probe MUST be rate-limited
559 	 * to no more than one per minute.
560 	 */
561 	if (fib6_nh->fib_nh_gw_family)
562 		return;
563 
564 	nh_gw = &fib6_nh->fib_nh_gw6;
565 	dev = fib6_nh->fib_nh_dev;
566 	rcu_read_lock_bh();
567 	idev = __in6_dev_get(dev);
568 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
569 	if (neigh) {
570 		if (neigh->nud_state & NUD_VALID)
571 			goto out;
572 
573 		write_lock(&neigh->lock);
574 		if (!(neigh->nud_state & NUD_VALID) &&
575 		    time_after(jiffies,
576 			       neigh->updated + idev->cnf.rtr_probe_interval)) {
577 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
578 			if (work)
579 				__neigh_set_probe_once(neigh);
580 		}
581 		write_unlock(&neigh->lock);
582 	} else if (time_after(jiffies, fib6_nh->last_probe +
583 				       idev->cnf.rtr_probe_interval)) {
584 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
585 	}
586 
587 	if (work) {
588 		fib6_nh->last_probe = jiffies;
589 		INIT_WORK(&work->work, rt6_probe_deferred);
590 		work->target = *nh_gw;
591 		dev_hold(dev);
592 		work->dev = dev;
593 		schedule_work(&work->work);
594 	}
595 
596 out:
597 	rcu_read_unlock_bh();
598 }
599 #else
600 static inline void rt6_probe(struct fib6_nh *fib6_nh)
601 {
602 }
603 #endif
604 
605 /*
606  * Default Router Selection (RFC 2461 6.3.6)
607  */
608 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
609 {
610 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
611 	struct neighbour *neigh;
612 
613 	rcu_read_lock_bh();
614 	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
615 					  &fib6_nh->fib_nh_gw6);
616 	if (neigh) {
617 		read_lock(&neigh->lock);
618 		if (neigh->nud_state & NUD_VALID)
619 			ret = RT6_NUD_SUCCEED;
620 #ifdef CONFIG_IPV6_ROUTER_PREF
621 		else if (!(neigh->nud_state & NUD_FAILED))
622 			ret = RT6_NUD_SUCCEED;
623 		else
624 			ret = RT6_NUD_FAIL_PROBE;
625 #endif
626 		read_unlock(&neigh->lock);
627 	} else {
628 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
629 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
630 	}
631 	rcu_read_unlock_bh();
632 
633 	return ret;
634 }
635 
636 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
637 			   int strict)
638 {
639 	int m = 0;
640 
641 	if (!oif || nh->fib_nh_dev->ifindex == oif)
642 		m = 2;
643 
644 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
645 		return RT6_NUD_FAIL_HARD;
646 #ifdef CONFIG_IPV6_ROUTER_PREF
647 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
648 #endif
649 	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
650 	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
651 		int n = rt6_check_neigh(nh);
652 		if (n < 0)
653 			return n;
654 	}
655 	return m;
656 }
657 
658 static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
659 		       int oif, int strict, int *mpri, bool *do_rr)
660 {
661 	bool match_do_rr = false;
662 	bool rc = false;
663 	int m;
664 
665 	if (nh->fib_nh_flags & RTNH_F_DEAD)
666 		goto out;
667 
668 	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
669 	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
670 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
671 		goto out;
672 
673 	m = rt6_score_route(nh, fib6_flags, oif, strict);
674 	if (m == RT6_NUD_FAIL_DO_RR) {
675 		match_do_rr = true;
676 		m = 0; /* lowest valid score */
677 	} else if (m == RT6_NUD_FAIL_HARD) {
678 		goto out;
679 	}
680 
681 	if (strict & RT6_LOOKUP_F_REACHABLE)
682 		rt6_probe(nh);
683 
684 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
685 	if (m > *mpri) {
686 		*do_rr = match_do_rr;
687 		*mpri = m;
688 		rc = true;
689 	}
690 out:
691 	return rc;
692 }
693 
694 static void __find_rr_leaf(struct fib6_info *f6i_start,
695 			   struct fib6_info *nomatch, u32 metric,
696 			   struct fib6_result *res, struct fib6_info **cont,
697 			   int oif, int strict, bool *do_rr, int *mpri)
698 {
699 	struct fib6_info *f6i;
700 
701 	for (f6i = f6i_start;
702 	     f6i && f6i != nomatch;
703 	     f6i = rcu_dereference(f6i->fib6_next)) {
704 		struct fib6_nh *nh;
705 
706 		if (cont && f6i->fib6_metric != metric) {
707 			*cont = f6i;
708 			return;
709 		}
710 
711 		if (fib6_check_expired(f6i))
712 			continue;
713 
714 		nh = &f6i->fib6_nh;
715 		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
716 			res->f6i = f6i;
717 			res->nh = nh;
718 			res->fib6_flags = f6i->fib6_flags;
719 			res->fib6_type = f6i->fib6_type;
720 		}
721 	}
722 }
723 
724 static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
725 			 struct fib6_info *rr_head, int oif, int strict,
726 			 bool *do_rr, struct fib6_result *res)
727 {
728 	u32 metric = rr_head->fib6_metric;
729 	struct fib6_info *cont = NULL;
730 	int mpri = -1;
731 
732 	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
733 		       oif, strict, do_rr, &mpri);
734 
735 	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
736 		       oif, strict, do_rr, &mpri);
737 
738 	if (res->f6i || !cont)
739 		return;
740 
741 	__find_rr_leaf(cont, NULL, metric, res, NULL,
742 		       oif, strict, do_rr, &mpri);
743 }
744 
745 static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
746 		       struct fib6_result *res, int strict)
747 {
748 	struct fib6_info *leaf = rcu_dereference(fn->leaf);
749 	struct fib6_info *rt0;
750 	bool do_rr = false;
751 	int key_plen;
752 
753 	/* make sure this function or its helpers sets f6i */
754 	res->f6i = NULL;
755 
756 	if (!leaf || leaf == net->ipv6.fib6_null_entry)
757 		goto out;
758 
759 	rt0 = rcu_dereference(fn->rr_ptr);
760 	if (!rt0)
761 		rt0 = leaf;
762 
763 	/* Double check to make sure fn is not an intermediate node
764 	 * and fn->leaf does not points to its child's leaf
765 	 * (This might happen if all routes under fn are deleted from
766 	 * the tree and fib6_repair_tree() is called on the node.)
767 	 */
768 	key_plen = rt0->fib6_dst.plen;
769 #ifdef CONFIG_IPV6_SUBTREES
770 	if (rt0->fib6_src.plen)
771 		key_plen = rt0->fib6_src.plen;
772 #endif
773 	if (fn->fn_bit != key_plen)
774 		goto out;
775 
776 	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
777 	if (do_rr) {
778 		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
779 
780 		/* no entries matched; do round-robin */
781 		if (!next || next->fib6_metric != rt0->fib6_metric)
782 			next = leaf;
783 
784 		if (next != rt0) {
785 			spin_lock_bh(&leaf->fib6_table->tb6_lock);
786 			/* make sure next is not being deleted from the tree */
787 			if (next->fib6_node)
788 				rcu_assign_pointer(fn->rr_ptr, next);
789 			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
790 		}
791 	}
792 
793 out:
794 	if (!res->f6i) {
795 		res->f6i = net->ipv6.fib6_null_entry;
796 		res->nh = &res->f6i->fib6_nh;
797 		res->fib6_flags = res->f6i->fib6_flags;
798 		res->fib6_type = res->f6i->fib6_type;
799 	}
800 }
801 
802 static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
803 {
804 	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
805 	       res->nh->fib_nh_gw_family;
806 }
807 
808 #ifdef CONFIG_IPV6_ROUTE_INFO
809 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
810 		  const struct in6_addr *gwaddr)
811 {
812 	struct net *net = dev_net(dev);
813 	struct route_info *rinfo = (struct route_info *) opt;
814 	struct in6_addr prefix_buf, *prefix;
815 	unsigned int pref;
816 	unsigned long lifetime;
817 	struct fib6_info *rt;
818 
819 	if (len < sizeof(struct route_info)) {
820 		return -EINVAL;
821 	}
822 
823 	/* Sanity check for prefix_len and length */
824 	if (rinfo->length > 3) {
825 		return -EINVAL;
826 	} else if (rinfo->prefix_len > 128) {
827 		return -EINVAL;
828 	} else if (rinfo->prefix_len > 64) {
829 		if (rinfo->length < 2) {
830 			return -EINVAL;
831 		}
832 	} else if (rinfo->prefix_len > 0) {
833 		if (rinfo->length < 1) {
834 			return -EINVAL;
835 		}
836 	}
837 
838 	pref = rinfo->route_pref;
839 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
840 		return -EINVAL;
841 
842 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
843 
844 	if (rinfo->length == 3)
845 		prefix = (struct in6_addr *)rinfo->prefix;
846 	else {
847 		/* this function is safe */
848 		ipv6_addr_prefix(&prefix_buf,
849 				 (struct in6_addr *)rinfo->prefix,
850 				 rinfo->prefix_len);
851 		prefix = &prefix_buf;
852 	}
853 
854 	if (rinfo->prefix_len == 0)
855 		rt = rt6_get_dflt_router(net, gwaddr, dev);
856 	else
857 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
858 					gwaddr, dev);
859 
860 	if (rt && !lifetime) {
861 		ip6_del_rt(net, rt);
862 		rt = NULL;
863 	}
864 
865 	if (!rt && lifetime)
866 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
867 					dev, pref);
868 	else if (rt)
869 		rt->fib6_flags = RTF_ROUTEINFO |
870 				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
871 
872 	if (rt) {
873 		if (!addrconf_finite_timeout(lifetime))
874 			fib6_clean_expires(rt);
875 		else
876 			fib6_set_expires(rt, jiffies + HZ * lifetime);
877 
878 		fib6_info_release(rt);
879 	}
880 	return 0;
881 }
882 #endif
883 
884 /*
885  *	Misc support functions
886  */
887 
888 /* called with rcu_lock held */
889 static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
890 {
891 	struct net_device *dev = res->nh->fib_nh_dev;
892 
893 	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
894 		/* for copies of local routes, dst->dev needs to be the
895 		 * device if it is a master device, the master device if
896 		 * device is enslaved, and the loopback as the default
897 		 */
898 		if (netif_is_l3_slave(dev) &&
899 		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
900 			dev = l3mdev_master_dev_rcu(dev);
901 		else if (!netif_is_l3_master(dev))
902 			dev = dev_net(dev)->loopback_dev;
903 		/* last case is netif_is_l3_master(dev) is true in which
904 		 * case we want dev returned to be dev
905 		 */
906 	}
907 
908 	return dev;
909 }
910 
911 static const int fib6_prop[RTN_MAX + 1] = {
912 	[RTN_UNSPEC]	= 0,
913 	[RTN_UNICAST]	= 0,
914 	[RTN_LOCAL]	= 0,
915 	[RTN_BROADCAST]	= 0,
916 	[RTN_ANYCAST]	= 0,
917 	[RTN_MULTICAST]	= 0,
918 	[RTN_BLACKHOLE]	= -EINVAL,
919 	[RTN_UNREACHABLE] = -EHOSTUNREACH,
920 	[RTN_PROHIBIT]	= -EACCES,
921 	[RTN_THROW]	= -EAGAIN,
922 	[RTN_NAT]	= -EINVAL,
923 	[RTN_XRESOLVE]	= -EINVAL,
924 };
925 
926 static int ip6_rt_type_to_error(u8 fib6_type)
927 {
928 	return fib6_prop[fib6_type];
929 }
930 
931 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
932 {
933 	unsigned short flags = 0;
934 
935 	if (rt->dst_nocount)
936 		flags |= DST_NOCOUNT;
937 	if (rt->dst_nopolicy)
938 		flags |= DST_NOPOLICY;
939 	if (rt->dst_host)
940 		flags |= DST_HOST;
941 
942 	return flags;
943 }
944 
945 static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
946 {
947 	rt->dst.error = ip6_rt_type_to_error(fib6_type);
948 
949 	switch (fib6_type) {
950 	case RTN_BLACKHOLE:
951 		rt->dst.output = dst_discard_out;
952 		rt->dst.input = dst_discard;
953 		break;
954 	case RTN_PROHIBIT:
955 		rt->dst.output = ip6_pkt_prohibit_out;
956 		rt->dst.input = ip6_pkt_prohibit;
957 		break;
958 	case RTN_THROW:
959 	case RTN_UNREACHABLE:
960 	default:
961 		rt->dst.output = ip6_pkt_discard_out;
962 		rt->dst.input = ip6_pkt_discard;
963 		break;
964 	}
965 }
966 
967 static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
968 {
969 	struct fib6_info *f6i = res->f6i;
970 
971 	if (res->fib6_flags & RTF_REJECT) {
972 		ip6_rt_init_dst_reject(rt, res->fib6_type);
973 		return;
974 	}
975 
976 	rt->dst.error = 0;
977 	rt->dst.output = ip6_output;
978 
979 	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
980 		rt->dst.input = ip6_input;
981 	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
982 		rt->dst.input = ip6_mc_input;
983 	} else {
984 		rt->dst.input = ip6_forward;
985 	}
986 
987 	if (res->nh->fib_nh_lws) {
988 		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
989 		lwtunnel_set_redirect(&rt->dst);
990 	}
991 
992 	rt->dst.lastuse = jiffies;
993 }
994 
995 /* Caller must already hold reference to @from */
996 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
997 {
998 	rt->rt6i_flags &= ~RTF_EXPIRES;
999 	rcu_assign_pointer(rt->from, from);
1000 	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
1001 }
1002 
1003 /* Caller must already hold reference to f6i in result */
1004 static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
1005 {
1006 	const struct fib6_nh *nh = res->nh;
1007 	const struct net_device *dev = nh->fib_nh_dev;
1008 	struct fib6_info *f6i = res->f6i;
1009 
1010 	ip6_rt_init_dst(rt, res);
1011 
1012 	rt->rt6i_dst = f6i->fib6_dst;
1013 	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
1014 	rt->rt6i_flags = res->fib6_flags;
1015 	if (nh->fib_nh_gw_family) {
1016 		rt->rt6i_gateway = nh->fib_nh_gw6;
1017 		rt->rt6i_flags |= RTF_GATEWAY;
1018 	}
1019 	rt6_set_from(rt, f6i);
1020 #ifdef CONFIG_IPV6_SUBTREES
1021 	rt->rt6i_src = f6i->fib6_src;
1022 #endif
1023 }
1024 
1025 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1026 					struct in6_addr *saddr)
1027 {
1028 	struct fib6_node *pn, *sn;
1029 	while (1) {
1030 		if (fn->fn_flags & RTN_TL_ROOT)
1031 			return NULL;
1032 		pn = rcu_dereference(fn->parent);
1033 		sn = FIB6_SUBTREE(pn);
1034 		if (sn && sn != fn)
1035 			fn = fib6_node_lookup(sn, NULL, saddr);
1036 		else
1037 			fn = pn;
1038 		if (fn->fn_flags & RTN_RTINFO)
1039 			return fn;
1040 	}
1041 }
1042 
1043 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1044 {
1045 	struct rt6_info *rt = *prt;
1046 
1047 	if (dst_hold_safe(&rt->dst))
1048 		return true;
1049 	if (net) {
1050 		rt = net->ipv6.ip6_null_entry;
1051 		dst_hold(&rt->dst);
1052 	} else {
1053 		rt = NULL;
1054 	}
1055 	*prt = rt;
1056 	return false;
1057 }
1058 
1059 /* called with rcu_lock held */
1060 static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
1061 {
1062 	struct net_device *dev = res->nh->fib_nh_dev;
1063 	struct fib6_info *f6i = res->f6i;
1064 	unsigned short flags;
1065 	struct rt6_info *nrt;
1066 
1067 	if (!fib6_info_hold_safe(f6i))
1068 		goto fallback;
1069 
1070 	flags = fib6_info_dst_flags(f6i);
1071 	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1072 	if (!nrt) {
1073 		fib6_info_release(f6i);
1074 		goto fallback;
1075 	}
1076 
1077 	ip6_rt_copy_init(nrt, res);
1078 	return nrt;
1079 
1080 fallback:
1081 	nrt = dev_net(dev)->ipv6.ip6_null_entry;
1082 	dst_hold(&nrt->dst);
1083 	return nrt;
1084 }
1085 
1086 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1087 					     struct fib6_table *table,
1088 					     struct flowi6 *fl6,
1089 					     const struct sk_buff *skb,
1090 					     int flags)
1091 {
1092 	struct fib6_result res = {};
1093 	struct fib6_node *fn;
1094 	struct rt6_info *rt;
1095 
1096 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1097 		flags &= ~RT6_LOOKUP_F_IFACE;
1098 
1099 	rcu_read_lock();
1100 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1101 restart:
1102 	res.f6i = rcu_dereference(fn->leaf);
1103 	if (!res.f6i)
1104 		res.f6i = net->ipv6.fib6_null_entry;
1105 	else
1106 		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
1107 				 flags);
1108 
1109 	if (res.f6i == net->ipv6.fib6_null_entry) {
1110 		fn = fib6_backtrack(fn, &fl6->saddr);
1111 		if (fn)
1112 			goto restart;
1113 
1114 		rt = net->ipv6.ip6_null_entry;
1115 		dst_hold(&rt->dst);
1116 		goto out;
1117 	}
1118 
1119 	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
1120 			 fl6->flowi6_oif != 0, skb, flags);
1121 
1122 	/* Search through exception table */
1123 	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1124 	if (rt) {
1125 		if (ip6_hold_safe(net, &rt))
1126 			dst_use_noref(&rt->dst, jiffies);
1127 	} else {
1128 		rt = ip6_create_rt_rcu(&res);
1129 	}
1130 
1131 out:
1132 	trace_fib6_table_lookup(net, &res, table, fl6);
1133 
1134 	rcu_read_unlock();
1135 
1136 	return rt;
1137 }
1138 
1139 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1140 				   const struct sk_buff *skb, int flags)
1141 {
1142 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1143 }
1144 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1145 
1146 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1147 			    const struct in6_addr *saddr, int oif,
1148 			    const struct sk_buff *skb, int strict)
1149 {
1150 	struct flowi6 fl6 = {
1151 		.flowi6_oif = oif,
1152 		.daddr = *daddr,
1153 	};
1154 	struct dst_entry *dst;
1155 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1156 
1157 	if (saddr) {
1158 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1159 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1160 	}
1161 
1162 	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1163 	if (dst->error == 0)
1164 		return (struct rt6_info *) dst;
1165 
1166 	dst_release(dst);
1167 
1168 	return NULL;
1169 }
1170 EXPORT_SYMBOL(rt6_lookup);
1171 
1172 /* ip6_ins_rt is called with FREE table->tb6_lock.
1173  * It takes new route entry, the addition fails by any reason the
1174  * route is released.
1175  * Caller must hold dst before calling it.
1176  */
1177 
1178 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1179 			struct netlink_ext_ack *extack)
1180 {
1181 	int err;
1182 	struct fib6_table *table;
1183 
1184 	table = rt->fib6_table;
1185 	spin_lock_bh(&table->tb6_lock);
1186 	err = fib6_add(&table->tb6_root, rt, info, extack);
1187 	spin_unlock_bh(&table->tb6_lock);
1188 
1189 	return err;
1190 }
1191 
1192 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1193 {
1194 	struct nl_info info = {	.nl_net = net, };
1195 
1196 	return __ip6_ins_rt(rt, &info, NULL);
1197 }
1198 
1199 static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
1200 					   const struct in6_addr *daddr,
1201 					   const struct in6_addr *saddr)
1202 {
1203 	struct fib6_info *f6i = res->f6i;
1204 	struct net_device *dev;
1205 	struct rt6_info *rt;
1206 
1207 	/*
1208 	 *	Clone the route.
1209 	 */
1210 
1211 	if (!fib6_info_hold_safe(f6i))
1212 		return NULL;
1213 
1214 	dev = ip6_rt_get_dev_rcu(res);
1215 	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1216 	if (!rt) {
1217 		fib6_info_release(f6i);
1218 		return NULL;
1219 	}
1220 
1221 	ip6_rt_copy_init(rt, res);
1222 	rt->rt6i_flags |= RTF_CACHE;
1223 	rt->dst.flags |= DST_HOST;
1224 	rt->rt6i_dst.addr = *daddr;
1225 	rt->rt6i_dst.plen = 128;
1226 
1227 	if (!rt6_is_gw_or_nonexthop(res)) {
1228 		if (f6i->fib6_dst.plen != 128 &&
1229 		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
1230 			rt->rt6i_flags |= RTF_ANYCAST;
1231 #ifdef CONFIG_IPV6_SUBTREES
1232 		if (rt->rt6i_src.plen && saddr) {
1233 			rt->rt6i_src.addr = *saddr;
1234 			rt->rt6i_src.plen = 128;
1235 		}
1236 #endif
1237 	}
1238 
1239 	return rt;
1240 }
1241 
1242 static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
1243 {
1244 	struct fib6_info *f6i = res->f6i;
1245 	unsigned short flags = fib6_info_dst_flags(f6i);
1246 	struct net_device *dev;
1247 	struct rt6_info *pcpu_rt;
1248 
1249 	if (!fib6_info_hold_safe(f6i))
1250 		return NULL;
1251 
1252 	rcu_read_lock();
1253 	dev = ip6_rt_get_dev_rcu(res);
1254 	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1255 	rcu_read_unlock();
1256 	if (!pcpu_rt) {
1257 		fib6_info_release(f6i);
1258 		return NULL;
1259 	}
1260 	ip6_rt_copy_init(pcpu_rt, res);
1261 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1262 	return pcpu_rt;
1263 }
1264 
1265 /* It should be called with rcu_read_lock() acquired */
1266 static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
1267 {
1268 	struct rt6_info *pcpu_rt, **p;
1269 
1270 	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
1271 	pcpu_rt = *p;
1272 
1273 	if (pcpu_rt)
1274 		ip6_hold_safe(NULL, &pcpu_rt);
1275 
1276 	return pcpu_rt;
1277 }
1278 
1279 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1280 					    const struct fib6_result *res)
1281 {
1282 	struct rt6_info *pcpu_rt, *prev, **p;
1283 
1284 	pcpu_rt = ip6_rt_pcpu_alloc(res);
1285 	if (!pcpu_rt) {
1286 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1287 		return net->ipv6.ip6_null_entry;
1288 	}
1289 
1290 	dst_hold(&pcpu_rt->dst);
1291 	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
1292 	prev = cmpxchg(p, NULL, pcpu_rt);
1293 	BUG_ON(prev);
1294 
1295 	if (res->f6i->fib6_destroying) {
1296 		struct fib6_info *from;
1297 
1298 		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
1299 		fib6_info_release(from);
1300 	}
1301 
1302 	return pcpu_rt;
1303 }
1304 
1305 /* exception hash table implementation
1306  */
1307 static DEFINE_SPINLOCK(rt6_exception_lock);
1308 
1309 /* Remove rt6_ex from hash table and free the memory
1310  * Caller must hold rt6_exception_lock
1311  */
1312 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1313 				 struct rt6_exception *rt6_ex)
1314 {
1315 	struct fib6_info *from;
1316 	struct net *net;
1317 
1318 	if (!bucket || !rt6_ex)
1319 		return;
1320 
1321 	net = dev_net(rt6_ex->rt6i->dst.dev);
1322 	net->ipv6.rt6_stats->fib_rt_cache--;
1323 
1324 	/* purge completely the exception to allow releasing the held resources:
1325 	 * some [sk] cache may keep the dst around for unlimited time
1326 	 */
1327 	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
1328 	fib6_info_release(from);
1329 	dst_dev_put(&rt6_ex->rt6i->dst);
1330 
1331 	hlist_del_rcu(&rt6_ex->hlist);
1332 	dst_release(&rt6_ex->rt6i->dst);
1333 	kfree_rcu(rt6_ex, rcu);
1334 	WARN_ON_ONCE(!bucket->depth);
1335 	bucket->depth--;
1336 }
1337 
1338 /* Remove oldest rt6_ex in bucket and free the memory
1339  * Caller must hold rt6_exception_lock
1340  */
1341 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1342 {
1343 	struct rt6_exception *rt6_ex, *oldest = NULL;
1344 
1345 	if (!bucket)
1346 		return;
1347 
1348 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1349 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1350 			oldest = rt6_ex;
1351 	}
1352 	rt6_remove_exception(bucket, oldest);
1353 }
1354 
1355 static u32 rt6_exception_hash(const struct in6_addr *dst,
1356 			      const struct in6_addr *src)
1357 {
1358 	static u32 seed __read_mostly;
1359 	u32 val;
1360 
1361 	net_get_random_once(&seed, sizeof(seed));
1362 	val = jhash(dst, sizeof(*dst), seed);
1363 
1364 #ifdef CONFIG_IPV6_SUBTREES
1365 	if (src)
1366 		val = jhash(src, sizeof(*src), val);
1367 #endif
1368 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1369 }
1370 
1371 /* Helper function to find the cached rt in the hash table
1372  * and update bucket pointer to point to the bucket for this
1373  * (daddr, saddr) pair
1374  * Caller must hold rt6_exception_lock
1375  */
1376 static struct rt6_exception *
1377 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1378 			      const struct in6_addr *daddr,
1379 			      const struct in6_addr *saddr)
1380 {
1381 	struct rt6_exception *rt6_ex;
1382 	u32 hval;
1383 
1384 	if (!(*bucket) || !daddr)
1385 		return NULL;
1386 
1387 	hval = rt6_exception_hash(daddr, saddr);
1388 	*bucket += hval;
1389 
1390 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1391 		struct rt6_info *rt6 = rt6_ex->rt6i;
1392 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1393 
1394 #ifdef CONFIG_IPV6_SUBTREES
1395 		if (matched && saddr)
1396 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1397 #endif
1398 		if (matched)
1399 			return rt6_ex;
1400 	}
1401 	return NULL;
1402 }
1403 
1404 /* Helper function to find the cached rt in the hash table
1405  * and update bucket pointer to point to the bucket for this
1406  * (daddr, saddr) pair
1407  * Caller must hold rcu_read_lock()
1408  */
1409 static struct rt6_exception *
1410 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1411 			 const struct in6_addr *daddr,
1412 			 const struct in6_addr *saddr)
1413 {
1414 	struct rt6_exception *rt6_ex;
1415 	u32 hval;
1416 
1417 	WARN_ON_ONCE(!rcu_read_lock_held());
1418 
1419 	if (!(*bucket) || !daddr)
1420 		return NULL;
1421 
1422 	hval = rt6_exception_hash(daddr, saddr);
1423 	*bucket += hval;
1424 
1425 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1426 		struct rt6_info *rt6 = rt6_ex->rt6i;
1427 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1428 
1429 #ifdef CONFIG_IPV6_SUBTREES
1430 		if (matched && saddr)
1431 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1432 #endif
1433 		if (matched)
1434 			return rt6_ex;
1435 	}
1436 	return NULL;
1437 }
1438 
1439 static unsigned int fib6_mtu(const struct fib6_result *res)
1440 {
1441 	const struct fib6_nh *nh = res->nh;
1442 	unsigned int mtu;
1443 
1444 	if (res->f6i->fib6_pmtu) {
1445 		mtu = res->f6i->fib6_pmtu;
1446 	} else {
1447 		struct net_device *dev = nh->fib_nh_dev;
1448 		struct inet6_dev *idev;
1449 
1450 		rcu_read_lock();
1451 		idev = __in6_dev_get(dev);
1452 		mtu = idev->cnf.mtu6;
1453 		rcu_read_unlock();
1454 	}
1455 
1456 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1457 
1458 	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
1459 }
1460 
1461 static int rt6_insert_exception(struct rt6_info *nrt,
1462 				const struct fib6_result *res)
1463 {
1464 	struct net *net = dev_net(nrt->dst.dev);
1465 	struct rt6_exception_bucket *bucket;
1466 	struct in6_addr *src_key = NULL;
1467 	struct rt6_exception *rt6_ex;
1468 	struct fib6_info *f6i = res->f6i;
1469 	int err = 0;
1470 
1471 	spin_lock_bh(&rt6_exception_lock);
1472 
1473 	if (f6i->exception_bucket_flushed) {
1474 		err = -EINVAL;
1475 		goto out;
1476 	}
1477 
1478 	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
1479 					lockdep_is_held(&rt6_exception_lock));
1480 	if (!bucket) {
1481 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1482 				 GFP_ATOMIC);
1483 		if (!bucket) {
1484 			err = -ENOMEM;
1485 			goto out;
1486 		}
1487 		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
1488 	}
1489 
1490 #ifdef CONFIG_IPV6_SUBTREES
1491 	/* fib6_src.plen != 0 indicates f6i is in subtree
1492 	 * and exception table is indexed by a hash of
1493 	 * both fib6_dst and fib6_src.
1494 	 * Otherwise, the exception table is indexed by
1495 	 * a hash of only fib6_dst.
1496 	 */
1497 	if (f6i->fib6_src.plen)
1498 		src_key = &nrt->rt6i_src.addr;
1499 #endif
1500 	/* rt6_mtu_change() might lower mtu on f6i.
1501 	 * Only insert this exception route if its mtu
1502 	 * is less than f6i's mtu value.
1503 	 */
1504 	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
1505 		err = -EINVAL;
1506 		goto out;
1507 	}
1508 
1509 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1510 					       src_key);
1511 	if (rt6_ex)
1512 		rt6_remove_exception(bucket, rt6_ex);
1513 
1514 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1515 	if (!rt6_ex) {
1516 		err = -ENOMEM;
1517 		goto out;
1518 	}
1519 	rt6_ex->rt6i = nrt;
1520 	rt6_ex->stamp = jiffies;
1521 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1522 	bucket->depth++;
1523 	net->ipv6.rt6_stats->fib_rt_cache++;
1524 
1525 	if (bucket->depth > FIB6_MAX_DEPTH)
1526 		rt6_exception_remove_oldest(bucket);
1527 
1528 out:
1529 	spin_unlock_bh(&rt6_exception_lock);
1530 
1531 	/* Update fn->fn_sernum to invalidate all cached dst */
1532 	if (!err) {
1533 		spin_lock_bh(&f6i->fib6_table->tb6_lock);
1534 		fib6_update_sernum(net, f6i);
1535 		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1536 		fib6_force_start_gc(net);
1537 	}
1538 
1539 	return err;
1540 }
1541 
1542 void rt6_flush_exceptions(struct fib6_info *rt)
1543 {
1544 	struct rt6_exception_bucket *bucket;
1545 	struct rt6_exception *rt6_ex;
1546 	struct hlist_node *tmp;
1547 	int i;
1548 
1549 	spin_lock_bh(&rt6_exception_lock);
1550 	/* Prevent rt6_insert_exception() to recreate the bucket list */
1551 	rt->exception_bucket_flushed = 1;
1552 
1553 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1554 				    lockdep_is_held(&rt6_exception_lock));
1555 	if (!bucket)
1556 		goto out;
1557 
1558 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1559 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1560 			rt6_remove_exception(bucket, rt6_ex);
1561 		WARN_ON_ONCE(bucket->depth);
1562 		bucket++;
1563 	}
1564 
1565 out:
1566 	spin_unlock_bh(&rt6_exception_lock);
1567 }
1568 
1569 /* Find cached rt in the hash table inside passed in rt
1570  * Caller has to hold rcu_read_lock()
1571  */
1572 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
1573 					   const struct in6_addr *daddr,
1574 					   const struct in6_addr *saddr)
1575 {
1576 	const struct in6_addr *src_key = NULL;
1577 	struct rt6_exception_bucket *bucket;
1578 	struct rt6_exception *rt6_ex;
1579 	struct rt6_info *ret = NULL;
1580 
1581 #ifdef CONFIG_IPV6_SUBTREES
1582 	/* fib6i_src.plen != 0 indicates f6i is in subtree
1583 	 * and exception table is indexed by a hash of
1584 	 * both fib6_dst and fib6_src.
1585 	 * However, the src addr used to create the hash
1586 	 * might not be exactly the passed in saddr which
1587 	 * is a /128 addr from the flow.
1588 	 * So we need to use f6i->fib6_src to redo lookup
1589 	 * if the passed in saddr does not find anything.
1590 	 * (See the logic in ip6_rt_cache_alloc() on how
1591 	 * rt->rt6i_src is updated.)
1592 	 */
1593 	if (res->f6i->fib6_src.plen)
1594 		src_key = saddr;
1595 find_ex:
1596 #endif
1597 	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
1598 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1599 
1600 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1601 		ret = rt6_ex->rt6i;
1602 
1603 #ifdef CONFIG_IPV6_SUBTREES
1604 	/* Use fib6_src as src_key and redo lookup */
1605 	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
1606 		src_key = &res->f6i->fib6_src.addr;
1607 		goto find_ex;
1608 	}
1609 #endif
1610 
1611 	return ret;
1612 }
1613 
1614 /* Remove the passed in cached rt from the hash table that contains it */
1615 static int rt6_remove_exception_rt(struct rt6_info *rt)
1616 {
1617 	struct rt6_exception_bucket *bucket;
1618 	struct in6_addr *src_key = NULL;
1619 	struct rt6_exception *rt6_ex;
1620 	struct fib6_info *from;
1621 	int err;
1622 
1623 	from = rcu_dereference(rt->from);
1624 	if (!from ||
1625 	    !(rt->rt6i_flags & RTF_CACHE))
1626 		return -EINVAL;
1627 
1628 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1629 		return -ENOENT;
1630 
1631 	spin_lock_bh(&rt6_exception_lock);
1632 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1633 				    lockdep_is_held(&rt6_exception_lock));
1634 #ifdef CONFIG_IPV6_SUBTREES
1635 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1636 	 * and exception table is indexed by a hash of
1637 	 * both rt6i_dst and rt6i_src.
1638 	 * Otherwise, the exception table is indexed by
1639 	 * a hash of only rt6i_dst.
1640 	 */
1641 	if (from->fib6_src.plen)
1642 		src_key = &rt->rt6i_src.addr;
1643 #endif
1644 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1645 					       &rt->rt6i_dst.addr,
1646 					       src_key);
1647 	if (rt6_ex) {
1648 		rt6_remove_exception(bucket, rt6_ex);
1649 		err = 0;
1650 	} else {
1651 		err = -ENOENT;
1652 	}
1653 
1654 	spin_unlock_bh(&rt6_exception_lock);
1655 	return err;
1656 }
1657 
1658 /* Find rt6_ex which contains the passed in rt cache and
1659  * refresh its stamp
1660  */
1661 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1662 {
1663 	struct rt6_exception_bucket *bucket;
1664 	struct in6_addr *src_key = NULL;
1665 	struct rt6_exception *rt6_ex;
1666 	struct fib6_info *from;
1667 
1668 	rcu_read_lock();
1669 	from = rcu_dereference(rt->from);
1670 	if (!from || !(rt->rt6i_flags & RTF_CACHE))
1671 		goto unlock;
1672 
1673 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1674 
1675 #ifdef CONFIG_IPV6_SUBTREES
1676 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1677 	 * and exception table is indexed by a hash of
1678 	 * both rt6i_dst and rt6i_src.
1679 	 * Otherwise, the exception table is indexed by
1680 	 * a hash of only rt6i_dst.
1681 	 */
1682 	if (from->fib6_src.plen)
1683 		src_key = &rt->rt6i_src.addr;
1684 #endif
1685 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1686 					  &rt->rt6i_dst.addr,
1687 					  src_key);
1688 	if (rt6_ex)
1689 		rt6_ex->stamp = jiffies;
1690 
1691 unlock:
1692 	rcu_read_unlock();
1693 }
1694 
1695 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1696 					 struct rt6_info *rt, int mtu)
1697 {
1698 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1699 	 * lowest MTU in the path: always allow updating the route PMTU to
1700 	 * reflect PMTU decreases.
1701 	 *
1702 	 * If the new MTU is higher, and the route PMTU is equal to the local
1703 	 * MTU, this means the old MTU is the lowest in the path, so allow
1704 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1705 	 * handle this.
1706 	 */
1707 
1708 	if (dst_mtu(&rt->dst) >= mtu)
1709 		return true;
1710 
1711 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1712 		return true;
1713 
1714 	return false;
1715 }
1716 
1717 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1718 				       struct fib6_info *rt, int mtu)
1719 {
1720 	struct rt6_exception_bucket *bucket;
1721 	struct rt6_exception *rt6_ex;
1722 	int i;
1723 
1724 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1725 					lockdep_is_held(&rt6_exception_lock));
1726 
1727 	if (!bucket)
1728 		return;
1729 
1730 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1731 		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1732 			struct rt6_info *entry = rt6_ex->rt6i;
1733 
1734 			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1735 			 * route), the metrics of its rt->from have already
1736 			 * been updated.
1737 			 */
1738 			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1739 			    rt6_mtu_change_route_allowed(idev, entry, mtu))
1740 				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1741 		}
1742 		bucket++;
1743 	}
1744 }
1745 
1746 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1747 
1748 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1749 					struct in6_addr *gateway)
1750 {
1751 	struct rt6_exception_bucket *bucket;
1752 	struct rt6_exception *rt6_ex;
1753 	struct hlist_node *tmp;
1754 	int i;
1755 
1756 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1757 		return;
1758 
1759 	spin_lock_bh(&rt6_exception_lock);
1760 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1761 				     lockdep_is_held(&rt6_exception_lock));
1762 
1763 	if (bucket) {
1764 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1765 			hlist_for_each_entry_safe(rt6_ex, tmp,
1766 						  &bucket->chain, hlist) {
1767 				struct rt6_info *entry = rt6_ex->rt6i;
1768 
1769 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1770 				    RTF_CACHE_GATEWAY &&
1771 				    ipv6_addr_equal(gateway,
1772 						    &entry->rt6i_gateway)) {
1773 					rt6_remove_exception(bucket, rt6_ex);
1774 				}
1775 			}
1776 			bucket++;
1777 		}
1778 	}
1779 
1780 	spin_unlock_bh(&rt6_exception_lock);
1781 }
1782 
1783 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1784 				      struct rt6_exception *rt6_ex,
1785 				      struct fib6_gc_args *gc_args,
1786 				      unsigned long now)
1787 {
1788 	struct rt6_info *rt = rt6_ex->rt6i;
1789 
1790 	/* we are pruning and obsoleting aged-out and non gateway exceptions
1791 	 * even if others have still references to them, so that on next
1792 	 * dst_check() such references can be dropped.
1793 	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1794 	 * expired, independently from their aging, as per RFC 8201 section 4
1795 	 */
1796 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1797 		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1798 			RT6_TRACE("aging clone %p\n", rt);
1799 			rt6_remove_exception(bucket, rt6_ex);
1800 			return;
1801 		}
1802 	} else if (time_after(jiffies, rt->dst.expires)) {
1803 		RT6_TRACE("purging expired route %p\n", rt);
1804 		rt6_remove_exception(bucket, rt6_ex);
1805 		return;
1806 	}
1807 
1808 	if (rt->rt6i_flags & RTF_GATEWAY) {
1809 		struct neighbour *neigh;
1810 		__u8 neigh_flags = 0;
1811 
1812 		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1813 		if (neigh)
1814 			neigh_flags = neigh->flags;
1815 
1816 		if (!(neigh_flags & NTF_ROUTER)) {
1817 			RT6_TRACE("purging route %p via non-router but gateway\n",
1818 				  rt);
1819 			rt6_remove_exception(bucket, rt6_ex);
1820 			return;
1821 		}
1822 	}
1823 
1824 	gc_args->more++;
1825 }
1826 
1827 void rt6_age_exceptions(struct fib6_info *rt,
1828 			struct fib6_gc_args *gc_args,
1829 			unsigned long now)
1830 {
1831 	struct rt6_exception_bucket *bucket;
1832 	struct rt6_exception *rt6_ex;
1833 	struct hlist_node *tmp;
1834 	int i;
1835 
1836 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1837 		return;
1838 
1839 	rcu_read_lock_bh();
1840 	spin_lock(&rt6_exception_lock);
1841 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1842 				    lockdep_is_held(&rt6_exception_lock));
1843 
1844 	if (bucket) {
1845 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1846 			hlist_for_each_entry_safe(rt6_ex, tmp,
1847 						  &bucket->chain, hlist) {
1848 				rt6_age_examine_exception(bucket, rt6_ex,
1849 							  gc_args, now);
1850 			}
1851 			bucket++;
1852 		}
1853 	}
1854 	spin_unlock(&rt6_exception_lock);
1855 	rcu_read_unlock_bh();
1856 }
1857 
1858 /* must be called with rcu lock held */
1859 int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
1860 		      struct flowi6 *fl6, struct fib6_result *res, int strict)
1861 {
1862 	struct fib6_node *fn, *saved_fn;
1863 
1864 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1865 	saved_fn = fn;
1866 
1867 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1868 		oif = 0;
1869 
1870 redo_rt6_select:
1871 	rt6_select(net, fn, oif, res, strict);
1872 	if (res->f6i == net->ipv6.fib6_null_entry) {
1873 		fn = fib6_backtrack(fn, &fl6->saddr);
1874 		if (fn)
1875 			goto redo_rt6_select;
1876 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1877 			/* also consider unreachable route */
1878 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1879 			fn = saved_fn;
1880 			goto redo_rt6_select;
1881 		}
1882 	}
1883 
1884 	trace_fib6_table_lookup(net, res, table, fl6);
1885 
1886 	return 0;
1887 }
1888 
1889 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1890 			       int oif, struct flowi6 *fl6,
1891 			       const struct sk_buff *skb, int flags)
1892 {
1893 	struct fib6_result res = {};
1894 	struct rt6_info *rt;
1895 	int strict = 0;
1896 
1897 	strict |= flags & RT6_LOOKUP_F_IFACE;
1898 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1899 	if (net->ipv6.devconf_all->forwarding == 0)
1900 		strict |= RT6_LOOKUP_F_REACHABLE;
1901 
1902 	rcu_read_lock();
1903 
1904 	fib6_table_lookup(net, table, oif, fl6, &res, strict);
1905 	if (res.f6i == net->ipv6.fib6_null_entry) {
1906 		rt = net->ipv6.ip6_null_entry;
1907 		rcu_read_unlock();
1908 		dst_hold(&rt->dst);
1909 		return rt;
1910 	}
1911 
1912 	fib6_select_path(net, &res, fl6, oif, false, skb, strict);
1913 
1914 	/*Search through exception table */
1915 	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1916 	if (rt) {
1917 		if (ip6_hold_safe(net, &rt))
1918 			dst_use_noref(&rt->dst, jiffies);
1919 
1920 		rcu_read_unlock();
1921 		return rt;
1922 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1923 			    !res.nh->fib_nh_gw_family)) {
1924 		/* Create a RTF_CACHE clone which will not be
1925 		 * owned by the fib6 tree.  It is for the special case where
1926 		 * the daddr in the skb during the neighbor look-up is different
1927 		 * from the fl6->daddr used to look-up route here.
1928 		 */
1929 		struct rt6_info *uncached_rt;
1930 
1931 		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
1932 
1933 		rcu_read_unlock();
1934 
1935 		if (uncached_rt) {
1936 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1937 			 * No need for another dst_hold()
1938 			 */
1939 			rt6_uncached_list_add(uncached_rt);
1940 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1941 		} else {
1942 			uncached_rt = net->ipv6.ip6_null_entry;
1943 			dst_hold(&uncached_rt->dst);
1944 		}
1945 
1946 		return uncached_rt;
1947 	} else {
1948 		/* Get a percpu copy */
1949 
1950 		struct rt6_info *pcpu_rt;
1951 
1952 		local_bh_disable();
1953 		pcpu_rt = rt6_get_pcpu_route(&res);
1954 
1955 		if (!pcpu_rt)
1956 			pcpu_rt = rt6_make_pcpu_route(net, &res);
1957 
1958 		local_bh_enable();
1959 		rcu_read_unlock();
1960 
1961 		return pcpu_rt;
1962 	}
1963 }
1964 EXPORT_SYMBOL_GPL(ip6_pol_route);
1965 
1966 static struct rt6_info *ip6_pol_route_input(struct net *net,
1967 					    struct fib6_table *table,
1968 					    struct flowi6 *fl6,
1969 					    const struct sk_buff *skb,
1970 					    int flags)
1971 {
1972 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1973 }
1974 
1975 struct dst_entry *ip6_route_input_lookup(struct net *net,
1976 					 struct net_device *dev,
1977 					 struct flowi6 *fl6,
1978 					 const struct sk_buff *skb,
1979 					 int flags)
1980 {
1981 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1982 		flags |= RT6_LOOKUP_F_IFACE;
1983 
1984 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1985 }
1986 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1987 
1988 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1989 				  struct flow_keys *keys,
1990 				  struct flow_keys *flkeys)
1991 {
1992 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1993 	const struct ipv6hdr *key_iph = outer_iph;
1994 	struct flow_keys *_flkeys = flkeys;
1995 	const struct ipv6hdr *inner_iph;
1996 	const struct icmp6hdr *icmph;
1997 	struct ipv6hdr _inner_iph;
1998 	struct icmp6hdr _icmph;
1999 
2000 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
2001 		goto out;
2002 
2003 	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
2004 				   sizeof(_icmph), &_icmph);
2005 	if (!icmph)
2006 		goto out;
2007 
2008 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
2009 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
2010 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
2011 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
2012 		goto out;
2013 
2014 	inner_iph = skb_header_pointer(skb,
2015 				       skb_transport_offset(skb) + sizeof(*icmph),
2016 				       sizeof(_inner_iph), &_inner_iph);
2017 	if (!inner_iph)
2018 		goto out;
2019 
2020 	key_iph = inner_iph;
2021 	_flkeys = NULL;
2022 out:
2023 	if (_flkeys) {
2024 		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
2025 		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
2026 		keys->tags.flow_label = _flkeys->tags.flow_label;
2027 		keys->basic.ip_proto = _flkeys->basic.ip_proto;
2028 	} else {
2029 		keys->addrs.v6addrs.src = key_iph->saddr;
2030 		keys->addrs.v6addrs.dst = key_iph->daddr;
2031 		keys->tags.flow_label = ip6_flowlabel(key_iph);
2032 		keys->basic.ip_proto = key_iph->nexthdr;
2033 	}
2034 }
2035 
2036 /* if skb is set it will be used and fl6 can be NULL */
2037 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2038 		       const struct sk_buff *skb, struct flow_keys *flkeys)
2039 {
2040 	struct flow_keys hash_keys;
2041 	u32 mhash;
2042 
2043 	switch (ip6_multipath_hash_policy(net)) {
2044 	case 0:
2045 		memset(&hash_keys, 0, sizeof(hash_keys));
2046 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2047 		if (skb) {
2048 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2049 		} else {
2050 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2051 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2052 			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2053 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2054 		}
2055 		break;
2056 	case 1:
2057 		if (skb) {
2058 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2059 			struct flow_keys keys;
2060 
2061 			/* short-circuit if we already have L4 hash present */
2062 			if (skb->l4_hash)
2063 				return skb_get_hash_raw(skb) >> 1;
2064 
2065 			memset(&hash_keys, 0, sizeof(hash_keys));
2066 
2067                         if (!flkeys) {
2068 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2069 				flkeys = &keys;
2070 			}
2071 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2072 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2073 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2074 			hash_keys.ports.src = flkeys->ports.src;
2075 			hash_keys.ports.dst = flkeys->ports.dst;
2076 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2077 		} else {
2078 			memset(&hash_keys, 0, sizeof(hash_keys));
2079 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2080 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2081 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2082 			hash_keys.ports.src = fl6->fl6_sport;
2083 			hash_keys.ports.dst = fl6->fl6_dport;
2084 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2085 		}
2086 		break;
2087 	}
2088 	mhash = flow_hash_from_keys(&hash_keys);
2089 
2090 	return mhash >> 1;
2091 }
2092 
2093 void ip6_route_input(struct sk_buff *skb)
2094 {
2095 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2096 	struct net *net = dev_net(skb->dev);
2097 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2098 	struct ip_tunnel_info *tun_info;
2099 	struct flowi6 fl6 = {
2100 		.flowi6_iif = skb->dev->ifindex,
2101 		.daddr = iph->daddr,
2102 		.saddr = iph->saddr,
2103 		.flowlabel = ip6_flowinfo(iph),
2104 		.flowi6_mark = skb->mark,
2105 		.flowi6_proto = iph->nexthdr,
2106 	};
2107 	struct flow_keys *flkeys = NULL, _flkeys;
2108 
2109 	tun_info = skb_tunnel_info(skb);
2110 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2111 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2112 
2113 	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2114 		flkeys = &_flkeys;
2115 
2116 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2117 		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2118 	skb_dst_drop(skb);
2119 	skb_dst_set(skb,
2120 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2121 }
2122 
2123 static struct rt6_info *ip6_pol_route_output(struct net *net,
2124 					     struct fib6_table *table,
2125 					     struct flowi6 *fl6,
2126 					     const struct sk_buff *skb,
2127 					     int flags)
2128 {
2129 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2130 }
2131 
2132 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2133 					 struct flowi6 *fl6, int flags)
2134 {
2135 	bool any_src;
2136 
2137 	if (ipv6_addr_type(&fl6->daddr) &
2138 	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2139 		struct dst_entry *dst;
2140 
2141 		dst = l3mdev_link_scope_lookup(net, fl6);
2142 		if (dst)
2143 			return dst;
2144 	}
2145 
2146 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2147 
2148 	any_src = ipv6_addr_any(&fl6->saddr);
2149 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2150 	    (fl6->flowi6_oif && any_src))
2151 		flags |= RT6_LOOKUP_F_IFACE;
2152 
2153 	if (!any_src)
2154 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2155 	else if (sk)
2156 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2157 
2158 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2159 }
2160 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2161 
2162 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2163 {
2164 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2165 	struct net_device *loopback_dev = net->loopback_dev;
2166 	struct dst_entry *new = NULL;
2167 
2168 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2169 		       DST_OBSOLETE_DEAD, 0);
2170 	if (rt) {
2171 		rt6_info_init(rt);
2172 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2173 
2174 		new = &rt->dst;
2175 		new->__use = 1;
2176 		new->input = dst_discard;
2177 		new->output = dst_discard_out;
2178 
2179 		dst_copy_metrics(new, &ort->dst);
2180 
2181 		rt->rt6i_idev = in6_dev_get(loopback_dev);
2182 		rt->rt6i_gateway = ort->rt6i_gateway;
2183 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2184 
2185 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2186 #ifdef CONFIG_IPV6_SUBTREES
2187 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2188 #endif
2189 	}
2190 
2191 	dst_release(dst_orig);
2192 	return new ? new : ERR_PTR(-ENOMEM);
2193 }
2194 
2195 /*
2196  *	Destination cache support functions
2197  */
2198 
2199 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2200 {
2201 	u32 rt_cookie = 0;
2202 
2203 	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2204 		return false;
2205 
2206 	if (fib6_check_expired(f6i))
2207 		return false;
2208 
2209 	return true;
2210 }
2211 
2212 static struct dst_entry *rt6_check(struct rt6_info *rt,
2213 				   struct fib6_info *from,
2214 				   u32 cookie)
2215 {
2216 	u32 rt_cookie = 0;
2217 
2218 	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2219 	    rt_cookie != cookie)
2220 		return NULL;
2221 
2222 	if (rt6_check_expired(rt))
2223 		return NULL;
2224 
2225 	return &rt->dst;
2226 }
2227 
2228 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2229 					    struct fib6_info *from,
2230 					    u32 cookie)
2231 {
2232 	if (!__rt6_check_expired(rt) &&
2233 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2234 	    fib6_check(from, cookie))
2235 		return &rt->dst;
2236 	else
2237 		return NULL;
2238 }
2239 
2240 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2241 {
2242 	struct dst_entry *dst_ret;
2243 	struct fib6_info *from;
2244 	struct rt6_info *rt;
2245 
2246 	rt = container_of(dst, struct rt6_info, dst);
2247 
2248 	rcu_read_lock();
2249 
2250 	/* All IPV6 dsts are created with ->obsolete set to the value
2251 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2252 	 * into this function always.
2253 	 */
2254 
2255 	from = rcu_dereference(rt->from);
2256 
2257 	if (from && (rt->rt6i_flags & RTF_PCPU ||
2258 	    unlikely(!list_empty(&rt->rt6i_uncached))))
2259 		dst_ret = rt6_dst_from_check(rt, from, cookie);
2260 	else
2261 		dst_ret = rt6_check(rt, from, cookie);
2262 
2263 	rcu_read_unlock();
2264 
2265 	return dst_ret;
2266 }
2267 
2268 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2269 {
2270 	struct rt6_info *rt = (struct rt6_info *) dst;
2271 
2272 	if (rt) {
2273 		if (rt->rt6i_flags & RTF_CACHE) {
2274 			rcu_read_lock();
2275 			if (rt6_check_expired(rt)) {
2276 				rt6_remove_exception_rt(rt);
2277 				dst = NULL;
2278 			}
2279 			rcu_read_unlock();
2280 		} else {
2281 			dst_release(dst);
2282 			dst = NULL;
2283 		}
2284 	}
2285 	return dst;
2286 }
2287 
2288 static void ip6_link_failure(struct sk_buff *skb)
2289 {
2290 	struct rt6_info *rt;
2291 
2292 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2293 
2294 	rt = (struct rt6_info *) skb_dst(skb);
2295 	if (rt) {
2296 		rcu_read_lock();
2297 		if (rt->rt6i_flags & RTF_CACHE) {
2298 			rt6_remove_exception_rt(rt);
2299 		} else {
2300 			struct fib6_info *from;
2301 			struct fib6_node *fn;
2302 
2303 			from = rcu_dereference(rt->from);
2304 			if (from) {
2305 				fn = rcu_dereference(from->fib6_node);
2306 				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2307 					fn->fn_sernum = -1;
2308 			}
2309 		}
2310 		rcu_read_unlock();
2311 	}
2312 }
2313 
2314 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2315 {
2316 	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2317 		struct fib6_info *from;
2318 
2319 		rcu_read_lock();
2320 		from = rcu_dereference(rt0->from);
2321 		if (from)
2322 			rt0->dst.expires = from->expires;
2323 		rcu_read_unlock();
2324 	}
2325 
2326 	dst_set_expires(&rt0->dst, timeout);
2327 	rt0->rt6i_flags |= RTF_EXPIRES;
2328 }
2329 
2330 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2331 {
2332 	struct net *net = dev_net(rt->dst.dev);
2333 
2334 	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2335 	rt->rt6i_flags |= RTF_MODIFIED;
2336 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2337 }
2338 
2339 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2340 {
2341 	return !(rt->rt6i_flags & RTF_CACHE) &&
2342 		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2343 }
2344 
2345 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2346 				 const struct ipv6hdr *iph, u32 mtu)
2347 {
2348 	const struct in6_addr *daddr, *saddr;
2349 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2350 
2351 	if (dst_metric_locked(dst, RTAX_MTU))
2352 		return;
2353 
2354 	if (iph) {
2355 		daddr = &iph->daddr;
2356 		saddr = &iph->saddr;
2357 	} else if (sk) {
2358 		daddr = &sk->sk_v6_daddr;
2359 		saddr = &inet6_sk(sk)->saddr;
2360 	} else {
2361 		daddr = NULL;
2362 		saddr = NULL;
2363 	}
2364 	dst_confirm_neigh(dst, daddr);
2365 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2366 	if (mtu >= dst_mtu(dst))
2367 		return;
2368 
2369 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2370 		rt6_do_update_pmtu(rt6, mtu);
2371 		/* update rt6_ex->stamp for cache */
2372 		if (rt6->rt6i_flags & RTF_CACHE)
2373 			rt6_update_exception_stamp_rt(rt6);
2374 	} else if (daddr) {
2375 		struct fib6_result res = {};
2376 		struct rt6_info *nrt6;
2377 
2378 		rcu_read_lock();
2379 		res.f6i = rcu_dereference(rt6->from);
2380 		if (!res.f6i) {
2381 			rcu_read_unlock();
2382 			return;
2383 		}
2384 		res.nh = &res.f6i->fib6_nh;
2385 		res.fib6_flags = res.f6i->fib6_flags;
2386 		res.fib6_type = res.f6i->fib6_type;
2387 
2388 		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
2389 		if (nrt6) {
2390 			rt6_do_update_pmtu(nrt6, mtu);
2391 			if (rt6_insert_exception(nrt6, &res))
2392 				dst_release_immediate(&nrt6->dst);
2393 		}
2394 		rcu_read_unlock();
2395 	}
2396 }
2397 
2398 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2399 			       struct sk_buff *skb, u32 mtu)
2400 {
2401 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2402 }
2403 
2404 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2405 		     int oif, u32 mark, kuid_t uid)
2406 {
2407 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2408 	struct dst_entry *dst;
2409 	struct flowi6 fl6 = {
2410 		.flowi6_oif = oif,
2411 		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2412 		.daddr = iph->daddr,
2413 		.saddr = iph->saddr,
2414 		.flowlabel = ip6_flowinfo(iph),
2415 		.flowi6_uid = uid,
2416 	};
2417 
2418 	dst = ip6_route_output(net, NULL, &fl6);
2419 	if (!dst->error)
2420 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2421 	dst_release(dst);
2422 }
2423 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2424 
2425 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2426 {
2427 	int oif = sk->sk_bound_dev_if;
2428 	struct dst_entry *dst;
2429 
2430 	if (!oif && skb->dev)
2431 		oif = l3mdev_master_ifindex(skb->dev);
2432 
2433 	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2434 
2435 	dst = __sk_dst_get(sk);
2436 	if (!dst || !dst->obsolete ||
2437 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2438 		return;
2439 
2440 	bh_lock_sock(sk);
2441 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2442 		ip6_datagram_dst_update(sk, false);
2443 	bh_unlock_sock(sk);
2444 }
2445 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2446 
2447 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2448 			   const struct flowi6 *fl6)
2449 {
2450 #ifdef CONFIG_IPV6_SUBTREES
2451 	struct ipv6_pinfo *np = inet6_sk(sk);
2452 #endif
2453 
2454 	ip6_dst_store(sk, dst,
2455 		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2456 		      &sk->sk_v6_daddr : NULL,
2457 #ifdef CONFIG_IPV6_SUBTREES
2458 		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2459 		      &np->saddr :
2460 #endif
2461 		      NULL);
2462 }
2463 
2464 static bool ip6_redirect_nh_match(const struct fib6_result *res,
2465 				  struct flowi6 *fl6,
2466 				  const struct in6_addr *gw,
2467 				  struct rt6_info **ret)
2468 {
2469 	const struct fib6_nh *nh = res->nh;
2470 
2471 	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
2472 	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
2473 		return false;
2474 
2475 	/* rt_cache's gateway might be different from its 'parent'
2476 	 * in the case of an ip redirect.
2477 	 * So we keep searching in the exception table if the gateway
2478 	 * is different.
2479 	 */
2480 	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
2481 		struct rt6_info *rt_cache;
2482 
2483 		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
2484 		if (rt_cache &&
2485 		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
2486 			*ret = rt_cache;
2487 			return true;
2488 		}
2489 		return false;
2490 	}
2491 	return true;
2492 }
2493 
2494 /* Handle redirects */
2495 struct ip6rd_flowi {
2496 	struct flowi6 fl6;
2497 	struct in6_addr gateway;
2498 };
2499 
2500 static struct rt6_info *__ip6_route_redirect(struct net *net,
2501 					     struct fib6_table *table,
2502 					     struct flowi6 *fl6,
2503 					     const struct sk_buff *skb,
2504 					     int flags)
2505 {
2506 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2507 	struct rt6_info *ret = NULL;
2508 	struct fib6_result res = {};
2509 	struct fib6_info *rt;
2510 	struct fib6_node *fn;
2511 
2512 	/* l3mdev_update_flow overrides oif if the device is enslaved; in
2513 	 * this case we must match on the real ingress device, so reset it
2514 	 */
2515 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
2516 		fl6->flowi6_oif = skb->dev->ifindex;
2517 
2518 	/* Get the "current" route for this destination and
2519 	 * check if the redirect has come from appropriate router.
2520 	 *
2521 	 * RFC 4861 specifies that redirects should only be
2522 	 * accepted if they come from the nexthop to the target.
2523 	 * Due to the way the routes are chosen, this notion
2524 	 * is a bit fuzzy and one might need to check all possible
2525 	 * routes.
2526 	 */
2527 
2528 	rcu_read_lock();
2529 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2530 restart:
2531 	for_each_fib6_node_rt_rcu(fn) {
2532 		res.f6i = rt;
2533 		res.nh = &rt->fib6_nh;
2534 
2535 		if (fib6_check_expired(rt))
2536 			continue;
2537 		if (rt->fib6_flags & RTF_REJECT)
2538 			break;
2539 		if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
2540 			goto out;
2541 	}
2542 
2543 	if (!rt)
2544 		rt = net->ipv6.fib6_null_entry;
2545 	else if (rt->fib6_flags & RTF_REJECT) {
2546 		ret = net->ipv6.ip6_null_entry;
2547 		goto out;
2548 	}
2549 
2550 	if (rt == net->ipv6.fib6_null_entry) {
2551 		fn = fib6_backtrack(fn, &fl6->saddr);
2552 		if (fn)
2553 			goto restart;
2554 	}
2555 
2556 	res.f6i = rt;
2557 	res.nh = &rt->fib6_nh;
2558 out:
2559 	if (ret) {
2560 		ip6_hold_safe(net, &ret);
2561 	} else {
2562 		res.fib6_flags = res.f6i->fib6_flags;
2563 		res.fib6_type = res.f6i->fib6_type;
2564 		ret = ip6_create_rt_rcu(&res);
2565 	}
2566 
2567 	rcu_read_unlock();
2568 
2569 	trace_fib6_table_lookup(net, &res, table, fl6);
2570 	return ret;
2571 };
2572 
2573 static struct dst_entry *ip6_route_redirect(struct net *net,
2574 					    const struct flowi6 *fl6,
2575 					    const struct sk_buff *skb,
2576 					    const struct in6_addr *gateway)
2577 {
2578 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2579 	struct ip6rd_flowi rdfl;
2580 
2581 	rdfl.fl6 = *fl6;
2582 	rdfl.gateway = *gateway;
2583 
2584 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2585 				flags, __ip6_route_redirect);
2586 }
2587 
2588 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2589 		  kuid_t uid)
2590 {
2591 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2592 	struct dst_entry *dst;
2593 	struct flowi6 fl6 = {
2594 		.flowi6_iif = LOOPBACK_IFINDEX,
2595 		.flowi6_oif = oif,
2596 		.flowi6_mark = mark,
2597 		.daddr = iph->daddr,
2598 		.saddr = iph->saddr,
2599 		.flowlabel = ip6_flowinfo(iph),
2600 		.flowi6_uid = uid,
2601 	};
2602 
2603 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2604 	rt6_do_redirect(dst, NULL, skb);
2605 	dst_release(dst);
2606 }
2607 EXPORT_SYMBOL_GPL(ip6_redirect);
2608 
2609 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2610 {
2611 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2612 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2613 	struct dst_entry *dst;
2614 	struct flowi6 fl6 = {
2615 		.flowi6_iif = LOOPBACK_IFINDEX,
2616 		.flowi6_oif = oif,
2617 		.daddr = msg->dest,
2618 		.saddr = iph->daddr,
2619 		.flowi6_uid = sock_net_uid(net, NULL),
2620 	};
2621 
2622 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2623 	rt6_do_redirect(dst, NULL, skb);
2624 	dst_release(dst);
2625 }
2626 
2627 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2628 {
2629 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2630 		     sk->sk_uid);
2631 }
2632 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2633 
2634 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2635 {
2636 	struct net_device *dev = dst->dev;
2637 	unsigned int mtu = dst_mtu(dst);
2638 	struct net *net = dev_net(dev);
2639 
2640 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2641 
2642 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2643 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2644 
2645 	/*
2646 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2647 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2648 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2649 	 * rely only on pmtu discovery"
2650 	 */
2651 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2652 		mtu = IPV6_MAXPLEN;
2653 	return mtu;
2654 }
2655 
2656 static unsigned int ip6_mtu(const struct dst_entry *dst)
2657 {
2658 	struct inet6_dev *idev;
2659 	unsigned int mtu;
2660 
2661 	mtu = dst_metric_raw(dst, RTAX_MTU);
2662 	if (mtu)
2663 		goto out;
2664 
2665 	mtu = IPV6_MIN_MTU;
2666 
2667 	rcu_read_lock();
2668 	idev = __in6_dev_get(dst->dev);
2669 	if (idev)
2670 		mtu = idev->cnf.mtu6;
2671 	rcu_read_unlock();
2672 
2673 out:
2674 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2675 
2676 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2677 }
2678 
2679 /* MTU selection:
2680  * 1. mtu on route is locked - use it
2681  * 2. mtu from nexthop exception
2682  * 3. mtu from egress device
2683  *
2684  * based on ip6_dst_mtu_forward and exception logic of
2685  * rt6_find_cached_rt; called with rcu_read_lock
2686  */
2687 u32 ip6_mtu_from_fib6(const struct fib6_result *res,
2688 		      const struct in6_addr *daddr,
2689 		      const struct in6_addr *saddr)
2690 {
2691 	const struct fib6_nh *nh = res->nh;
2692 	struct fib6_info *f6i = res->f6i;
2693 	struct inet6_dev *idev;
2694 	struct rt6_info *rt;
2695 	u32 mtu = 0;
2696 
2697 	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2698 		mtu = f6i->fib6_pmtu;
2699 		if (mtu)
2700 			goto out;
2701 	}
2702 
2703 	rt = rt6_find_cached_rt(res, daddr, saddr);
2704 	if (unlikely(rt)) {
2705 		mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
2706 	} else {
2707 		struct net_device *dev = nh->fib_nh_dev;
2708 
2709 		mtu = IPV6_MIN_MTU;
2710 		idev = __in6_dev_get(dev);
2711 		if (idev && idev->cnf.mtu6 > mtu)
2712 			mtu = idev->cnf.mtu6;
2713 	}
2714 
2715 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2716 out:
2717 	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
2718 }
2719 
2720 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2721 				  struct flowi6 *fl6)
2722 {
2723 	struct dst_entry *dst;
2724 	struct rt6_info *rt;
2725 	struct inet6_dev *idev = in6_dev_get(dev);
2726 	struct net *net = dev_net(dev);
2727 
2728 	if (unlikely(!idev))
2729 		return ERR_PTR(-ENODEV);
2730 
2731 	rt = ip6_dst_alloc(net, dev, 0);
2732 	if (unlikely(!rt)) {
2733 		in6_dev_put(idev);
2734 		dst = ERR_PTR(-ENOMEM);
2735 		goto out;
2736 	}
2737 
2738 	rt->dst.flags |= DST_HOST;
2739 	rt->dst.input = ip6_input;
2740 	rt->dst.output  = ip6_output;
2741 	rt->rt6i_gateway  = fl6->daddr;
2742 	rt->rt6i_dst.addr = fl6->daddr;
2743 	rt->rt6i_dst.plen = 128;
2744 	rt->rt6i_idev     = idev;
2745 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2746 
2747 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2748 	 * do proper release of the net_device
2749 	 */
2750 	rt6_uncached_list_add(rt);
2751 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2752 
2753 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2754 
2755 out:
2756 	return dst;
2757 }
2758 
2759 static int ip6_dst_gc(struct dst_ops *ops)
2760 {
2761 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2762 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2763 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2764 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2765 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2766 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2767 	int entries;
2768 
2769 	entries = dst_entries_get_fast(ops);
2770 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2771 	    entries <= rt_max_size)
2772 		goto out;
2773 
2774 	net->ipv6.ip6_rt_gc_expire++;
2775 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2776 	entries = dst_entries_get_slow(ops);
2777 	if (entries < ops->gc_thresh)
2778 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2779 out:
2780 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2781 	return entries > rt_max_size;
2782 }
2783 
2784 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2785 					    struct fib6_config *cfg,
2786 					    const struct in6_addr *gw_addr,
2787 					    u32 tbid, int flags)
2788 {
2789 	struct flowi6 fl6 = {
2790 		.flowi6_oif = cfg->fc_ifindex,
2791 		.daddr = *gw_addr,
2792 		.saddr = cfg->fc_prefsrc,
2793 	};
2794 	struct fib6_table *table;
2795 	struct rt6_info *rt;
2796 
2797 	table = fib6_get_table(net, tbid);
2798 	if (!table)
2799 		return NULL;
2800 
2801 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2802 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2803 
2804 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2805 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2806 
2807 	/* if table lookup failed, fall back to full lookup */
2808 	if (rt == net->ipv6.ip6_null_entry) {
2809 		ip6_rt_put(rt);
2810 		rt = NULL;
2811 	}
2812 
2813 	return rt;
2814 }
2815 
2816 static int ip6_route_check_nh_onlink(struct net *net,
2817 				     struct fib6_config *cfg,
2818 				     const struct net_device *dev,
2819 				     struct netlink_ext_ack *extack)
2820 {
2821 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2822 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2823 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2824 	struct fib6_info *from;
2825 	struct rt6_info *grt;
2826 	int err;
2827 
2828 	err = 0;
2829 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2830 	if (grt) {
2831 		rcu_read_lock();
2832 		from = rcu_dereference(grt->from);
2833 		if (!grt->dst.error &&
2834 		    /* ignore match if it is the default route */
2835 		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2836 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2837 			NL_SET_ERR_MSG(extack,
2838 				       "Nexthop has invalid gateway or device mismatch");
2839 			err = -EINVAL;
2840 		}
2841 		rcu_read_unlock();
2842 
2843 		ip6_rt_put(grt);
2844 	}
2845 
2846 	return err;
2847 }
2848 
2849 static int ip6_route_check_nh(struct net *net,
2850 			      struct fib6_config *cfg,
2851 			      struct net_device **_dev,
2852 			      struct inet6_dev **idev)
2853 {
2854 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2855 	struct net_device *dev = _dev ? *_dev : NULL;
2856 	struct rt6_info *grt = NULL;
2857 	int err = -EHOSTUNREACH;
2858 
2859 	if (cfg->fc_table) {
2860 		int flags = RT6_LOOKUP_F_IFACE;
2861 
2862 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2863 					  cfg->fc_table, flags);
2864 		if (grt) {
2865 			if (grt->rt6i_flags & RTF_GATEWAY ||
2866 			    (dev && dev != grt->dst.dev)) {
2867 				ip6_rt_put(grt);
2868 				grt = NULL;
2869 			}
2870 		}
2871 	}
2872 
2873 	if (!grt)
2874 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2875 
2876 	if (!grt)
2877 		goto out;
2878 
2879 	if (dev) {
2880 		if (dev != grt->dst.dev) {
2881 			ip6_rt_put(grt);
2882 			goto out;
2883 		}
2884 	} else {
2885 		*_dev = dev = grt->dst.dev;
2886 		*idev = grt->rt6i_idev;
2887 		dev_hold(dev);
2888 		in6_dev_hold(grt->rt6i_idev);
2889 	}
2890 
2891 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2892 		err = 0;
2893 
2894 	ip6_rt_put(grt);
2895 
2896 out:
2897 	return err;
2898 }
2899 
2900 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2901 			   struct net_device **_dev, struct inet6_dev **idev,
2902 			   struct netlink_ext_ack *extack)
2903 {
2904 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2905 	int gwa_type = ipv6_addr_type(gw_addr);
2906 	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2907 	const struct net_device *dev = *_dev;
2908 	bool need_addr_check = !dev;
2909 	int err = -EINVAL;
2910 
2911 	/* if gw_addr is local we will fail to detect this in case
2912 	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2913 	 * will return already-added prefix route via interface that
2914 	 * prefix route was assigned to, which might be non-loopback.
2915 	 */
2916 	if (dev &&
2917 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2918 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2919 		goto out;
2920 	}
2921 
2922 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2923 		/* IPv6 strictly inhibits using not link-local
2924 		 * addresses as nexthop address.
2925 		 * Otherwise, router will not able to send redirects.
2926 		 * It is very good, but in some (rare!) circumstances
2927 		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2928 		 * some exceptions. --ANK
2929 		 * We allow IPv4-mapped nexthops to support RFC4798-type
2930 		 * addressing
2931 		 */
2932 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2933 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2934 			goto out;
2935 		}
2936 
2937 		if (cfg->fc_flags & RTNH_F_ONLINK)
2938 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2939 		else
2940 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2941 
2942 		if (err)
2943 			goto out;
2944 	}
2945 
2946 	/* reload in case device was changed */
2947 	dev = *_dev;
2948 
2949 	err = -EINVAL;
2950 	if (!dev) {
2951 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2952 		goto out;
2953 	} else if (dev->flags & IFF_LOOPBACK) {
2954 		NL_SET_ERR_MSG(extack,
2955 			       "Egress device can not be loopback device for this route");
2956 		goto out;
2957 	}
2958 
2959 	/* if we did not check gw_addr above, do so now that the
2960 	 * egress device has been resolved.
2961 	 */
2962 	if (need_addr_check &&
2963 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2964 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2965 		goto out;
2966 	}
2967 
2968 	err = 0;
2969 out:
2970 	return err;
2971 }
2972 
2973 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2974 {
2975 	if ((flags & RTF_REJECT) ||
2976 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2977 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2978 	     !(flags & RTF_LOCAL)))
2979 		return true;
2980 
2981 	return false;
2982 }
2983 
2984 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
2985 		 struct fib6_config *cfg, gfp_t gfp_flags,
2986 		 struct netlink_ext_ack *extack)
2987 {
2988 	struct net_device *dev = NULL;
2989 	struct inet6_dev *idev = NULL;
2990 	int addr_type;
2991 	int err;
2992 
2993 	fib6_nh->fib_nh_family = AF_INET6;
2994 
2995 	err = -ENODEV;
2996 	if (cfg->fc_ifindex) {
2997 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2998 		if (!dev)
2999 			goto out;
3000 		idev = in6_dev_get(dev);
3001 		if (!idev)
3002 			goto out;
3003 	}
3004 
3005 	if (cfg->fc_flags & RTNH_F_ONLINK) {
3006 		if (!dev) {
3007 			NL_SET_ERR_MSG(extack,
3008 				       "Nexthop device required for onlink");
3009 			goto out;
3010 		}
3011 
3012 		if (!(dev->flags & IFF_UP)) {
3013 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3014 			err = -ENETDOWN;
3015 			goto out;
3016 		}
3017 
3018 		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
3019 	}
3020 
3021 	fib6_nh->fib_nh_weight = 1;
3022 
3023 	/* We cannot add true routes via loopback here,
3024 	 * they would result in kernel looping; promote them to reject routes
3025 	 */
3026 	addr_type = ipv6_addr_type(&cfg->fc_dst);
3027 	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
3028 		/* hold loopback dev/idev if we haven't done so. */
3029 		if (dev != net->loopback_dev) {
3030 			if (dev) {
3031 				dev_put(dev);
3032 				in6_dev_put(idev);
3033 			}
3034 			dev = net->loopback_dev;
3035 			dev_hold(dev);
3036 			idev = in6_dev_get(dev);
3037 			if (!idev) {
3038 				err = -ENODEV;
3039 				goto out;
3040 			}
3041 		}
3042 		goto set_dev;
3043 	}
3044 
3045 	if (cfg->fc_flags & RTF_GATEWAY) {
3046 		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3047 		if (err)
3048 			goto out;
3049 
3050 		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3051 		fib6_nh->fib_nh_gw_family = AF_INET6;
3052 	}
3053 
3054 	err = -ENODEV;
3055 	if (!dev)
3056 		goto out;
3057 
3058 	if (idev->cnf.disable_ipv6) {
3059 		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3060 		err = -EACCES;
3061 		goto out;
3062 	}
3063 
3064 	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3065 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3066 		err = -ENETDOWN;
3067 		goto out;
3068 	}
3069 
3070 	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3071 	    !netif_carrier_ok(dev))
3072 		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3073 
3074 	err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
3075 				 cfg->fc_encap_type, cfg, gfp_flags, extack);
3076 	if (err)
3077 		goto out;
3078 set_dev:
3079 	fib6_nh->fib_nh_dev = dev;
3080 	fib6_nh->fib_nh_oif = dev->ifindex;
3081 	err = 0;
3082 out:
3083 	if (idev)
3084 		in6_dev_put(idev);
3085 
3086 	if (err) {
3087 		lwtstate_put(fib6_nh->fib_nh_lws);
3088 		fib6_nh->fib_nh_lws = NULL;
3089 		if (dev)
3090 			dev_put(dev);
3091 	}
3092 
3093 	return err;
3094 }
3095 
3096 void fib6_nh_release(struct fib6_nh *fib6_nh)
3097 {
3098 	fib_nh_common_release(&fib6_nh->nh_common);
3099 }
3100 
3101 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3102 					      gfp_t gfp_flags,
3103 					      struct netlink_ext_ack *extack)
3104 {
3105 	struct net *net = cfg->fc_nlinfo.nl_net;
3106 	struct fib6_info *rt = NULL;
3107 	struct fib6_table *table;
3108 	int err = -EINVAL;
3109 	int addr_type;
3110 
3111 	/* RTF_PCPU is an internal flag; can not be set by userspace */
3112 	if (cfg->fc_flags & RTF_PCPU) {
3113 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3114 		goto out;
3115 	}
3116 
3117 	/* RTF_CACHE is an internal flag; can not be set by userspace */
3118 	if (cfg->fc_flags & RTF_CACHE) {
3119 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3120 		goto out;
3121 	}
3122 
3123 	if (cfg->fc_type > RTN_MAX) {
3124 		NL_SET_ERR_MSG(extack, "Invalid route type");
3125 		goto out;
3126 	}
3127 
3128 	if (cfg->fc_dst_len > 128) {
3129 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
3130 		goto out;
3131 	}
3132 	if (cfg->fc_src_len > 128) {
3133 		NL_SET_ERR_MSG(extack, "Invalid source address length");
3134 		goto out;
3135 	}
3136 #ifndef CONFIG_IPV6_SUBTREES
3137 	if (cfg->fc_src_len) {
3138 		NL_SET_ERR_MSG(extack,
3139 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
3140 		goto out;
3141 	}
3142 #endif
3143 
3144 	err = -ENOBUFS;
3145 	if (cfg->fc_nlinfo.nlh &&
3146 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3147 		table = fib6_get_table(net, cfg->fc_table);
3148 		if (!table) {
3149 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3150 			table = fib6_new_table(net, cfg->fc_table);
3151 		}
3152 	} else {
3153 		table = fib6_new_table(net, cfg->fc_table);
3154 	}
3155 
3156 	if (!table)
3157 		goto out;
3158 
3159 	err = -ENOMEM;
3160 	rt = fib6_info_alloc(gfp_flags);
3161 	if (!rt)
3162 		goto out;
3163 
3164 	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3165 					       extack);
3166 	if (IS_ERR(rt->fib6_metrics)) {
3167 		err = PTR_ERR(rt->fib6_metrics);
3168 		/* Do not leave garbage there. */
3169 		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3170 		goto out;
3171 	}
3172 
3173 	if (cfg->fc_flags & RTF_ADDRCONF)
3174 		rt->dst_nocount = true;
3175 
3176 	if (cfg->fc_flags & RTF_EXPIRES)
3177 		fib6_set_expires(rt, jiffies +
3178 				clock_t_to_jiffies(cfg->fc_expires));
3179 	else
3180 		fib6_clean_expires(rt);
3181 
3182 	if (cfg->fc_protocol == RTPROT_UNSPEC)
3183 		cfg->fc_protocol = RTPROT_BOOT;
3184 	rt->fib6_protocol = cfg->fc_protocol;
3185 
3186 	rt->fib6_table = table;
3187 	rt->fib6_metric = cfg->fc_metric;
3188 	rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
3189 	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3190 
3191 	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3192 	rt->fib6_dst.plen = cfg->fc_dst_len;
3193 	if (rt->fib6_dst.plen == 128)
3194 		rt->dst_host = true;
3195 
3196 #ifdef CONFIG_IPV6_SUBTREES
3197 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3198 	rt->fib6_src.plen = cfg->fc_src_len;
3199 #endif
3200 	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
3201 	if (err)
3202 		goto out;
3203 
3204 	/* We cannot add true routes via loopback here,
3205 	 * they would result in kernel looping; promote them to reject routes
3206 	 */
3207 	addr_type = ipv6_addr_type(&cfg->fc_dst);
3208 	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
3209 		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3210 
3211 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3212 		struct net_device *dev = fib6_info_nh_dev(rt);
3213 
3214 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3215 			NL_SET_ERR_MSG(extack, "Invalid source address");
3216 			err = -EINVAL;
3217 			goto out;
3218 		}
3219 		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3220 		rt->fib6_prefsrc.plen = 128;
3221 	} else
3222 		rt->fib6_prefsrc.plen = 0;
3223 
3224 	return rt;
3225 out:
3226 	fib6_info_release(rt);
3227 	return ERR_PTR(err);
3228 }
3229 
3230 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3231 		  struct netlink_ext_ack *extack)
3232 {
3233 	struct fib6_info *rt;
3234 	int err;
3235 
3236 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3237 	if (IS_ERR(rt))
3238 		return PTR_ERR(rt);
3239 
3240 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3241 	fib6_info_release(rt);
3242 
3243 	return err;
3244 }
3245 
3246 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3247 {
3248 	struct net *net = info->nl_net;
3249 	struct fib6_table *table;
3250 	int err;
3251 
3252 	if (rt == net->ipv6.fib6_null_entry) {
3253 		err = -ENOENT;
3254 		goto out;
3255 	}
3256 
3257 	table = rt->fib6_table;
3258 	spin_lock_bh(&table->tb6_lock);
3259 	err = fib6_del(rt, info);
3260 	spin_unlock_bh(&table->tb6_lock);
3261 
3262 out:
3263 	fib6_info_release(rt);
3264 	return err;
3265 }
3266 
3267 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3268 {
3269 	struct nl_info info = { .nl_net = net };
3270 
3271 	return __ip6_del_rt(rt, &info);
3272 }
3273 
3274 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3275 {
3276 	struct nl_info *info = &cfg->fc_nlinfo;
3277 	struct net *net = info->nl_net;
3278 	struct sk_buff *skb = NULL;
3279 	struct fib6_table *table;
3280 	int err = -ENOENT;
3281 
3282 	if (rt == net->ipv6.fib6_null_entry)
3283 		goto out_put;
3284 	table = rt->fib6_table;
3285 	spin_lock_bh(&table->tb6_lock);
3286 
3287 	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3288 		struct fib6_info *sibling, *next_sibling;
3289 
3290 		/* prefer to send a single notification with all hops */
3291 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3292 		if (skb) {
3293 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3294 
3295 			if (rt6_fill_node(net, skb, rt, NULL,
3296 					  NULL, NULL, 0, RTM_DELROUTE,
3297 					  info->portid, seq, 0) < 0) {
3298 				kfree_skb(skb);
3299 				skb = NULL;
3300 			} else
3301 				info->skip_notify = 1;
3302 		}
3303 
3304 		list_for_each_entry_safe(sibling, next_sibling,
3305 					 &rt->fib6_siblings,
3306 					 fib6_siblings) {
3307 			err = fib6_del(sibling, info);
3308 			if (err)
3309 				goto out_unlock;
3310 		}
3311 	}
3312 
3313 	err = fib6_del(rt, info);
3314 out_unlock:
3315 	spin_unlock_bh(&table->tb6_lock);
3316 out_put:
3317 	fib6_info_release(rt);
3318 
3319 	if (skb) {
3320 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3321 			    info->nlh, gfp_any());
3322 	}
3323 	return err;
3324 }
3325 
3326 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3327 {
3328 	int rc = -ESRCH;
3329 
3330 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3331 		goto out;
3332 
3333 	if (cfg->fc_flags & RTF_GATEWAY &&
3334 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3335 		goto out;
3336 
3337 	rc = rt6_remove_exception_rt(rt);
3338 out:
3339 	return rc;
3340 }
3341 
3342 static int ip6_route_del(struct fib6_config *cfg,
3343 			 struct netlink_ext_ack *extack)
3344 {
3345 	struct rt6_info *rt_cache;
3346 	struct fib6_table *table;
3347 	struct fib6_info *rt;
3348 	struct fib6_node *fn;
3349 	int err = -ESRCH;
3350 
3351 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3352 	if (!table) {
3353 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3354 		return err;
3355 	}
3356 
3357 	rcu_read_lock();
3358 
3359 	fn = fib6_locate(&table->tb6_root,
3360 			 &cfg->fc_dst, cfg->fc_dst_len,
3361 			 &cfg->fc_src, cfg->fc_src_len,
3362 			 !(cfg->fc_flags & RTF_CACHE));
3363 
3364 	if (fn) {
3365 		for_each_fib6_node_rt_rcu(fn) {
3366 			struct fib6_nh *nh;
3367 
3368 			if (cfg->fc_flags & RTF_CACHE) {
3369 				struct fib6_result res = {
3370 					.f6i = rt,
3371 				};
3372 				int rc;
3373 
3374 				rt_cache = rt6_find_cached_rt(&res,
3375 							      &cfg->fc_dst,
3376 							      &cfg->fc_src);
3377 				if (rt_cache) {
3378 					rc = ip6_del_cached_rt(rt_cache, cfg);
3379 					if (rc != -ESRCH) {
3380 						rcu_read_unlock();
3381 						return rc;
3382 					}
3383 				}
3384 				continue;
3385 			}
3386 
3387 			nh = &rt->fib6_nh;
3388 			if (cfg->fc_ifindex &&
3389 			    (!nh->fib_nh_dev ||
3390 			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3391 				continue;
3392 			if (cfg->fc_flags & RTF_GATEWAY &&
3393 			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
3394 				continue;
3395 			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3396 				continue;
3397 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3398 				continue;
3399 			if (!fib6_info_hold_safe(rt))
3400 				continue;
3401 			rcu_read_unlock();
3402 
3403 			/* if gateway was specified only delete the one hop */
3404 			if (cfg->fc_flags & RTF_GATEWAY)
3405 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3406 
3407 			return __ip6_del_rt_siblings(rt, cfg);
3408 		}
3409 	}
3410 	rcu_read_unlock();
3411 
3412 	return err;
3413 }
3414 
3415 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3416 {
3417 	struct netevent_redirect netevent;
3418 	struct rt6_info *rt, *nrt = NULL;
3419 	struct fib6_result res = {};
3420 	struct ndisc_options ndopts;
3421 	struct inet6_dev *in6_dev;
3422 	struct neighbour *neigh;
3423 	struct rd_msg *msg;
3424 	int optlen, on_link;
3425 	u8 *lladdr;
3426 
3427 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3428 	optlen -= sizeof(*msg);
3429 
3430 	if (optlen < 0) {
3431 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3432 		return;
3433 	}
3434 
3435 	msg = (struct rd_msg *)icmp6_hdr(skb);
3436 
3437 	if (ipv6_addr_is_multicast(&msg->dest)) {
3438 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3439 		return;
3440 	}
3441 
3442 	on_link = 0;
3443 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3444 		on_link = 1;
3445 	} else if (ipv6_addr_type(&msg->target) !=
3446 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3447 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3448 		return;
3449 	}
3450 
3451 	in6_dev = __in6_dev_get(skb->dev);
3452 	if (!in6_dev)
3453 		return;
3454 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3455 		return;
3456 
3457 	/* RFC2461 8.1:
3458 	 *	The IP source address of the Redirect MUST be the same as the current
3459 	 *	first-hop router for the specified ICMP Destination Address.
3460 	 */
3461 
3462 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3463 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3464 		return;
3465 	}
3466 
3467 	lladdr = NULL;
3468 	if (ndopts.nd_opts_tgt_lladdr) {
3469 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3470 					     skb->dev);
3471 		if (!lladdr) {
3472 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3473 			return;
3474 		}
3475 	}
3476 
3477 	rt = (struct rt6_info *) dst;
3478 	if (rt->rt6i_flags & RTF_REJECT) {
3479 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3480 		return;
3481 	}
3482 
3483 	/* Redirect received -> path was valid.
3484 	 * Look, redirects are sent only in response to data packets,
3485 	 * so that this nexthop apparently is reachable. --ANK
3486 	 */
3487 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3488 
3489 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3490 	if (!neigh)
3491 		return;
3492 
3493 	/*
3494 	 *	We have finally decided to accept it.
3495 	 */
3496 
3497 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3498 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3499 		     NEIGH_UPDATE_F_OVERRIDE|
3500 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3501 				     NEIGH_UPDATE_F_ISROUTER)),
3502 		     NDISC_REDIRECT, &ndopts);
3503 
3504 	rcu_read_lock();
3505 	res.f6i = rcu_dereference(rt->from);
3506 	if (!res.f6i)
3507 		goto out;
3508 
3509 	res.nh = &res.f6i->fib6_nh;
3510 	res.fib6_flags = res.f6i->fib6_flags;
3511 	res.fib6_type = res.f6i->fib6_type;
3512 	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
3513 	if (!nrt)
3514 		goto out;
3515 
3516 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3517 	if (on_link)
3518 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3519 
3520 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3521 
3522 	/* rt6_insert_exception() will take care of duplicated exceptions */
3523 	if (rt6_insert_exception(nrt, &res)) {
3524 		dst_release_immediate(&nrt->dst);
3525 		goto out;
3526 	}
3527 
3528 	netevent.old = &rt->dst;
3529 	netevent.new = &nrt->dst;
3530 	netevent.daddr = &msg->dest;
3531 	netevent.neigh = neigh;
3532 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3533 
3534 out:
3535 	rcu_read_unlock();
3536 	neigh_release(neigh);
3537 }
3538 
3539 #ifdef CONFIG_IPV6_ROUTE_INFO
3540 static struct fib6_info *rt6_get_route_info(struct net *net,
3541 					   const struct in6_addr *prefix, int prefixlen,
3542 					   const struct in6_addr *gwaddr,
3543 					   struct net_device *dev)
3544 {
3545 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3546 	int ifindex = dev->ifindex;
3547 	struct fib6_node *fn;
3548 	struct fib6_info *rt = NULL;
3549 	struct fib6_table *table;
3550 
3551 	table = fib6_get_table(net, tb_id);
3552 	if (!table)
3553 		return NULL;
3554 
3555 	rcu_read_lock();
3556 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3557 	if (!fn)
3558 		goto out;
3559 
3560 	for_each_fib6_node_rt_rcu(fn) {
3561 		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
3562 			continue;
3563 		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
3564 		    !rt->fib6_nh.fib_nh_gw_family)
3565 			continue;
3566 		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
3567 			continue;
3568 		if (!fib6_info_hold_safe(rt))
3569 			continue;
3570 		break;
3571 	}
3572 out:
3573 	rcu_read_unlock();
3574 	return rt;
3575 }
3576 
3577 static struct fib6_info *rt6_add_route_info(struct net *net,
3578 					   const struct in6_addr *prefix, int prefixlen,
3579 					   const struct in6_addr *gwaddr,
3580 					   struct net_device *dev,
3581 					   unsigned int pref)
3582 {
3583 	struct fib6_config cfg = {
3584 		.fc_metric	= IP6_RT_PRIO_USER,
3585 		.fc_ifindex	= dev->ifindex,
3586 		.fc_dst_len	= prefixlen,
3587 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3588 				  RTF_UP | RTF_PREF(pref),
3589 		.fc_protocol = RTPROT_RA,
3590 		.fc_type = RTN_UNICAST,
3591 		.fc_nlinfo.portid = 0,
3592 		.fc_nlinfo.nlh = NULL,
3593 		.fc_nlinfo.nl_net = net,
3594 	};
3595 
3596 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3597 	cfg.fc_dst = *prefix;
3598 	cfg.fc_gateway = *gwaddr;
3599 
3600 	/* We should treat it as a default route if prefix length is 0. */
3601 	if (!prefixlen)
3602 		cfg.fc_flags |= RTF_DEFAULT;
3603 
3604 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3605 
3606 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3607 }
3608 #endif
3609 
3610 struct fib6_info *rt6_get_dflt_router(struct net *net,
3611 				     const struct in6_addr *addr,
3612 				     struct net_device *dev)
3613 {
3614 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3615 	struct fib6_info *rt;
3616 	struct fib6_table *table;
3617 
3618 	table = fib6_get_table(net, tb_id);
3619 	if (!table)
3620 		return NULL;
3621 
3622 	rcu_read_lock();
3623 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3624 		struct fib6_nh *nh = &rt->fib6_nh;
3625 
3626 		if (dev == nh->fib_nh_dev &&
3627 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3628 		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
3629 			break;
3630 	}
3631 	if (rt && !fib6_info_hold_safe(rt))
3632 		rt = NULL;
3633 	rcu_read_unlock();
3634 	return rt;
3635 }
3636 
3637 struct fib6_info *rt6_add_dflt_router(struct net *net,
3638 				     const struct in6_addr *gwaddr,
3639 				     struct net_device *dev,
3640 				     unsigned int pref)
3641 {
3642 	struct fib6_config cfg = {
3643 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3644 		.fc_metric	= IP6_RT_PRIO_USER,
3645 		.fc_ifindex	= dev->ifindex,
3646 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3647 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3648 		.fc_protocol = RTPROT_RA,
3649 		.fc_type = RTN_UNICAST,
3650 		.fc_nlinfo.portid = 0,
3651 		.fc_nlinfo.nlh = NULL,
3652 		.fc_nlinfo.nl_net = net,
3653 	};
3654 
3655 	cfg.fc_gateway = *gwaddr;
3656 
3657 	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3658 		struct fib6_table *table;
3659 
3660 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3661 		if (table)
3662 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3663 	}
3664 
3665 	return rt6_get_dflt_router(net, gwaddr, dev);
3666 }
3667 
3668 static void __rt6_purge_dflt_routers(struct net *net,
3669 				     struct fib6_table *table)
3670 {
3671 	struct fib6_info *rt;
3672 
3673 restart:
3674 	rcu_read_lock();
3675 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3676 		struct net_device *dev = fib6_info_nh_dev(rt);
3677 		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3678 
3679 		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3680 		    (!idev || idev->cnf.accept_ra != 2) &&
3681 		    fib6_info_hold_safe(rt)) {
3682 			rcu_read_unlock();
3683 			ip6_del_rt(net, rt);
3684 			goto restart;
3685 		}
3686 	}
3687 	rcu_read_unlock();
3688 
3689 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3690 }
3691 
3692 void rt6_purge_dflt_routers(struct net *net)
3693 {
3694 	struct fib6_table *table;
3695 	struct hlist_head *head;
3696 	unsigned int h;
3697 
3698 	rcu_read_lock();
3699 
3700 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3701 		head = &net->ipv6.fib_table_hash[h];
3702 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3703 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3704 				__rt6_purge_dflt_routers(net, table);
3705 		}
3706 	}
3707 
3708 	rcu_read_unlock();
3709 }
3710 
3711 static void rtmsg_to_fib6_config(struct net *net,
3712 				 struct in6_rtmsg *rtmsg,
3713 				 struct fib6_config *cfg)
3714 {
3715 	*cfg = (struct fib6_config){
3716 		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3717 			 : RT6_TABLE_MAIN,
3718 		.fc_ifindex = rtmsg->rtmsg_ifindex,
3719 		.fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3720 		.fc_expires = rtmsg->rtmsg_info,
3721 		.fc_dst_len = rtmsg->rtmsg_dst_len,
3722 		.fc_src_len = rtmsg->rtmsg_src_len,
3723 		.fc_flags = rtmsg->rtmsg_flags,
3724 		.fc_type = rtmsg->rtmsg_type,
3725 
3726 		.fc_nlinfo.nl_net = net,
3727 
3728 		.fc_dst = rtmsg->rtmsg_dst,
3729 		.fc_src = rtmsg->rtmsg_src,
3730 		.fc_gateway = rtmsg->rtmsg_gateway,
3731 	};
3732 }
3733 
3734 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3735 {
3736 	struct fib6_config cfg;
3737 	struct in6_rtmsg rtmsg;
3738 	int err;
3739 
3740 	switch (cmd) {
3741 	case SIOCADDRT:		/* Add a route */
3742 	case SIOCDELRT:		/* Delete a route */
3743 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3744 			return -EPERM;
3745 		err = copy_from_user(&rtmsg, arg,
3746 				     sizeof(struct in6_rtmsg));
3747 		if (err)
3748 			return -EFAULT;
3749 
3750 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3751 
3752 		rtnl_lock();
3753 		switch (cmd) {
3754 		case SIOCADDRT:
3755 			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3756 			break;
3757 		case SIOCDELRT:
3758 			err = ip6_route_del(&cfg, NULL);
3759 			break;
3760 		default:
3761 			err = -EINVAL;
3762 		}
3763 		rtnl_unlock();
3764 
3765 		return err;
3766 	}
3767 
3768 	return -EINVAL;
3769 }
3770 
3771 /*
3772  *	Drop the packet on the floor
3773  */
3774 
3775 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3776 {
3777 	struct dst_entry *dst = skb_dst(skb);
3778 	struct net *net = dev_net(dst->dev);
3779 	struct inet6_dev *idev;
3780 	int type;
3781 
3782 	if (netif_is_l3_master(skb->dev) &&
3783 	    dst->dev == net->loopback_dev)
3784 		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
3785 	else
3786 		idev = ip6_dst_idev(dst);
3787 
3788 	switch (ipstats_mib_noroutes) {
3789 	case IPSTATS_MIB_INNOROUTES:
3790 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3791 		if (type == IPV6_ADDR_ANY) {
3792 			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
3793 			break;
3794 		}
3795 		/* FALLTHROUGH */
3796 	case IPSTATS_MIB_OUTNOROUTES:
3797 		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
3798 		break;
3799 	}
3800 
3801 	/* Start over by dropping the dst for l3mdev case */
3802 	if (netif_is_l3_master(skb->dev))
3803 		skb_dst_drop(skb);
3804 
3805 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3806 	kfree_skb(skb);
3807 	return 0;
3808 }
3809 
3810 static int ip6_pkt_discard(struct sk_buff *skb)
3811 {
3812 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3813 }
3814 
3815 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3816 {
3817 	skb->dev = skb_dst(skb)->dev;
3818 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3819 }
3820 
3821 static int ip6_pkt_prohibit(struct sk_buff *skb)
3822 {
3823 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3824 }
3825 
3826 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3827 {
3828 	skb->dev = skb_dst(skb)->dev;
3829 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3830 }
3831 
3832 /*
3833  *	Allocate a dst for local (unicast / anycast) address.
3834  */
3835 
3836 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3837 				     struct inet6_dev *idev,
3838 				     const struct in6_addr *addr,
3839 				     bool anycast, gfp_t gfp_flags)
3840 {
3841 	struct fib6_config cfg = {
3842 		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3843 		.fc_ifindex = idev->dev->ifindex,
3844 		.fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3845 		.fc_dst = *addr,
3846 		.fc_dst_len = 128,
3847 		.fc_protocol = RTPROT_KERNEL,
3848 		.fc_nlinfo.nl_net = net,
3849 		.fc_ignore_dev_down = true,
3850 	};
3851 
3852 	if (anycast) {
3853 		cfg.fc_type = RTN_ANYCAST;
3854 		cfg.fc_flags |= RTF_ANYCAST;
3855 	} else {
3856 		cfg.fc_type = RTN_LOCAL;
3857 		cfg.fc_flags |= RTF_LOCAL;
3858 	}
3859 
3860 	return ip6_route_info_create(&cfg, gfp_flags, NULL);
3861 }
3862 
3863 /* remove deleted ip from prefsrc entries */
3864 struct arg_dev_net_ip {
3865 	struct net_device *dev;
3866 	struct net *net;
3867 	struct in6_addr *addr;
3868 };
3869 
3870 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3871 {
3872 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3873 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3874 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3875 
3876 	if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3877 	    rt != net->ipv6.fib6_null_entry &&
3878 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3879 		spin_lock_bh(&rt6_exception_lock);
3880 		/* remove prefsrc entry */
3881 		rt->fib6_prefsrc.plen = 0;
3882 		spin_unlock_bh(&rt6_exception_lock);
3883 	}
3884 	return 0;
3885 }
3886 
3887 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3888 {
3889 	struct net *net = dev_net(ifp->idev->dev);
3890 	struct arg_dev_net_ip adni = {
3891 		.dev = ifp->idev->dev,
3892 		.net = net,
3893 		.addr = &ifp->addr,
3894 	};
3895 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3896 }
3897 
3898 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)
3899 
3900 /* Remove routers and update dst entries when gateway turn into host. */
3901 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3902 {
3903 	struct in6_addr *gateway = (struct in6_addr *)arg;
3904 
3905 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3906 	    rt->fib6_nh.fib_nh_gw_family &&
3907 	    ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3908 		return -1;
3909 	}
3910 
3911 	/* Further clean up cached routes in exception table.
3912 	 * This is needed because cached route may have a different
3913 	 * gateway than its 'parent' in the case of an ip redirect.
3914 	 */
3915 	rt6_exceptions_clean_tohost(rt, gateway);
3916 
3917 	return 0;
3918 }
3919 
3920 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3921 {
3922 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3923 }
3924 
3925 struct arg_netdev_event {
3926 	const struct net_device *dev;
3927 	union {
3928 		unsigned char nh_flags;
3929 		unsigned long event;
3930 	};
3931 };
3932 
3933 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3934 {
3935 	struct fib6_info *iter;
3936 	struct fib6_node *fn;
3937 
3938 	fn = rcu_dereference_protected(rt->fib6_node,
3939 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3940 	iter = rcu_dereference_protected(fn->leaf,
3941 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3942 	while (iter) {
3943 		if (iter->fib6_metric == rt->fib6_metric &&
3944 		    rt6_qualify_for_ecmp(iter))
3945 			return iter;
3946 		iter = rcu_dereference_protected(iter->fib6_next,
3947 				lockdep_is_held(&rt->fib6_table->tb6_lock));
3948 	}
3949 
3950 	return NULL;
3951 }
3952 
3953 static bool rt6_is_dead(const struct fib6_info *rt)
3954 {
3955 	if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3956 	    (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3957 	     ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3958 		return true;
3959 
3960 	return false;
3961 }
3962 
3963 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3964 {
3965 	struct fib6_info *iter;
3966 	int total = 0;
3967 
3968 	if (!rt6_is_dead(rt))
3969 		total += rt->fib6_nh.fib_nh_weight;
3970 
3971 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3972 		if (!rt6_is_dead(iter))
3973 			total += iter->fib6_nh.fib_nh_weight;
3974 	}
3975 
3976 	return total;
3977 }
3978 
3979 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3980 {
3981 	int upper_bound = -1;
3982 
3983 	if (!rt6_is_dead(rt)) {
3984 		*weight += rt->fib6_nh.fib_nh_weight;
3985 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3986 						    total) - 1;
3987 	}
3988 	atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3989 }
3990 
3991 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3992 {
3993 	struct fib6_info *iter;
3994 	int weight = 0;
3995 
3996 	rt6_upper_bound_set(rt, &weight, total);
3997 
3998 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3999 		rt6_upper_bound_set(iter, &weight, total);
4000 }
4001 
4002 void rt6_multipath_rebalance(struct fib6_info *rt)
4003 {
4004 	struct fib6_info *first;
4005 	int total;
4006 
4007 	/* In case the entire multipath route was marked for flushing,
4008 	 * then there is no need to rebalance upon the removal of every
4009 	 * sibling route.
4010 	 */
4011 	if (!rt->fib6_nsiblings || rt->should_flush)
4012 		return;
4013 
4014 	/* During lookup routes are evaluated in order, so we need to
4015 	 * make sure upper bounds are assigned from the first sibling
4016 	 * onwards.
4017 	 */
4018 	first = rt6_multipath_first_sibling(rt);
4019 	if (WARN_ON_ONCE(!first))
4020 		return;
4021 
4022 	total = rt6_multipath_total_weight(first);
4023 	rt6_multipath_upper_bound_set(first, total);
4024 }
4025 
4026 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4027 {
4028 	const struct arg_netdev_event *arg = p_arg;
4029 	struct net *net = dev_net(arg->dev);
4030 
4031 	if (rt != net->ipv6.fib6_null_entry &&
4032 	    rt->fib6_nh.fib_nh_dev == arg->dev) {
4033 		rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
4034 		fib6_update_sernum_upto_root(net, rt);
4035 		rt6_multipath_rebalance(rt);
4036 	}
4037 
4038 	return 0;
4039 }
4040 
4041 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
4042 {
4043 	struct arg_netdev_event arg = {
4044 		.dev = dev,
4045 		{
4046 			.nh_flags = nh_flags,
4047 		},
4048 	};
4049 
4050 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4051 		arg.nh_flags |= RTNH_F_LINKDOWN;
4052 
4053 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4054 }
4055 
4056 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4057 				   const struct net_device *dev)
4058 {
4059 	struct fib6_info *iter;
4060 
4061 	if (rt->fib6_nh.fib_nh_dev == dev)
4062 		return true;
4063 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4064 		if (iter->fib6_nh.fib_nh_dev == dev)
4065 			return true;
4066 
4067 	return false;
4068 }
4069 
4070 static void rt6_multipath_flush(struct fib6_info *rt)
4071 {
4072 	struct fib6_info *iter;
4073 
4074 	rt->should_flush = 1;
4075 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4076 		iter->should_flush = 1;
4077 }
4078 
4079 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4080 					     const struct net_device *down_dev)
4081 {
4082 	struct fib6_info *iter;
4083 	unsigned int dead = 0;
4084 
4085 	if (rt->fib6_nh.fib_nh_dev == down_dev ||
4086 	    rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4087 		dead++;
4088 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4089 		if (iter->fib6_nh.fib_nh_dev == down_dev ||
4090 		    iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4091 			dead++;
4092 
4093 	return dead;
4094 }
4095 
4096 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4097 				       const struct net_device *dev,
4098 				       unsigned char nh_flags)
4099 {
4100 	struct fib6_info *iter;
4101 
4102 	if (rt->fib6_nh.fib_nh_dev == dev)
4103 		rt->fib6_nh.fib_nh_flags |= nh_flags;
4104 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4105 		if (iter->fib6_nh.fib_nh_dev == dev)
4106 			iter->fib6_nh.fib_nh_flags |= nh_flags;
4107 }
4108 
4109 /* called with write lock held for table with rt */
4110 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4111 {
4112 	const struct arg_netdev_event *arg = p_arg;
4113 	const struct net_device *dev = arg->dev;
4114 	struct net *net = dev_net(dev);
4115 
4116 	if (rt == net->ipv6.fib6_null_entry)
4117 		return 0;
4118 
4119 	switch (arg->event) {
4120 	case NETDEV_UNREGISTER:
4121 		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4122 	case NETDEV_DOWN:
4123 		if (rt->should_flush)
4124 			return -1;
4125 		if (!rt->fib6_nsiblings)
4126 			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4127 		if (rt6_multipath_uses_dev(rt, dev)) {
4128 			unsigned int count;
4129 
4130 			count = rt6_multipath_dead_count(rt, dev);
4131 			if (rt->fib6_nsiblings + 1 == count) {
4132 				rt6_multipath_flush(rt);
4133 				return -1;
4134 			}
4135 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4136 						   RTNH_F_LINKDOWN);
4137 			fib6_update_sernum(net, rt);
4138 			rt6_multipath_rebalance(rt);
4139 		}
4140 		return -2;
4141 	case NETDEV_CHANGE:
4142 		if (rt->fib6_nh.fib_nh_dev != dev ||
4143 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4144 			break;
4145 		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4146 		rt6_multipath_rebalance(rt);
4147 		break;
4148 	}
4149 
4150 	return 0;
4151 }
4152 
4153 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4154 {
4155 	struct arg_netdev_event arg = {
4156 		.dev = dev,
4157 		{
4158 			.event = event,
4159 		},
4160 	};
4161 	struct net *net = dev_net(dev);
4162 
4163 	if (net->ipv6.sysctl.skip_notify_on_dev_down)
4164 		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4165 	else
4166 		fib6_clean_all(net, fib6_ifdown, &arg);
4167 }
4168 
4169 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4170 {
4171 	rt6_sync_down_dev(dev, event);
4172 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
4173 	neigh_ifdown(&nd_tbl, dev);
4174 }
4175 
4176 struct rt6_mtu_change_arg {
4177 	struct net_device *dev;
4178 	unsigned int mtu;
4179 };
4180 
4181 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4182 {
4183 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4184 	struct inet6_dev *idev;
4185 
4186 	/* In IPv6 pmtu discovery is not optional,
4187 	   so that RTAX_MTU lock cannot disable it.
4188 	   We still use this lock to block changes
4189 	   caused by addrconf/ndisc.
4190 	*/
4191 
4192 	idev = __in6_dev_get(arg->dev);
4193 	if (!idev)
4194 		return 0;
4195 
4196 	/* For administrative MTU increase, there is no way to discover
4197 	   IPv6 PMTU increase, so PMTU increase should be updated here.
4198 	   Since RFC 1981 doesn't include administrative MTU increase
4199 	   update PMTU increase is a MUST. (i.e. jumbo frame)
4200 	 */
4201 	if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4202 	    !fib6_metric_locked(rt, RTAX_MTU)) {
4203 		u32 mtu = rt->fib6_pmtu;
4204 
4205 		if (mtu >= arg->mtu ||
4206 		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4207 			fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4208 
4209 		spin_lock_bh(&rt6_exception_lock);
4210 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4211 		spin_unlock_bh(&rt6_exception_lock);
4212 	}
4213 	return 0;
4214 }
4215 
4216 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4217 {
4218 	struct rt6_mtu_change_arg arg = {
4219 		.dev = dev,
4220 		.mtu = mtu,
4221 	};
4222 
4223 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4224 }
4225 
4226 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4227 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4228 	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
4229 	[RTA_OIF]               = { .type = NLA_U32 },
4230 	[RTA_IIF]		= { .type = NLA_U32 },
4231 	[RTA_PRIORITY]          = { .type = NLA_U32 },
4232 	[RTA_METRICS]           = { .type = NLA_NESTED },
4233 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
4234 	[RTA_PREF]              = { .type = NLA_U8 },
4235 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
4236 	[RTA_ENCAP]		= { .type = NLA_NESTED },
4237 	[RTA_EXPIRES]		= { .type = NLA_U32 },
4238 	[RTA_UID]		= { .type = NLA_U32 },
4239 	[RTA_MARK]		= { .type = NLA_U32 },
4240 	[RTA_TABLE]		= { .type = NLA_U32 },
4241 	[RTA_IP_PROTO]		= { .type = NLA_U8 },
4242 	[RTA_SPORT]		= { .type = NLA_U16 },
4243 	[RTA_DPORT]		= { .type = NLA_U16 },
4244 };
4245 
4246 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4247 			      struct fib6_config *cfg,
4248 			      struct netlink_ext_ack *extack)
4249 {
4250 	struct rtmsg *rtm;
4251 	struct nlattr *tb[RTA_MAX+1];
4252 	unsigned int pref;
4253 	int err;
4254 
4255 	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4256 				     rtm_ipv6_policy, extack);
4257 	if (err < 0)
4258 		goto errout;
4259 
4260 	err = -EINVAL;
4261 	rtm = nlmsg_data(nlh);
4262 
4263 	*cfg = (struct fib6_config){
4264 		.fc_table = rtm->rtm_table,
4265 		.fc_dst_len = rtm->rtm_dst_len,
4266 		.fc_src_len = rtm->rtm_src_len,
4267 		.fc_flags = RTF_UP,
4268 		.fc_protocol = rtm->rtm_protocol,
4269 		.fc_type = rtm->rtm_type,
4270 
4271 		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
4272 		.fc_nlinfo.nlh = nlh,
4273 		.fc_nlinfo.nl_net = sock_net(skb->sk),
4274 	};
4275 
4276 	if (rtm->rtm_type == RTN_UNREACHABLE ||
4277 	    rtm->rtm_type == RTN_BLACKHOLE ||
4278 	    rtm->rtm_type == RTN_PROHIBIT ||
4279 	    rtm->rtm_type == RTN_THROW)
4280 		cfg->fc_flags |= RTF_REJECT;
4281 
4282 	if (rtm->rtm_type == RTN_LOCAL)
4283 		cfg->fc_flags |= RTF_LOCAL;
4284 
4285 	if (rtm->rtm_flags & RTM_F_CLONED)
4286 		cfg->fc_flags |= RTF_CACHE;
4287 
4288 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4289 
4290 	if (tb[RTA_GATEWAY]) {
4291 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4292 		cfg->fc_flags |= RTF_GATEWAY;
4293 	}
4294 	if (tb[RTA_VIA]) {
4295 		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4296 		goto errout;
4297 	}
4298 
4299 	if (tb[RTA_DST]) {
4300 		int plen = (rtm->rtm_dst_len + 7) >> 3;
4301 
4302 		if (nla_len(tb[RTA_DST]) < plen)
4303 			goto errout;
4304 
4305 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4306 	}
4307 
4308 	if (tb[RTA_SRC]) {
4309 		int plen = (rtm->rtm_src_len + 7) >> 3;
4310 
4311 		if (nla_len(tb[RTA_SRC]) < plen)
4312 			goto errout;
4313 
4314 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4315 	}
4316 
4317 	if (tb[RTA_PREFSRC])
4318 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4319 
4320 	if (tb[RTA_OIF])
4321 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4322 
4323 	if (tb[RTA_PRIORITY])
4324 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4325 
4326 	if (tb[RTA_METRICS]) {
4327 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4328 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4329 	}
4330 
4331 	if (tb[RTA_TABLE])
4332 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4333 
4334 	if (tb[RTA_MULTIPATH]) {
4335 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4336 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4337 
4338 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4339 						     cfg->fc_mp_len, extack);
4340 		if (err < 0)
4341 			goto errout;
4342 	}
4343 
4344 	if (tb[RTA_PREF]) {
4345 		pref = nla_get_u8(tb[RTA_PREF]);
4346 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
4347 		    pref != ICMPV6_ROUTER_PREF_HIGH)
4348 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
4349 		cfg->fc_flags |= RTF_PREF(pref);
4350 	}
4351 
4352 	if (tb[RTA_ENCAP])
4353 		cfg->fc_encap = tb[RTA_ENCAP];
4354 
4355 	if (tb[RTA_ENCAP_TYPE]) {
4356 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4357 
4358 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4359 		if (err < 0)
4360 			goto errout;
4361 	}
4362 
4363 	if (tb[RTA_EXPIRES]) {
4364 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4365 
4366 		if (addrconf_finite_timeout(timeout)) {
4367 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4368 			cfg->fc_flags |= RTF_EXPIRES;
4369 		}
4370 	}
4371 
4372 	err = 0;
4373 errout:
4374 	return err;
4375 }
4376 
4377 struct rt6_nh {
4378 	struct fib6_info *fib6_info;
4379 	struct fib6_config r_cfg;
4380 	struct list_head next;
4381 };
4382 
4383 static int ip6_route_info_append(struct net *net,
4384 				 struct list_head *rt6_nh_list,
4385 				 struct fib6_info *rt,
4386 				 struct fib6_config *r_cfg)
4387 {
4388 	struct rt6_nh *nh;
4389 	int err = -EEXIST;
4390 
4391 	list_for_each_entry(nh, rt6_nh_list, next) {
4392 		/* check if fib6_info already exists */
4393 		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4394 			return err;
4395 	}
4396 
4397 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4398 	if (!nh)
4399 		return -ENOMEM;
4400 	nh->fib6_info = rt;
4401 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4402 	list_add_tail(&nh->next, rt6_nh_list);
4403 
4404 	return 0;
4405 }
4406 
4407 static void ip6_route_mpath_notify(struct fib6_info *rt,
4408 				   struct fib6_info *rt_last,
4409 				   struct nl_info *info,
4410 				   __u16 nlflags)
4411 {
4412 	/* if this is an APPEND route, then rt points to the first route
4413 	 * inserted and rt_last points to last route inserted. Userspace
4414 	 * wants a consistent dump of the route which starts at the first
4415 	 * nexthop. Since sibling routes are always added at the end of
4416 	 * the list, find the first sibling of the last route appended
4417 	 */
4418 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4419 		rt = list_first_entry(&rt_last->fib6_siblings,
4420 				      struct fib6_info,
4421 				      fib6_siblings);
4422 	}
4423 
4424 	if (rt)
4425 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4426 }
4427 
4428 static int ip6_route_multipath_add(struct fib6_config *cfg,
4429 				   struct netlink_ext_ack *extack)
4430 {
4431 	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4432 	struct nl_info *info = &cfg->fc_nlinfo;
4433 	struct fib6_config r_cfg;
4434 	struct rtnexthop *rtnh;
4435 	struct fib6_info *rt;
4436 	struct rt6_nh *err_nh;
4437 	struct rt6_nh *nh, *nh_safe;
4438 	__u16 nlflags;
4439 	int remaining;
4440 	int attrlen;
4441 	int err = 1;
4442 	int nhn = 0;
4443 	int replace = (cfg->fc_nlinfo.nlh &&
4444 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4445 	LIST_HEAD(rt6_nh_list);
4446 
4447 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4448 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4449 		nlflags |= NLM_F_APPEND;
4450 
4451 	remaining = cfg->fc_mp_len;
4452 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4453 
4454 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
4455 	 * fib6_info structs per nexthop
4456 	 */
4457 	while (rtnh_ok(rtnh, remaining)) {
4458 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4459 		if (rtnh->rtnh_ifindex)
4460 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4461 
4462 		attrlen = rtnh_attrlen(rtnh);
4463 		if (attrlen > 0) {
4464 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4465 
4466 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4467 			if (nla) {
4468 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4469 				r_cfg.fc_flags |= RTF_GATEWAY;
4470 			}
4471 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4472 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4473 			if (nla)
4474 				r_cfg.fc_encap_type = nla_get_u16(nla);
4475 		}
4476 
4477 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4478 		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4479 		if (IS_ERR(rt)) {
4480 			err = PTR_ERR(rt);
4481 			rt = NULL;
4482 			goto cleanup;
4483 		}
4484 		if (!rt6_qualify_for_ecmp(rt)) {
4485 			err = -EINVAL;
4486 			NL_SET_ERR_MSG(extack,
4487 				       "Device only routes can not be added for IPv6 using the multipath API.");
4488 			fib6_info_release(rt);
4489 			goto cleanup;
4490 		}
4491 
4492 		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4493 
4494 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4495 					    rt, &r_cfg);
4496 		if (err) {
4497 			fib6_info_release(rt);
4498 			goto cleanup;
4499 		}
4500 
4501 		rtnh = rtnh_next(rtnh, &remaining);
4502 	}
4503 
4504 	/* for add and replace send one notification with all nexthops.
4505 	 * Skip the notification in fib6_add_rt2node and send one with
4506 	 * the full route when done
4507 	 */
4508 	info->skip_notify = 1;
4509 
4510 	err_nh = NULL;
4511 	list_for_each_entry(nh, &rt6_nh_list, next) {
4512 		err = __ip6_ins_rt(nh->fib6_info, info, extack);
4513 		fib6_info_release(nh->fib6_info);
4514 
4515 		if (!err) {
4516 			/* save reference to last route successfully inserted */
4517 			rt_last = nh->fib6_info;
4518 
4519 			/* save reference to first route for notification */
4520 			if (!rt_notif)
4521 				rt_notif = nh->fib6_info;
4522 		}
4523 
4524 		/* nh->fib6_info is used or freed at this point, reset to NULL*/
4525 		nh->fib6_info = NULL;
4526 		if (err) {
4527 			if (replace && nhn)
4528 				NL_SET_ERR_MSG_MOD(extack,
4529 						   "multipath route replace failed (check consistency of installed routes)");
4530 			err_nh = nh;
4531 			goto add_errout;
4532 		}
4533 
4534 		/* Because each route is added like a single route we remove
4535 		 * these flags after the first nexthop: if there is a collision,
4536 		 * we have already failed to add the first nexthop:
4537 		 * fib6_add_rt2node() has rejected it; when replacing, old
4538 		 * nexthops have been replaced by first new, the rest should
4539 		 * be added to it.
4540 		 */
4541 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4542 						     NLM_F_REPLACE);
4543 		nhn++;
4544 	}
4545 
4546 	/* success ... tell user about new route */
4547 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4548 	goto cleanup;
4549 
4550 add_errout:
4551 	/* send notification for routes that were added so that
4552 	 * the delete notifications sent by ip6_route_del are
4553 	 * coherent
4554 	 */
4555 	if (rt_notif)
4556 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4557 
4558 	/* Delete routes that were already added */
4559 	list_for_each_entry(nh, &rt6_nh_list, next) {
4560 		if (err_nh == nh)
4561 			break;
4562 		ip6_route_del(&nh->r_cfg, extack);
4563 	}
4564 
4565 cleanup:
4566 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4567 		if (nh->fib6_info)
4568 			fib6_info_release(nh->fib6_info);
4569 		list_del(&nh->next);
4570 		kfree(nh);
4571 	}
4572 
4573 	return err;
4574 }
4575 
4576 static int ip6_route_multipath_del(struct fib6_config *cfg,
4577 				   struct netlink_ext_ack *extack)
4578 {
4579 	struct fib6_config r_cfg;
4580 	struct rtnexthop *rtnh;
4581 	int remaining;
4582 	int attrlen;
4583 	int err = 1, last_err = 0;
4584 
4585 	remaining = cfg->fc_mp_len;
4586 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4587 
4588 	/* Parse a Multipath Entry */
4589 	while (rtnh_ok(rtnh, remaining)) {
4590 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4591 		if (rtnh->rtnh_ifindex)
4592 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4593 
4594 		attrlen = rtnh_attrlen(rtnh);
4595 		if (attrlen > 0) {
4596 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4597 
4598 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4599 			if (nla) {
4600 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4601 				r_cfg.fc_flags |= RTF_GATEWAY;
4602 			}
4603 		}
4604 		err = ip6_route_del(&r_cfg, extack);
4605 		if (err)
4606 			last_err = err;
4607 
4608 		rtnh = rtnh_next(rtnh, &remaining);
4609 	}
4610 
4611 	return last_err;
4612 }
4613 
4614 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4615 			      struct netlink_ext_ack *extack)
4616 {
4617 	struct fib6_config cfg;
4618 	int err;
4619 
4620 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4621 	if (err < 0)
4622 		return err;
4623 
4624 	if (cfg.fc_mp)
4625 		return ip6_route_multipath_del(&cfg, extack);
4626 	else {
4627 		cfg.fc_delete_all_nh = 1;
4628 		return ip6_route_del(&cfg, extack);
4629 	}
4630 }
4631 
4632 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4633 			      struct netlink_ext_ack *extack)
4634 {
4635 	struct fib6_config cfg;
4636 	int err;
4637 
4638 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4639 	if (err < 0)
4640 		return err;
4641 
4642 	if (cfg.fc_metric == 0)
4643 		cfg.fc_metric = IP6_RT_PRIO_USER;
4644 
4645 	if (cfg.fc_mp)
4646 		return ip6_route_multipath_add(&cfg, extack);
4647 	else
4648 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
4649 }
4650 
4651 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4652 {
4653 	int nexthop_len = 0;
4654 
4655 	if (rt->fib6_nsiblings) {
4656 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4657 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4658 			    + nla_total_size(16) /* RTA_GATEWAY */
4659 			    + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4660 
4661 		nexthop_len *= rt->fib6_nsiblings;
4662 	}
4663 
4664 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4665 	       + nla_total_size(16) /* RTA_SRC */
4666 	       + nla_total_size(16) /* RTA_DST */
4667 	       + nla_total_size(16) /* RTA_GATEWAY */
4668 	       + nla_total_size(16) /* RTA_PREFSRC */
4669 	       + nla_total_size(4) /* RTA_TABLE */
4670 	       + nla_total_size(4) /* RTA_IIF */
4671 	       + nla_total_size(4) /* RTA_OIF */
4672 	       + nla_total_size(4) /* RTA_PRIORITY */
4673 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4674 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4675 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4676 	       + nla_total_size(1) /* RTA_PREF */
4677 	       + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4678 	       + nexthop_len;
4679 }
4680 
4681 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4682 			 struct fib6_info *rt, struct dst_entry *dst,
4683 			 struct in6_addr *dest, struct in6_addr *src,
4684 			 int iif, int type, u32 portid, u32 seq,
4685 			 unsigned int flags)
4686 {
4687 	struct rt6_info *rt6 = (struct rt6_info *)dst;
4688 	struct rt6key *rt6_dst, *rt6_src;
4689 	u32 *pmetrics, table, rt6_flags;
4690 	struct nlmsghdr *nlh;
4691 	struct rtmsg *rtm;
4692 	long expires = 0;
4693 
4694 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4695 	if (!nlh)
4696 		return -EMSGSIZE;
4697 
4698 	if (rt6) {
4699 		rt6_dst = &rt6->rt6i_dst;
4700 		rt6_src = &rt6->rt6i_src;
4701 		rt6_flags = rt6->rt6i_flags;
4702 	} else {
4703 		rt6_dst = &rt->fib6_dst;
4704 		rt6_src = &rt->fib6_src;
4705 		rt6_flags = rt->fib6_flags;
4706 	}
4707 
4708 	rtm = nlmsg_data(nlh);
4709 	rtm->rtm_family = AF_INET6;
4710 	rtm->rtm_dst_len = rt6_dst->plen;
4711 	rtm->rtm_src_len = rt6_src->plen;
4712 	rtm->rtm_tos = 0;
4713 	if (rt->fib6_table)
4714 		table = rt->fib6_table->tb6_id;
4715 	else
4716 		table = RT6_TABLE_UNSPEC;
4717 	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4718 	if (nla_put_u32(skb, RTA_TABLE, table))
4719 		goto nla_put_failure;
4720 
4721 	rtm->rtm_type = rt->fib6_type;
4722 	rtm->rtm_flags = 0;
4723 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4724 	rtm->rtm_protocol = rt->fib6_protocol;
4725 
4726 	if (rt6_flags & RTF_CACHE)
4727 		rtm->rtm_flags |= RTM_F_CLONED;
4728 
4729 	if (dest) {
4730 		if (nla_put_in6_addr(skb, RTA_DST, dest))
4731 			goto nla_put_failure;
4732 		rtm->rtm_dst_len = 128;
4733 	} else if (rtm->rtm_dst_len)
4734 		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4735 			goto nla_put_failure;
4736 #ifdef CONFIG_IPV6_SUBTREES
4737 	if (src) {
4738 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4739 			goto nla_put_failure;
4740 		rtm->rtm_src_len = 128;
4741 	} else if (rtm->rtm_src_len &&
4742 		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4743 		goto nla_put_failure;
4744 #endif
4745 	if (iif) {
4746 #ifdef CONFIG_IPV6_MROUTE
4747 		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4748 			int err = ip6mr_get_route(net, skb, rtm, portid);
4749 
4750 			if (err == 0)
4751 				return 0;
4752 			if (err < 0)
4753 				goto nla_put_failure;
4754 		} else
4755 #endif
4756 			if (nla_put_u32(skb, RTA_IIF, iif))
4757 				goto nla_put_failure;
4758 	} else if (dest) {
4759 		struct in6_addr saddr_buf;
4760 		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4761 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4762 			goto nla_put_failure;
4763 	}
4764 
4765 	if (rt->fib6_prefsrc.plen) {
4766 		struct in6_addr saddr_buf;
4767 		saddr_buf = rt->fib6_prefsrc.addr;
4768 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4769 			goto nla_put_failure;
4770 	}
4771 
4772 	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4773 	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4774 		goto nla_put_failure;
4775 
4776 	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4777 		goto nla_put_failure;
4778 
4779 	/* For multipath routes, walk the siblings list and add
4780 	 * each as a nexthop within RTA_MULTIPATH.
4781 	 */
4782 	if (rt6) {
4783 		if (rt6_flags & RTF_GATEWAY &&
4784 		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4785 			goto nla_put_failure;
4786 
4787 		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4788 			goto nla_put_failure;
4789 	} else if (rt->fib6_nsiblings) {
4790 		struct fib6_info *sibling, *next_sibling;
4791 		struct nlattr *mp;
4792 
4793 		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
4794 		if (!mp)
4795 			goto nla_put_failure;
4796 
4797 		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
4798 				    rt->fib6_nh.fib_nh_weight) < 0)
4799 			goto nla_put_failure;
4800 
4801 		list_for_each_entry_safe(sibling, next_sibling,
4802 					 &rt->fib6_siblings, fib6_siblings) {
4803 			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
4804 					    sibling->fib6_nh.fib_nh_weight) < 0)
4805 				goto nla_put_failure;
4806 		}
4807 
4808 		nla_nest_end(skb, mp);
4809 	} else {
4810 		unsigned char nh_flags = 0;
4811 
4812 		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
4813 				     &nh_flags, false) < 0)
4814 			goto nla_put_failure;
4815 
4816 		rtm->rtm_flags |= nh_flags;
4817 	}
4818 
4819 	if (rt6_flags & RTF_EXPIRES) {
4820 		expires = dst ? dst->expires : rt->expires;
4821 		expires -= jiffies;
4822 	}
4823 
4824 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4825 		goto nla_put_failure;
4826 
4827 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4828 		goto nla_put_failure;
4829 
4830 
4831 	nlmsg_end(skb, nlh);
4832 	return 0;
4833 
4834 nla_put_failure:
4835 	nlmsg_cancel(skb, nlh);
4836 	return -EMSGSIZE;
4837 }
4838 
4839 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4840 			       const struct net_device *dev)
4841 {
4842 	if (f6i->fib6_nh.fib_nh_dev == dev)
4843 		return true;
4844 
4845 	if (f6i->fib6_nsiblings) {
4846 		struct fib6_info *sibling, *next_sibling;
4847 
4848 		list_for_each_entry_safe(sibling, next_sibling,
4849 					 &f6i->fib6_siblings, fib6_siblings) {
4850 			if (sibling->fib6_nh.fib_nh_dev == dev)
4851 				return true;
4852 		}
4853 	}
4854 
4855 	return false;
4856 }
4857 
4858 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4859 {
4860 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4861 	struct fib_dump_filter *filter = &arg->filter;
4862 	unsigned int flags = NLM_F_MULTI;
4863 	struct net *net = arg->net;
4864 
4865 	if (rt == net->ipv6.fib6_null_entry)
4866 		return 0;
4867 
4868 	if ((filter->flags & RTM_F_PREFIX) &&
4869 	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
4870 		/* success since this is not a prefix route */
4871 		return 1;
4872 	}
4873 	if (filter->filter_set) {
4874 		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4875 		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4876 		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4877 			return 1;
4878 		}
4879 		flags |= NLM_F_DUMP_FILTERED;
4880 	}
4881 
4882 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4883 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4884 			     arg->cb->nlh->nlmsg_seq, flags);
4885 }
4886 
4887 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4888 					const struct nlmsghdr *nlh,
4889 					struct nlattr **tb,
4890 					struct netlink_ext_ack *extack)
4891 {
4892 	struct rtmsg *rtm;
4893 	int i, err;
4894 
4895 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4896 		NL_SET_ERR_MSG_MOD(extack,
4897 				   "Invalid header for get route request");
4898 		return -EINVAL;
4899 	}
4900 
4901 	if (!netlink_strict_get_check(skb))
4902 		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4903 					      rtm_ipv6_policy, extack);
4904 
4905 	rtm = nlmsg_data(nlh);
4906 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4907 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4908 	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4909 	    rtm->rtm_type) {
4910 		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4911 		return -EINVAL;
4912 	}
4913 	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4914 		NL_SET_ERR_MSG_MOD(extack,
4915 				   "Invalid flags for get route request");
4916 		return -EINVAL;
4917 	}
4918 
4919 	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4920 					    rtm_ipv6_policy, extack);
4921 	if (err)
4922 		return err;
4923 
4924 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4925 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4926 		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4927 		return -EINVAL;
4928 	}
4929 
4930 	for (i = 0; i <= RTA_MAX; i++) {
4931 		if (!tb[i])
4932 			continue;
4933 
4934 		switch (i) {
4935 		case RTA_SRC:
4936 		case RTA_DST:
4937 		case RTA_IIF:
4938 		case RTA_OIF:
4939 		case RTA_MARK:
4940 		case RTA_UID:
4941 		case RTA_SPORT:
4942 		case RTA_DPORT:
4943 		case RTA_IP_PROTO:
4944 			break;
4945 		default:
4946 			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4947 			return -EINVAL;
4948 		}
4949 	}
4950 
4951 	return 0;
4952 }
4953 
4954 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4955 			      struct netlink_ext_ack *extack)
4956 {
4957 	struct net *net = sock_net(in_skb->sk);
4958 	struct nlattr *tb[RTA_MAX+1];
4959 	int err, iif = 0, oif = 0;
4960 	struct fib6_info *from;
4961 	struct dst_entry *dst;
4962 	struct rt6_info *rt;
4963 	struct sk_buff *skb;
4964 	struct rtmsg *rtm;
4965 	struct flowi6 fl6 = {};
4966 	bool fibmatch;
4967 
4968 	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4969 	if (err < 0)
4970 		goto errout;
4971 
4972 	err = -EINVAL;
4973 	rtm = nlmsg_data(nlh);
4974 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4975 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4976 
4977 	if (tb[RTA_SRC]) {
4978 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4979 			goto errout;
4980 
4981 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4982 	}
4983 
4984 	if (tb[RTA_DST]) {
4985 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4986 			goto errout;
4987 
4988 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4989 	}
4990 
4991 	if (tb[RTA_IIF])
4992 		iif = nla_get_u32(tb[RTA_IIF]);
4993 
4994 	if (tb[RTA_OIF])
4995 		oif = nla_get_u32(tb[RTA_OIF]);
4996 
4997 	if (tb[RTA_MARK])
4998 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4999 
5000 	if (tb[RTA_UID])
5001 		fl6.flowi6_uid = make_kuid(current_user_ns(),
5002 					   nla_get_u32(tb[RTA_UID]));
5003 	else
5004 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
5005 
5006 	if (tb[RTA_SPORT])
5007 		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
5008 
5009 	if (tb[RTA_DPORT])
5010 		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
5011 
5012 	if (tb[RTA_IP_PROTO]) {
5013 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
5014 						  &fl6.flowi6_proto, AF_INET6,
5015 						  extack);
5016 		if (err)
5017 			goto errout;
5018 	}
5019 
5020 	if (iif) {
5021 		struct net_device *dev;
5022 		int flags = 0;
5023 
5024 		rcu_read_lock();
5025 
5026 		dev = dev_get_by_index_rcu(net, iif);
5027 		if (!dev) {
5028 			rcu_read_unlock();
5029 			err = -ENODEV;
5030 			goto errout;
5031 		}
5032 
5033 		fl6.flowi6_iif = iif;
5034 
5035 		if (!ipv6_addr_any(&fl6.saddr))
5036 			flags |= RT6_LOOKUP_F_HAS_SADDR;
5037 
5038 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5039 
5040 		rcu_read_unlock();
5041 	} else {
5042 		fl6.flowi6_oif = oif;
5043 
5044 		dst = ip6_route_output(net, NULL, &fl6);
5045 	}
5046 
5047 
5048 	rt = container_of(dst, struct rt6_info, dst);
5049 	if (rt->dst.error) {
5050 		err = rt->dst.error;
5051 		ip6_rt_put(rt);
5052 		goto errout;
5053 	}
5054 
5055 	if (rt == net->ipv6.ip6_null_entry) {
5056 		err = rt->dst.error;
5057 		ip6_rt_put(rt);
5058 		goto errout;
5059 	}
5060 
5061 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5062 	if (!skb) {
5063 		ip6_rt_put(rt);
5064 		err = -ENOBUFS;
5065 		goto errout;
5066 	}
5067 
5068 	skb_dst_set(skb, &rt->dst);
5069 
5070 	rcu_read_lock();
5071 	from = rcu_dereference(rt->from);
5072 	if (from) {
5073 		if (fibmatch)
5074 			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
5075 					    iif, RTM_NEWROUTE,
5076 					    NETLINK_CB(in_skb).portid,
5077 					    nlh->nlmsg_seq, 0);
5078 		else
5079 			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5080 					    &fl6.saddr, iif, RTM_NEWROUTE,
5081 					    NETLINK_CB(in_skb).portid,
5082 					    nlh->nlmsg_seq, 0);
5083 	} else {
5084 		err = -ENETUNREACH;
5085 	}
5086 	rcu_read_unlock();
5087 
5088 	if (err < 0) {
5089 		kfree_skb(skb);
5090 		goto errout;
5091 	}
5092 
5093 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5094 errout:
5095 	return err;
5096 }
5097 
5098 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5099 		     unsigned int nlm_flags)
5100 {
5101 	struct sk_buff *skb;
5102 	struct net *net = info->nl_net;
5103 	u32 seq;
5104 	int err;
5105 
5106 	err = -ENOBUFS;
5107 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5108 
5109 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5110 	if (!skb)
5111 		goto errout;
5112 
5113 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5114 			    event, info->portid, seq, nlm_flags);
5115 	if (err < 0) {
5116 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5117 		WARN_ON(err == -EMSGSIZE);
5118 		kfree_skb(skb);
5119 		goto errout;
5120 	}
5121 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5122 		    info->nlh, gfp_any());
5123 	return;
5124 errout:
5125 	if (err < 0)
5126 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5127 }
5128 
5129 static int ip6_route_dev_notify(struct notifier_block *this,
5130 				unsigned long event, void *ptr)
5131 {
5132 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5133 	struct net *net = dev_net(dev);
5134 
5135 	if (!(dev->flags & IFF_LOOPBACK))
5136 		return NOTIFY_OK;
5137 
5138 	if (event == NETDEV_REGISTER) {
5139 		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5140 		net->ipv6.ip6_null_entry->dst.dev = dev;
5141 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5142 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5143 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5144 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5145 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5146 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5147 #endif
5148 	 } else if (event == NETDEV_UNREGISTER &&
5149 		    dev->reg_state != NETREG_UNREGISTERED) {
5150 		/* NETDEV_UNREGISTER could be fired for multiple times by
5151 		 * netdev_wait_allrefs(). Make sure we only call this once.
5152 		 */
5153 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5154 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5155 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5156 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5157 #endif
5158 	}
5159 
5160 	return NOTIFY_OK;
5161 }
5162 
5163 /*
5164  *	/proc
5165  */
5166 
5167 #ifdef CONFIG_PROC_FS
5168 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5169 {
5170 	struct net *net = (struct net *)seq->private;
5171 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5172 		   net->ipv6.rt6_stats->fib_nodes,
5173 		   net->ipv6.rt6_stats->fib_route_nodes,
5174 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5175 		   net->ipv6.rt6_stats->fib_rt_entries,
5176 		   net->ipv6.rt6_stats->fib_rt_cache,
5177 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5178 		   net->ipv6.rt6_stats->fib_discarded_routes);
5179 
5180 	return 0;
5181 }
5182 #endif	/* CONFIG_PROC_FS */
5183 
5184 #ifdef CONFIG_SYSCTL
5185 
5186 static
5187 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5188 			      void __user *buffer, size_t *lenp, loff_t *ppos)
5189 {
5190 	struct net *net;
5191 	int delay;
5192 	int ret;
5193 	if (!write)
5194 		return -EINVAL;
5195 
5196 	net = (struct net *)ctl->extra1;
5197 	delay = net->ipv6.sysctl.flush_delay;
5198 	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5199 	if (ret)
5200 		return ret;
5201 
5202 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5203 	return 0;
5204 }
5205 
5206 static int zero;
5207 static int one = 1;
5208 
5209 static struct ctl_table ipv6_route_table_template[] = {
5210 	{
5211 		.procname	=	"flush",
5212 		.data		=	&init_net.ipv6.sysctl.flush_delay,
5213 		.maxlen		=	sizeof(int),
5214 		.mode		=	0200,
5215 		.proc_handler	=	ipv6_sysctl_rtcache_flush
5216 	},
5217 	{
5218 		.procname	=	"gc_thresh",
5219 		.data		=	&ip6_dst_ops_template.gc_thresh,
5220 		.maxlen		=	sizeof(int),
5221 		.mode		=	0644,
5222 		.proc_handler	=	proc_dointvec,
5223 	},
5224 	{
5225 		.procname	=	"max_size",
5226 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
5227 		.maxlen		=	sizeof(int),
5228 		.mode		=	0644,
5229 		.proc_handler	=	proc_dointvec,
5230 	},
5231 	{
5232 		.procname	=	"gc_min_interval",
5233 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5234 		.maxlen		=	sizeof(int),
5235 		.mode		=	0644,
5236 		.proc_handler	=	proc_dointvec_jiffies,
5237 	},
5238 	{
5239 		.procname	=	"gc_timeout",
5240 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5241 		.maxlen		=	sizeof(int),
5242 		.mode		=	0644,
5243 		.proc_handler	=	proc_dointvec_jiffies,
5244 	},
5245 	{
5246 		.procname	=	"gc_interval",
5247 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
5248 		.maxlen		=	sizeof(int),
5249 		.mode		=	0644,
5250 		.proc_handler	=	proc_dointvec_jiffies,
5251 	},
5252 	{
5253 		.procname	=	"gc_elasticity",
5254 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5255 		.maxlen		=	sizeof(int),
5256 		.mode		=	0644,
5257 		.proc_handler	=	proc_dointvec,
5258 	},
5259 	{
5260 		.procname	=	"mtu_expires",
5261 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5262 		.maxlen		=	sizeof(int),
5263 		.mode		=	0644,
5264 		.proc_handler	=	proc_dointvec_jiffies,
5265 	},
5266 	{
5267 		.procname	=	"min_adv_mss",
5268 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
5269 		.maxlen		=	sizeof(int),
5270 		.mode		=	0644,
5271 		.proc_handler	=	proc_dointvec,
5272 	},
5273 	{
5274 		.procname	=	"gc_min_interval_ms",
5275 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5276 		.maxlen		=	sizeof(int),
5277 		.mode		=	0644,
5278 		.proc_handler	=	proc_dointvec_ms_jiffies,
5279 	},
5280 	{
5281 		.procname	=	"skip_notify_on_dev_down",
5282 		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
5283 		.maxlen		=	sizeof(int),
5284 		.mode		=	0644,
5285 		.proc_handler	=	proc_dointvec_minmax,
5286 		.extra1		=	&zero,
5287 		.extra2		=	&one,
5288 	},
5289 	{ }
5290 };
5291 
5292 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5293 {
5294 	struct ctl_table *table;
5295 
5296 	table = kmemdup(ipv6_route_table_template,
5297 			sizeof(ipv6_route_table_template),
5298 			GFP_KERNEL);
5299 
5300 	if (table) {
5301 		table[0].data = &net->ipv6.sysctl.flush_delay;
5302 		table[0].extra1 = net;
5303 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5304 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5305 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5306 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5307 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5308 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5309 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5310 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5311 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5312 		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5313 
5314 		/* Don't export sysctls to unprivileged users */
5315 		if (net->user_ns != &init_user_ns)
5316 			table[0].procname = NULL;
5317 	}
5318 
5319 	return table;
5320 }
5321 #endif
5322 
5323 static int __net_init ip6_route_net_init(struct net *net)
5324 {
5325 	int ret = -ENOMEM;
5326 
5327 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5328 	       sizeof(net->ipv6.ip6_dst_ops));
5329 
5330 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5331 		goto out_ip6_dst_ops;
5332 
5333 	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5334 					    sizeof(*net->ipv6.fib6_null_entry),
5335 					    GFP_KERNEL);
5336 	if (!net->ipv6.fib6_null_entry)
5337 		goto out_ip6_dst_entries;
5338 
5339 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5340 					   sizeof(*net->ipv6.ip6_null_entry),
5341 					   GFP_KERNEL);
5342 	if (!net->ipv6.ip6_null_entry)
5343 		goto out_fib6_null_entry;
5344 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5345 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5346 			 ip6_template_metrics, true);
5347 
5348 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5349 	net->ipv6.fib6_has_custom_rules = false;
5350 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5351 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5352 					       GFP_KERNEL);
5353 	if (!net->ipv6.ip6_prohibit_entry)
5354 		goto out_ip6_null_entry;
5355 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5356 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5357 			 ip6_template_metrics, true);
5358 
5359 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5360 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5361 					       GFP_KERNEL);
5362 	if (!net->ipv6.ip6_blk_hole_entry)
5363 		goto out_ip6_prohibit_entry;
5364 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5365 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5366 			 ip6_template_metrics, true);
5367 #endif
5368 
5369 	net->ipv6.sysctl.flush_delay = 0;
5370 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5371 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5372 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5373 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5374 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5375 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5376 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5377 	net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5378 
5379 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5380 
5381 	ret = 0;
5382 out:
5383 	return ret;
5384 
5385 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5386 out_ip6_prohibit_entry:
5387 	kfree(net->ipv6.ip6_prohibit_entry);
5388 out_ip6_null_entry:
5389 	kfree(net->ipv6.ip6_null_entry);
5390 #endif
5391 out_fib6_null_entry:
5392 	kfree(net->ipv6.fib6_null_entry);
5393 out_ip6_dst_entries:
5394 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5395 out_ip6_dst_ops:
5396 	goto out;
5397 }
5398 
5399 static void __net_exit ip6_route_net_exit(struct net *net)
5400 {
5401 	kfree(net->ipv6.fib6_null_entry);
5402 	kfree(net->ipv6.ip6_null_entry);
5403 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5404 	kfree(net->ipv6.ip6_prohibit_entry);
5405 	kfree(net->ipv6.ip6_blk_hole_entry);
5406 #endif
5407 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5408 }
5409 
5410 static int __net_init ip6_route_net_init_late(struct net *net)
5411 {
5412 #ifdef CONFIG_PROC_FS
5413 	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5414 			sizeof(struct ipv6_route_iter));
5415 	proc_create_net_single("rt6_stats", 0444, net->proc_net,
5416 			rt6_stats_seq_show, NULL);
5417 #endif
5418 	return 0;
5419 }
5420 
5421 static void __net_exit ip6_route_net_exit_late(struct net *net)
5422 {
5423 #ifdef CONFIG_PROC_FS
5424 	remove_proc_entry("ipv6_route", net->proc_net);
5425 	remove_proc_entry("rt6_stats", net->proc_net);
5426 #endif
5427 }
5428 
5429 static struct pernet_operations ip6_route_net_ops = {
5430 	.init = ip6_route_net_init,
5431 	.exit = ip6_route_net_exit,
5432 };
5433 
5434 static int __net_init ipv6_inetpeer_init(struct net *net)
5435 {
5436 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5437 
5438 	if (!bp)
5439 		return -ENOMEM;
5440 	inet_peer_base_init(bp);
5441 	net->ipv6.peers = bp;
5442 	return 0;
5443 }
5444 
5445 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5446 {
5447 	struct inet_peer_base *bp = net->ipv6.peers;
5448 
5449 	net->ipv6.peers = NULL;
5450 	inetpeer_invalidate_tree(bp);
5451 	kfree(bp);
5452 }
5453 
5454 static struct pernet_operations ipv6_inetpeer_ops = {
5455 	.init	=	ipv6_inetpeer_init,
5456 	.exit	=	ipv6_inetpeer_exit,
5457 };
5458 
5459 static struct pernet_operations ip6_route_net_late_ops = {
5460 	.init = ip6_route_net_init_late,
5461 	.exit = ip6_route_net_exit_late,
5462 };
5463 
5464 static struct notifier_block ip6_route_dev_notifier = {
5465 	.notifier_call = ip6_route_dev_notify,
5466 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5467 };
5468 
5469 void __init ip6_route_init_special_entries(void)
5470 {
5471 	/* Registering of the loopback is done before this portion of code,
5472 	 * the loopback reference in rt6_info will not be taken, do it
5473 	 * manually for init_net */
5474 	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
5475 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5476 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5477   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5478 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5479 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5480 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5481 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5482   #endif
5483 }
5484 
5485 int __init ip6_route_init(void)
5486 {
5487 	int ret;
5488 	int cpu;
5489 
5490 	ret = -ENOMEM;
5491 	ip6_dst_ops_template.kmem_cachep =
5492 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5493 				  SLAB_HWCACHE_ALIGN, NULL);
5494 	if (!ip6_dst_ops_template.kmem_cachep)
5495 		goto out;
5496 
5497 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5498 	if (ret)
5499 		goto out_kmem_cache;
5500 
5501 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5502 	if (ret)
5503 		goto out_dst_entries;
5504 
5505 	ret = register_pernet_subsys(&ip6_route_net_ops);
5506 	if (ret)
5507 		goto out_register_inetpeer;
5508 
5509 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5510 
5511 	ret = fib6_init();
5512 	if (ret)
5513 		goto out_register_subsys;
5514 
5515 	ret = xfrm6_init();
5516 	if (ret)
5517 		goto out_fib6_init;
5518 
5519 	ret = fib6_rules_init();
5520 	if (ret)
5521 		goto xfrm6_init;
5522 
5523 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5524 	if (ret)
5525 		goto fib6_rules_init;
5526 
5527 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5528 				   inet6_rtm_newroute, NULL, 0);
5529 	if (ret < 0)
5530 		goto out_register_late_subsys;
5531 
5532 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5533 				   inet6_rtm_delroute, NULL, 0);
5534 	if (ret < 0)
5535 		goto out_register_late_subsys;
5536 
5537 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5538 				   inet6_rtm_getroute, NULL,
5539 				   RTNL_FLAG_DOIT_UNLOCKED);
5540 	if (ret < 0)
5541 		goto out_register_late_subsys;
5542 
5543 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5544 	if (ret)
5545 		goto out_register_late_subsys;
5546 
5547 	for_each_possible_cpu(cpu) {
5548 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5549 
5550 		INIT_LIST_HEAD(&ul->head);
5551 		spin_lock_init(&ul->lock);
5552 	}
5553 
5554 out:
5555 	return ret;
5556 
5557 out_register_late_subsys:
5558 	rtnl_unregister_all(PF_INET6);
5559 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5560 fib6_rules_init:
5561 	fib6_rules_cleanup();
5562 xfrm6_init:
5563 	xfrm6_fini();
5564 out_fib6_init:
5565 	fib6_gc_cleanup();
5566 out_register_subsys:
5567 	unregister_pernet_subsys(&ip6_route_net_ops);
5568 out_register_inetpeer:
5569 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5570 out_dst_entries:
5571 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5572 out_kmem_cache:
5573 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5574 	goto out;
5575 }
5576 
5577 void ip6_route_cleanup(void)
5578 {
5579 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5580 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5581 	fib6_rules_cleanup();
5582 	xfrm6_fini();
5583 	fib6_gc_cleanup();
5584 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5585 	unregister_pernet_subsys(&ip6_route_net_ops);
5586 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5587 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5588 }
5589