xref: /linux/net/ipv6/route.c (revision f25377ee4fb1118650a08b403234aa6f57ce25a9)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68 
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72 
73 static int ip6_rt_type_to_error(u8 fib6_type);
74 
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79 
80 enum rt6_nud_state {
81 	RT6_NUD_FAIL_HARD = -3,
82 	RT6_NUD_FAIL_PROBE = -2,
83 	RT6_NUD_FAIL_DO_RR = -1,
84 	RT6_NUD_SUCCEED = 1
85 };
86 
87 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void		ip6_dst_destroy(struct dst_entry *);
92 static void		ip6_dst_ifdown(struct dst_entry *,
93 				       struct net_device *dev, int how);
94 static int		 ip6_dst_gc(struct dst_ops *ops);
95 
96 static int		ip6_pkt_discard(struct sk_buff *skb);
97 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int		ip6_pkt_prohibit(struct sk_buff *skb);
99 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void		ip6_link_failure(struct sk_buff *skb);
101 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102 					   struct sk_buff *skb, u32 mtu);
103 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104 					struct sk_buff *skb);
105 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
106 			   int strict);
107 static size_t rt6_nlmsg_size(struct fib6_info *rt);
108 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109 			 struct fib6_info *rt, struct dst_entry *dst,
110 			 struct in6_addr *dest, struct in6_addr *src,
111 			 int iif, int type, u32 portid, u32 seq,
112 			 unsigned int flags);
113 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
114 					   struct in6_addr *daddr,
115 					   struct in6_addr *saddr);
116 
117 #ifdef CONFIG_IPV6_ROUTE_INFO
118 static struct fib6_info *rt6_add_route_info(struct net *net,
119 					   const struct in6_addr *prefix, int prefixlen,
120 					   const struct in6_addr *gwaddr,
121 					   struct net_device *dev,
122 					   unsigned int pref);
123 static struct fib6_info *rt6_get_route_info(struct net *net,
124 					   const struct in6_addr *prefix, int prefixlen,
125 					   const struct in6_addr *gwaddr,
126 					   struct net_device *dev);
127 #endif
128 
129 struct uncached_list {
130 	spinlock_t		lock;
131 	struct list_head	head;
132 };
133 
134 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
135 
136 void rt6_uncached_list_add(struct rt6_info *rt)
137 {
138 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
139 
140 	rt->rt6i_uncached_list = ul;
141 
142 	spin_lock_bh(&ul->lock);
143 	list_add_tail(&rt->rt6i_uncached, &ul->head);
144 	spin_unlock_bh(&ul->lock);
145 }
146 
147 void rt6_uncached_list_del(struct rt6_info *rt)
148 {
149 	if (!list_empty(&rt->rt6i_uncached)) {
150 		struct uncached_list *ul = rt->rt6i_uncached_list;
151 		struct net *net = dev_net(rt->dst.dev);
152 
153 		spin_lock_bh(&ul->lock);
154 		list_del(&rt->rt6i_uncached);
155 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
156 		spin_unlock_bh(&ul->lock);
157 	}
158 }
159 
160 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
161 {
162 	struct net_device *loopback_dev = net->loopback_dev;
163 	int cpu;
164 
165 	if (dev == loopback_dev)
166 		return;
167 
168 	for_each_possible_cpu(cpu) {
169 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
170 		struct rt6_info *rt;
171 
172 		spin_lock_bh(&ul->lock);
173 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
174 			struct inet6_dev *rt_idev = rt->rt6i_idev;
175 			struct net_device *rt_dev = rt->dst.dev;
176 
177 			if (rt_idev->dev == dev) {
178 				rt->rt6i_idev = in6_dev_get(loopback_dev);
179 				in6_dev_put(rt_idev);
180 			}
181 
182 			if (rt_dev == dev) {
183 				rt->dst.dev = loopback_dev;
184 				dev_hold(rt->dst.dev);
185 				dev_put(rt_dev);
186 			}
187 		}
188 		spin_unlock_bh(&ul->lock);
189 	}
190 }
191 
192 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
193 					     struct sk_buff *skb,
194 					     const void *daddr)
195 {
196 	if (!ipv6_addr_any(p))
197 		return (const void *) p;
198 	else if (skb)
199 		return &ipv6_hdr(skb)->daddr;
200 	return daddr;
201 }
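
/* A hedged illustration (not part of the original file): the fallback order
 * above is (1) the configured gateway if non-zero, (2) the destination from
 * the skb's IPv6 header, (3) the caller-supplied daddr. So a hypothetical
 * caller with no gateway but an skb in hand effectively resolves by the
 * packet's own destination:
 *
 *	n = ip6_neigh_lookup(&in6addr_any, dev, skb, NULL);
 */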
202 
203 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
204 				   struct net_device *dev,
205 				   struct sk_buff *skb,
206 				   const void *daddr)
207 {
208 	struct neighbour *n;
209 
210 	daddr = choose_neigh_daddr(gw, skb, daddr);
211 	n = __ipv6_neigh_lookup(dev, daddr);
212 	if (n)
213 		return n;
214 
215 	n = neigh_create(&nd_tbl, daddr, dev);
216 	return IS_ERR(n) ? NULL : n;
217 }
218 
219 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
220 					      struct sk_buff *skb,
221 					      const void *daddr)
222 {
223 	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
224 
225 	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
226 }
227 
228 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
229 {
230 	struct net_device *dev = dst->dev;
231 	struct rt6_info *rt = (struct rt6_info *)dst;
232 
233 	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
234 	if (!daddr)
235 		return;
236 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
237 		return;
238 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
239 		return;
240 	__ipv6_confirm_neigh(dev, daddr);
241 }
242 
243 static struct dst_ops ip6_dst_ops_template = {
244 	.family			=	AF_INET6,
245 	.gc			=	ip6_dst_gc,
246 	.gc_thresh		=	1024,
247 	.check			=	ip6_dst_check,
248 	.default_advmss		=	ip6_default_advmss,
249 	.mtu			=	ip6_mtu,
250 	.cow_metrics		=	dst_cow_metrics_generic,
251 	.destroy		=	ip6_dst_destroy,
252 	.ifdown			=	ip6_dst_ifdown,
253 	.negative_advice	=	ip6_negative_advice,
254 	.link_failure		=	ip6_link_failure,
255 	.update_pmtu		=	ip6_rt_update_pmtu,
256 	.redirect		=	rt6_do_redirect,
257 	.local_out		=	__ip6_local_out,
258 	.neigh_lookup		=	ip6_dst_neigh_lookup,
259 	.confirm_neigh		=	ip6_confirm_neigh,
260 };
261 
262 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
263 {
264 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
265 
266 	return mtu ? : dst->dev->mtu;
267 }
268 
269 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
270 					 struct sk_buff *skb, u32 mtu)
271 {
272 }
273 
274 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
275 				      struct sk_buff *skb)
276 {
277 }
278 
279 static struct dst_ops ip6_dst_blackhole_ops = {
280 	.family			=	AF_INET6,
281 	.destroy		=	ip6_dst_destroy,
282 	.check			=	ip6_dst_check,
283 	.mtu			=	ip6_blackhole_mtu,
284 	.default_advmss		=	ip6_default_advmss,
285 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
286 	.redirect		=	ip6_rt_blackhole_redirect,
287 	.cow_metrics		=	dst_cow_metrics_generic,
288 	.neigh_lookup		=	ip6_dst_neigh_lookup,
289 };
290 
291 static const u32 ip6_template_metrics[RTAX_MAX] = {
292 	[RTAX_HOPLIMIT - 1] = 0,
293 };
294 
295 static const struct fib6_info fib6_null_entry_template = {
296 	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
297 	.fib6_protocol  = RTPROT_KERNEL,
298 	.fib6_metric	= ~(u32)0,
299 	.fib6_ref	= ATOMIC_INIT(1),
300 	.fib6_type	= RTN_UNREACHABLE,
301 	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
302 };
303 
304 static const struct rt6_info ip6_null_entry_template = {
305 	.dst = {
306 		.__refcnt	= ATOMIC_INIT(1),
307 		.__use		= 1,
308 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
309 		.error		= -ENETUNREACH,
310 		.input		= ip6_pkt_discard,
311 		.output		= ip6_pkt_discard_out,
312 	},
313 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
314 };
315 
316 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
317 
318 static const struct rt6_info ip6_prohibit_entry_template = {
319 	.dst = {
320 		.__refcnt	= ATOMIC_INIT(1),
321 		.__use		= 1,
322 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
323 		.error		= -EACCES,
324 		.input		= ip6_pkt_prohibit,
325 		.output		= ip6_pkt_prohibit_out,
326 	},
327 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
328 };
329 
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331 	.dst = {
332 		.__refcnt	= ATOMIC_INIT(1),
333 		.__use		= 1,
334 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
335 		.error		= -EINVAL,
336 		.input		= dst_discard,
337 		.output		= dst_discard_out,
338 	},
339 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
340 };
341 
342 #endif
343 
344 static void rt6_info_init(struct rt6_info *rt)
345 {
346 	struct dst_entry *dst = &rt->dst;
347 
348 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
349 	INIT_LIST_HEAD(&rt->rt6i_uncached);
350 }
351 
352 /* allocate dst with ip6_dst_ops */
353 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
354 			       int flags)
355 {
356 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357 					1, DST_OBSOLETE_FORCE_CHK, flags);
358 
359 	if (rt) {
360 		rt6_info_init(rt);
361 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
362 	}
363 
364 	return rt;
365 }
366 EXPORT_SYMBOL(ip6_dst_alloc);
367 
368 static void ip6_dst_destroy(struct dst_entry *dst)
369 {
370 	struct rt6_info *rt = (struct rt6_info *)dst;
371 	struct fib6_info *from;
372 	struct inet6_dev *idev;
373 
374 	ip_dst_metrics_put(dst);
375 	rt6_uncached_list_del(rt);
376 
377 	idev = rt->rt6i_idev;
378 	if (idev) {
379 		rt->rt6i_idev = NULL;
380 		in6_dev_put(idev);
381 	}
382 
383 	rcu_read_lock();
384 	from = rcu_dereference(rt->from);
385 	rcu_assign_pointer(rt->from, NULL);
386 	fib6_info_release(from);
387 	rcu_read_unlock();
388 }
389 
390 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
391 			   int how)
392 {
393 	struct rt6_info *rt = (struct rt6_info *)dst;
394 	struct inet6_dev *idev = rt->rt6i_idev;
395 	struct net_device *loopback_dev =
396 		dev_net(dev)->loopback_dev;
397 
398 	if (idev && idev->dev != loopback_dev) {
399 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
400 		if (loopback_idev) {
401 			rt->rt6i_idev = loopback_idev;
402 			in6_dev_put(idev);
403 		}
404 	}
405 }
406 
407 static bool __rt6_check_expired(const struct rt6_info *rt)
408 {
409 	if (rt->rt6i_flags & RTF_EXPIRES)
410 		return time_after(jiffies, rt->dst.expires);
411 	else
412 		return false;
413 }
414 
415 static bool rt6_check_expired(const struct rt6_info *rt)
416 {
417 	struct fib6_info *from;
418 
419 	from = rcu_dereference(rt->from);
420 
421 	if (rt->rt6i_flags & RTF_EXPIRES) {
422 		if (time_after(jiffies, rt->dst.expires))
423 			return true;
424 	} else if (from) {
425 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
426 			fib6_check_expired(from);
427 	}
428 	return false;
429 }
430 
431 struct fib6_info *fib6_multipath_select(const struct net *net,
432 					struct fib6_info *match,
433 					struct flowi6 *fl6, int oif,
434 					const struct sk_buff *skb,
435 					int strict)
436 {
437 	struct fib6_info *sibling, *next_sibling;
438 
439 	/* We might have already computed the hash for ICMPv6 errors. In such
440 	 * a case it will always be non-zero. Otherwise now is the time to do it.
441 	 */
442 	if (!fl6->mp_hash)
443 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
444 
445 	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
446 		return match;
447 
448 	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
449 				 fib6_siblings) {
450 		const struct fib6_nh *nh = &sibling->fib6_nh;
451 		int nh_upper_bound;
452 
453 		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
454 		if (fl6->mp_hash > nh_upper_bound)
455 			continue;
456 		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
457 			break;
458 		match = sibling;
459 		break;
460 	}
461 
462 	return match;
463 }
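
/* Hash-threshold multipath in the function above, sketched with made-up
 * numbers (the real upper bounds are precomputed from the sibling weights):
 * each nexthop owns a slice of the 31-bit hash space, and a flow is matched
 * to the first nexthop whose upper bound it does not exceed, so a given
 * flow sticks to one path:
 *
 *	// two equal-weight siblings (illustrative bounds)
 *	// nh0: fib_nh_upper_bound = 0x3FFFFFFF
 *	// nh1: fib_nh_upper_bound = 0x7FFFFFFF
 *	// mp_hash = 0x12345678 -> <= 0x3FFFFFFF, take nh0
 *	// mp_hash = 0x5678ABCD -> >  0x3FFFFFFF, take nh1
 */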
464 
465 /*
466  *	Route lookup. rcu_read_lock() should be held.
467  */
468 
469 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
470 			       const struct in6_addr *saddr, int oif, int flags)
471 {
472 	const struct net_device *dev;
473 
474 	if (nh->fib_nh_flags & RTNH_F_DEAD)
475 		return false;
476 
477 	dev = nh->fib_nh_dev;
478 	if (oif) {
479 		if (dev->ifindex == oif)
480 			return true;
481 	} else {
482 		if (ipv6_chk_addr(net, saddr, dev,
483 				  flags & RT6_LOOKUP_F_IFACE))
484 			return true;
485 	}
486 
487 	return false;
488 }
489 
490 static inline struct fib6_info *rt6_device_match(struct net *net,
491 						 struct fib6_info *rt,
492 						 const struct in6_addr *saddr,
493 						 int oif,
494 						 int flags)
495 {
496 	const struct fib6_nh *nh;
497 	struct fib6_info *sprt;
498 
499 	if (!oif && ipv6_addr_any(saddr) &&
500 	    !(rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD))
501 		return rt;
502 
503 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
504 		nh = &sprt->fib6_nh;
505 		if (__rt6_device_match(net, nh, saddr, oif, flags))
506 			return sprt;
507 	}
508 
509 	if (oif && flags & RT6_LOOKUP_F_IFACE)
510 		return net->ipv6.fib6_null_entry;
511 
512 	return rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
513 }
514 
515 #ifdef CONFIG_IPV6_ROUTER_PREF
516 struct __rt6_probe_work {
517 	struct work_struct work;
518 	struct in6_addr target;
519 	struct net_device *dev;
520 };
521 
522 static void rt6_probe_deferred(struct work_struct *w)
523 {
524 	struct in6_addr mcaddr;
525 	struct __rt6_probe_work *work =
526 		container_of(w, struct __rt6_probe_work, work);
527 
528 	addrconf_addr_solict_mult(&work->target, &mcaddr);
529 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
530 	dev_put(work->dev);
531 	kfree(work);
532 }
533 
534 static void rt6_probe(struct fib6_nh *fib6_nh)
535 {
536 	struct __rt6_probe_work *work = NULL;
537 	const struct in6_addr *nh_gw;
538 	struct neighbour *neigh;
539 	struct net_device *dev;
540 	struct inet6_dev *idev;
541 
542 	/*
543 	 * This does not seem appropriate for now;
544 	 * however, we need to check whether it really
545 	 * is, i.e. Router Reachability Probing.
546 	 *
547 	 * A Router Reachability Probe MUST be rate-limited
548 	 * to no more than one per minute.
549 	 */
550 	if (fib6_nh->fib_nh_gw_family)
551 		return;
552 
553 	nh_gw = &fib6_nh->fib_nh_gw6;
554 	dev = fib6_nh->fib_nh_dev;
555 	rcu_read_lock_bh();
556 	idev = __in6_dev_get(dev);
557 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
558 	if (neigh) {
559 		if (neigh->nud_state & NUD_VALID)
560 			goto out;
561 
562 		write_lock(&neigh->lock);
563 		if (!(neigh->nud_state & NUD_VALID) &&
564 		    time_after(jiffies,
565 			       neigh->updated + idev->cnf.rtr_probe_interval)) {
566 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
567 			if (work)
568 				__neigh_set_probe_once(neigh);
569 		}
570 		write_unlock(&neigh->lock);
571 	} else if (time_after(jiffies, fib6_nh->last_probe +
572 				       idev->cnf.rtr_probe_interval)) {
573 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
574 	}
575 
576 	if (work) {
577 		fib6_nh->last_probe = jiffies;
578 		INIT_WORK(&work->work, rt6_probe_deferred);
579 		work->target = *nh_gw;
580 		dev_hold(dev);
581 		work->dev = dev;
582 		schedule_work(&work->work);
583 	}
584 
585 out:
586 	rcu_read_unlock_bh();
587 }
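
/* The deferral above follows the usual "atomic context hands work to
 * process context" pattern: rt6_probe() runs under rcu_read_lock_bh()
 * (and, in one branch, neigh->lock), where sending a neighbour
 * solicitation directly would be unsafe, so it only allocates with
 * GFP_ATOMIC and schedules; rt6_probe_deferred() later sends the NS and
 * drops the device reference. A minimal sketch of the same pattern:
 *
 *	work = kmalloc(sizeof(*work), GFP_ATOMIC);	// atomic context
 *	if (work) {
 *		INIT_WORK(&work->work, handler);	// runs from a kworker
 *		schedule_work(&work->work);
 *	}
 */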
588 #else
589 static inline void rt6_probe(struct fib6_nh *fib6_nh)
590 {
591 }
592 #endif
593 
594 /*
595  * Default Router Selection (RFC 4861, section 6.3.6; originally RFC 2461)
596  */
597 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
598 {
599 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
600 	struct neighbour *neigh;
601 
602 	rcu_read_lock_bh();
603 	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
604 					  &fib6_nh->fib_nh_gw6);
605 	if (neigh) {
606 		read_lock(&neigh->lock);
607 		if (neigh->nud_state & NUD_VALID)
608 			ret = RT6_NUD_SUCCEED;
609 #ifdef CONFIG_IPV6_ROUTER_PREF
610 		else if (!(neigh->nud_state & NUD_FAILED))
611 			ret = RT6_NUD_SUCCEED;
612 		else
613 			ret = RT6_NUD_FAIL_PROBE;
614 #endif
615 		read_unlock(&neigh->lock);
616 	} else {
617 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
618 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
619 	}
620 	rcu_read_unlock_bh();
621 
622 	return ret;
623 }
624 
625 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
626 			   int strict)
627 {
628 	int m = 0;
629 
630 	if (!oif || nh->fib_nh_dev->ifindex == oif)
631 		m = 2;
632 
633 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
634 		return RT6_NUD_FAIL_HARD;
635 #ifdef CONFIG_IPV6_ROUTER_PREF
636 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
637 #endif
638 	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
639 	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
640 		int n = rt6_check_neigh(nh);
641 		if (n < 0)
642 			return n;
643 	}
644 	return m;
645 }
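
/* A rough reading of the score (a sketch, not a spec): an output-interface
 * match contributes 2, the RFC 4191 router preference is decoded into the
 * bits above that, and the negative RT6_NUD_* values report failure modes
 * rather than scores. E.g. for two routes that both match oif, only the
 * advertised preference differs:
 *
 *	m = 2 | (IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(flags)) << 2);
 */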
646 
647 static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
648 		       int oif, int strict, int *mpri, bool *do_rr)
649 {
650 	bool match_do_rr = false;
651 	bool rc = false;
652 	int m;
653 
654 	if (nh->fib_nh_flags & RTNH_F_DEAD)
655 		goto out;
656 
657 	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
658 	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
659 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
660 		goto out;
661 
662 	m = rt6_score_route(nh, fib6_flags, oif, strict);
663 	if (m == RT6_NUD_FAIL_DO_RR) {
664 		match_do_rr = true;
665 		m = 0; /* lowest valid score */
666 	} else if (m == RT6_NUD_FAIL_HARD) {
667 		goto out;
668 	}
669 
670 	if (strict & RT6_LOOKUP_F_REACHABLE)
671 		rt6_probe(nh);
672 
673 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
674 	if (m > *mpri) {
675 		*do_rr = match_do_rr;
676 		*mpri = m;
677 		rc = true;
678 	}
679 out:
680 	return rc;
681 }
682 
683 static void __find_rr_leaf(struct fib6_info *rt_start,
684 			   struct fib6_info *nomatch, u32 metric,
685 			   struct fib6_info **match, struct fib6_info **cont,
686 			   int oif, int strict, bool *do_rr, int *mpri)
687 {
688 	struct fib6_info *rt;
689 
690 	for (rt = rt_start;
691 	     rt && rt != nomatch;
692 	     rt = rcu_dereference(rt->fib6_next)) {
693 		struct fib6_nh *nh;
694 
695 		if (cont && rt->fib6_metric != metric) {
696 			*cont = rt;
697 			return;
698 		}
699 
700 		if (fib6_check_expired(rt))
701 			continue;
702 
703 		nh = &rt->fib6_nh;
704 		if (find_match(nh, rt->fib6_flags, oif, strict, mpri, do_rr))
705 			*match = rt;
706 	}
707 }
708 
709 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
710 				      struct fib6_info *leaf,
711 				      struct fib6_info *rr_head,
712 				      u32 metric, int oif, int strict,
713 				      bool *do_rr)
714 {
715 	struct fib6_info *match = NULL, *cont = NULL;
716 	int mpri = -1;
717 
718 	__find_rr_leaf(rr_head, NULL, metric, &match, &cont,
719 		       oif, strict, do_rr, &mpri);
720 
721 	__find_rr_leaf(leaf, rr_head, metric, &match, &cont,
722 		       oif, strict, do_rr, &mpri);
723 
724 	if (match || !cont)
725 		return match;
726 
727 	__find_rr_leaf(cont, NULL, metric, &match, NULL,
728 		       oif, strict, do_rr, &mpri);
729 
730 	return match;
731 }
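
/* The three __find_rr_leaf() calls above implement a wrap-around scan of
 * one metric's routes starting at fn->rr_ptr: first from rr_head to the
 * end of the list, then from the leaf head up to (but not including)
 * rr_head, and only if nothing matched, the continuation routes with a
 * different metric collected in 'cont'.
 */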
732 
733 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
734 				   int oif, int strict)
735 {
736 	struct fib6_info *leaf = rcu_dereference(fn->leaf);
737 	struct fib6_info *match, *rt0;
738 	bool do_rr = false;
739 	int key_plen;
740 
741 	if (!leaf || leaf == net->ipv6.fib6_null_entry)
742 		return net->ipv6.fib6_null_entry;
743 
744 	rt0 = rcu_dereference(fn->rr_ptr);
745 	if (!rt0)
746 		rt0 = leaf;
747 
748 	/* Double check to make sure fn is not an intermediate node
749 	 * and fn->leaf does not point to its child's leaf
750 	 * (This might happen if all routes under fn are deleted from
751 	 * the tree and fib6_repair_tree() is called on the node.)
752 	 */
753 	key_plen = rt0->fib6_dst.plen;
754 #ifdef CONFIG_IPV6_SUBTREES
755 	if (rt0->fib6_src.plen)
756 		key_plen = rt0->fib6_src.plen;
757 #endif
758 	if (fn->fn_bit != key_plen)
759 		return net->ipv6.fib6_null_entry;
760 
761 	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
762 			     &do_rr);
763 
764 	if (do_rr) {
765 		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
766 
767 		/* no entries matched; do round-robin */
768 		if (!next || next->fib6_metric != rt0->fib6_metric)
769 			next = leaf;
770 
771 		if (next != rt0) {
772 			spin_lock_bh(&leaf->fib6_table->tb6_lock);
773 			/* make sure next is not being deleted from the tree */
774 			if (next->fib6_node)
775 				rcu_assign_pointer(fn->rr_ptr, next);
776 			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
777 		}
778 	}
779 
780 	return match ? match : net->ipv6.fib6_null_entry;
781 }
782 
783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
784 {
785 	return (rt->fib6_flags & RTF_NONEXTHOP) || rt->fib6_nh.fib_nh_gw_family;
786 }
787 
788 #ifdef CONFIG_IPV6_ROUTE_INFO
789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
790 		  const struct in6_addr *gwaddr)
791 {
792 	struct net *net = dev_net(dev);
793 	struct route_info *rinfo = (struct route_info *) opt;
794 	struct in6_addr prefix_buf, *prefix;
795 	unsigned int pref;
796 	unsigned long lifetime;
797 	struct fib6_info *rt;
798 
799 	if (len < sizeof(struct route_info)) {
800 		return -EINVAL;
801 	}
802 
803 	/* Sanity check for prefix_len and length */
804 	if (rinfo->length > 3) {
805 		return -EINVAL;
806 	} else if (rinfo->prefix_len > 128) {
807 		return -EINVAL;
808 	} else if (rinfo->prefix_len > 64) {
809 		if (rinfo->length < 2) {
810 			return -EINVAL;
811 		}
812 	} else if (rinfo->prefix_len > 0) {
813 		if (rinfo->length < 1) {
814 			return -EINVAL;
815 		}
816 	}
817 
818 	pref = rinfo->route_pref;
819 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
820 		return -EINVAL;
821 
822 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
823 
824 	if (rinfo->length == 3)
825 		prefix = (struct in6_addr *)rinfo->prefix;
826 	else {
827 		/* this call is safe: prefix_len was validated above */
828 		ipv6_addr_prefix(&prefix_buf,
829 				 (struct in6_addr *)rinfo->prefix,
830 				 rinfo->prefix_len);
831 		prefix = &prefix_buf;
832 	}
833 
834 	if (rinfo->prefix_len == 0)
835 		rt = rt6_get_dflt_router(net, gwaddr, dev);
836 	else
837 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
838 					gwaddr, dev);
839 
840 	if (rt && !lifetime) {
841 		ip6_del_rt(net, rt);
842 		rt = NULL;
843 	}
844 
845 	if (!rt && lifetime)
846 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
847 					dev, pref);
848 	else if (rt)
849 		rt->fib6_flags = RTF_ROUTEINFO |
850 				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
851 
852 	if (rt) {
853 		if (!addrconf_finite_timeout(lifetime))
854 			fib6_clean_expires(rt);
855 		else
856 			fib6_set_expires(rt, jiffies + HZ * lifetime);
857 
858 		fib6_info_release(rt);
859 	}
860 	return 0;
861 }
862 #endif
863 
864 /*
865  *	Misc support functions
866  */
867 
868 /* called with rcu_read_lock() held */
869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
870 {
871 	struct net_device *dev = rt->fib6_nh.fib_nh_dev;
872 
873 	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
874 		/* for copies of local routes, dst->dev needs to be the
875 		 * device itself if it is a master device, the master device
876 		 * if the device is enslaved, and the loopback device by default
877 		 */
878 		if (netif_is_l3_slave(dev) &&
879 		    !rt6_need_strict(&rt->fib6_dst.addr))
880 			dev = l3mdev_master_dev_rcu(dev);
881 		else if (!netif_is_l3_master(dev))
882 			dev = dev_net(dev)->loopback_dev;
883 		/* the last case is netif_is_l3_master(dev) being true, in
884 		 * which case dev is returned unchanged
885 		 */
886 	}
887 
888 	return dev;
889 }
890 
891 static const int fib6_prop[RTN_MAX + 1] = {
892 	[RTN_UNSPEC]	= 0,
893 	[RTN_UNICAST]	= 0,
894 	[RTN_LOCAL]	= 0,
895 	[RTN_BROADCAST]	= 0,
896 	[RTN_ANYCAST]	= 0,
897 	[RTN_MULTICAST]	= 0,
898 	[RTN_BLACKHOLE]	= -EINVAL,
899 	[RTN_UNREACHABLE] = -EHOSTUNREACH,
900 	[RTN_PROHIBIT]	= -EACCES,
901 	[RTN_THROW]	= -EAGAIN,
902 	[RTN_NAT]	= -EINVAL,
903 	[RTN_XRESOLVE]	= -EINVAL,
904 };
905 
906 static int ip6_rt_type_to_error(u8 fib6_type)
907 {
908 	return fib6_prop[fib6_type];
909 }
910 
911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
912 {
913 	unsigned short flags = 0;
914 
915 	if (rt->dst_nocount)
916 		flags |= DST_NOCOUNT;
917 	if (rt->dst_nopolicy)
918 		flags |= DST_NOPOLICY;
919 	if (rt->dst_host)
920 		flags |= DST_HOST;
921 
922 	return flags;
923 }
924 
925 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
926 {
927 	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
928 
929 	switch (ort->fib6_type) {
930 	case RTN_BLACKHOLE:
931 		rt->dst.output = dst_discard_out;
932 		rt->dst.input = dst_discard;
933 		break;
934 	case RTN_PROHIBIT:
935 		rt->dst.output = ip6_pkt_prohibit_out;
936 		rt->dst.input = ip6_pkt_prohibit;
937 		break;
938 	case RTN_THROW:
939 	case RTN_UNREACHABLE:
940 	default:
941 		rt->dst.output = ip6_pkt_discard_out;
942 		rt->dst.input = ip6_pkt_discard;
943 		break;
944 	}
945 }
946 
947 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
948 {
949 	if (ort->fib6_flags & RTF_REJECT) {
950 		ip6_rt_init_dst_reject(rt, ort);
951 		return;
952 	}
953 
954 	rt->dst.error = 0;
955 	rt->dst.output = ip6_output;
956 
957 	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
958 		rt->dst.input = ip6_input;
959 	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
960 		rt->dst.input = ip6_mc_input;
961 	} else {
962 		rt->dst.input = ip6_forward;
963 	}
964 
965 	if (ort->fib6_nh.fib_nh_lws) {
966 		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.fib_nh_lws);
967 		lwtunnel_set_redirect(&rt->dst);
968 	}
969 
970 	rt->dst.lastuse = jiffies;
971 }
972 
973 /* Caller must already hold reference to @from */
974 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
975 {
976 	rt->rt6i_flags &= ~RTF_EXPIRES;
977 	rcu_assign_pointer(rt->from, from);
978 	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
979 }
980 
981 /* Caller must already hold reference to @ort */
982 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
983 {
984 	struct net_device *dev = fib6_info_nh_dev(ort);
985 
986 	ip6_rt_init_dst(rt, ort);
987 
988 	rt->rt6i_dst = ort->fib6_dst;
989 	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
990 	rt->rt6i_flags = ort->fib6_flags;
991 	if (ort->fib6_nh.fib_nh_gw_family) {
992 		rt->rt6i_gateway = ort->fib6_nh.fib_nh_gw6;
993 		rt->rt6i_flags |= RTF_GATEWAY;
994 	}
995 	rt6_set_from(rt, ort);
996 #ifdef CONFIG_IPV6_SUBTREES
997 	rt->rt6i_src = ort->fib6_src;
998 #endif
999 }
1000 
1001 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1002 					struct in6_addr *saddr)
1003 {
1004 	struct fib6_node *pn, *sn;
1005 	while (1) {
1006 		if (fn->fn_flags & RTN_TL_ROOT)
1007 			return NULL;
1008 		pn = rcu_dereference(fn->parent);
1009 		sn = FIB6_SUBTREE(pn);
1010 		if (sn && sn != fn)
1011 			fn = fib6_node_lookup(sn, NULL, saddr);
1012 		else
1013 			fn = pn;
1014 		if (fn->fn_flags & RTN_RTINFO)
1015 			return fn;
1016 	}
1017 }
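
/* Backtracking sketch: on a lookup miss the walk climbs toward the root;
 * at each parent that carries a source-routing subtree
 * (CONFIG_IPV6_SUBTREES) the lookup is redone inside that subtree keyed
 * by saddr, otherwise the walk continues upward. It stops at the first
 * node holding routes (RTN_RTINFO), or returns NULL at the tree root.
 */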
1018 
1019 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1020 {
1021 	struct rt6_info *rt = *prt;
1022 
1023 	if (dst_hold_safe(&rt->dst))
1024 		return true;
1025 	if (net) {
1026 		rt = net->ipv6.ip6_null_entry;
1027 		dst_hold(&rt->dst);
1028 	} else {
1029 		rt = NULL;
1030 	}
1031 	*prt = rt;
1032 	return false;
1033 }
1034 
1035 /* called with rcu_read_lock() held */
1036 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1037 {
1038 	unsigned short flags = fib6_info_dst_flags(rt);
1039 	struct net_device *dev = rt->fib6_nh.fib_nh_dev;
1040 	struct rt6_info *nrt;
1041 
1042 	if (!fib6_info_hold_safe(rt))
1043 		goto fallback;
1044 
1045 	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1046 	if (!nrt) {
1047 		fib6_info_release(rt);
1048 		goto fallback;
1049 	}
1050 
1051 	ip6_rt_copy_init(nrt, rt);
1052 	return nrt;
1053 
1054 fallback:
1055 	nrt = dev_net(dev)->ipv6.ip6_null_entry;
1056 	dst_hold(&nrt->dst);
1057 	return nrt;
1058 }
1059 
1060 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1061 					     struct fib6_table *table,
1062 					     struct flowi6 *fl6,
1063 					     const struct sk_buff *skb,
1064 					     int flags)
1065 {
1066 	struct fib6_info *f6i;
1067 	struct fib6_node *fn;
1068 	struct rt6_info *rt;
1069 
1070 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1071 		flags &= ~RT6_LOOKUP_F_IFACE;
1072 
1073 	rcu_read_lock();
1074 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1075 restart:
1076 	f6i = rcu_dereference(fn->leaf);
1077 	if (!f6i)
1078 		f6i = net->ipv6.fib6_null_entry;
1079 	else
1080 		f6i = rt6_device_match(net, f6i, &fl6->saddr,
1081 				      fl6->flowi6_oif, flags);
1082 
1083 	if (f6i == net->ipv6.fib6_null_entry) {
1084 		fn = fib6_backtrack(fn, &fl6->saddr);
1085 		if (fn)
1086 			goto restart;
1087 
1088 		rt = net->ipv6.ip6_null_entry;
1089 		dst_hold(&rt->dst);
1090 		goto out;
1091 	}
1092 
1093 	if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1094 		f6i = fib6_multipath_select(net, f6i, fl6, fl6->flowi6_oif, skb,
1095 					    flags);
1096 	/* Search through exception table */
1097 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1098 	if (rt) {
1099 		if (ip6_hold_safe(net, &rt))
1100 			dst_use_noref(&rt->dst, jiffies);
1101 	} else {
1102 		rt = ip6_create_rt_rcu(f6i);
1103 	}
1104 
1105 out:
1106 	trace_fib6_table_lookup(net, f6i, table, fl6);
1107 
1108 	rcu_read_unlock();
1109 
1110 	return rt;
1111 }
1112 
1113 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1114 				   const struct sk_buff *skb, int flags)
1115 {
1116 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1117 }
1118 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1119 
1120 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1121 			    const struct in6_addr *saddr, int oif,
1122 			    const struct sk_buff *skb, int strict)
1123 {
1124 	struct flowi6 fl6 = {
1125 		.flowi6_oif = oif,
1126 		.daddr = *daddr,
1127 	};
1128 	struct dst_entry *dst;
1129 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1130 
1131 	if (saddr) {
1132 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1133 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1134 	}
1135 
1136 	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1137 	if (dst->error == 0)
1138 		return (struct rt6_info *) dst;
1139 
1140 	dst_release(dst);
1141 
1142 	return NULL;
1143 }
1144 EXPORT_SYMBOL(rt6_lookup);
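
/* Usage sketch for rt6_lookup() (hypothetical caller, shown for clarity):
 * a successful lookup returns a referenced entry that must be released.
 *
 *	struct rt6_info *rt = rt6_lookup(net, &daddr, NULL, 0, NULL, 0);
 *	if (rt) {
 *		// inspect rt->dst / rt->rt6i_flags ...
 *		ip6_rt_put(rt);
 *	}
 */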
1145 
1146 /* ip6_ins_rt is called with table->tb6_lock NOT held (it takes the
1147  * lock itself). It takes a new route entry; if the addition fails for
1148  * any reason, the route is released.
1149  * Caller must hold a reference to the route before calling it.
1150  */
1151 
1152 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1153 			struct netlink_ext_ack *extack)
1154 {
1155 	int err;
1156 	struct fib6_table *table;
1157 
1158 	table = rt->fib6_table;
1159 	spin_lock_bh(&table->tb6_lock);
1160 	err = fib6_add(&table->tb6_root, rt, info, extack);
1161 	spin_unlock_bh(&table->tb6_lock);
1162 
1163 	return err;
1164 }
1165 
1166 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1167 {
1168 	struct nl_info info = {	.nl_net = net, };
1169 
1170 	return __ip6_ins_rt(rt, &info, NULL);
1171 }
1172 
1173 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1174 					   const struct in6_addr *daddr,
1175 					   const struct in6_addr *saddr)
1176 {
1177 	struct net_device *dev;
1178 	struct rt6_info *rt;
1179 
1180 	/*
1181 	 *	Clone the route.
1182 	 */
1183 
1184 	if (!fib6_info_hold_safe(ort))
1185 		return NULL;
1186 
1187 	dev = ip6_rt_get_dev_rcu(ort);
1188 	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1189 	if (!rt) {
1190 		fib6_info_release(ort);
1191 		return NULL;
1192 	}
1193 
1194 	ip6_rt_copy_init(rt, ort);
1195 	rt->rt6i_flags |= RTF_CACHE;
1196 	rt->dst.flags |= DST_HOST;
1197 	rt->rt6i_dst.addr = *daddr;
1198 	rt->rt6i_dst.plen = 128;
1199 
1200 	if (!rt6_is_gw_or_nonexthop(ort)) {
1201 		if (ort->fib6_dst.plen != 128 &&
1202 		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1203 			rt->rt6i_flags |= RTF_ANYCAST;
1204 #ifdef CONFIG_IPV6_SUBTREES
1205 		if (rt->rt6i_src.plen && saddr) {
1206 			rt->rt6i_src.addr = *saddr;
1207 			rt->rt6i_src.plen = 128;
1208 		}
1209 #endif
1210 	}
1211 
1212 	return rt;
1213 }
1214 
1215 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1216 {
1217 	unsigned short flags = fib6_info_dst_flags(rt);
1218 	struct net_device *dev;
1219 	struct rt6_info *pcpu_rt;
1220 
1221 	if (!fib6_info_hold_safe(rt))
1222 		return NULL;
1223 
1224 	rcu_read_lock();
1225 	dev = ip6_rt_get_dev_rcu(rt);
1226 	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1227 	rcu_read_unlock();
1228 	if (!pcpu_rt) {
1229 		fib6_info_release(rt);
1230 		return NULL;
1231 	}
1232 	ip6_rt_copy_init(pcpu_rt, rt);
1233 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1234 	return pcpu_rt;
1235 }
1236 
1237 /* It should be called with rcu_read_lock() acquired */
1238 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1239 {
1240 	struct rt6_info *pcpu_rt, **p;
1241 
1242 	p = this_cpu_ptr(rt->rt6i_pcpu);
1243 	pcpu_rt = *p;
1244 
1245 	if (pcpu_rt)
1246 		ip6_hold_safe(NULL, &pcpu_rt);
1247 
1248 	return pcpu_rt;
1249 }
1250 
1251 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1252 					    struct fib6_info *rt)
1253 {
1254 	struct rt6_info *pcpu_rt, *prev, **p;
1255 
1256 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1257 	if (!pcpu_rt) {
1258 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1259 		return net->ipv6.ip6_null_entry;
1260 	}
1261 
1262 	dst_hold(&pcpu_rt->dst);
1263 	p = this_cpu_ptr(rt->rt6i_pcpu);
1264 	prev = cmpxchg(p, NULL, pcpu_rt);
1265 	BUG_ON(prev);
1266 
1267 	return pcpu_rt;
1268 }
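
/* Note on the cmpxchg() above (an explanatory reading): the per-cpu slot
 * is published lock-free, installing pcpu_rt only if the slot is still
 * NULL. Since the caller disables BHs around the get/make pair, nothing
 * else can run on this CPU and fill the slot in between, which is what
 * BUG_ON(prev) asserts.
 */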
1269 
1270 /* exception hash table implementation
1271  */
1272 static DEFINE_SPINLOCK(rt6_exception_lock);
1273 
1274 /* Remove rt6_ex from hash table and free the memory
1275  * Caller must hold rt6_exception_lock
1276  */
1277 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1278 				 struct rt6_exception *rt6_ex)
1279 {
1280 	struct fib6_info *from;
1281 	struct net *net;
1282 
1283 	if (!bucket || !rt6_ex)
1284 		return;
1285 
1286 	net = dev_net(rt6_ex->rt6i->dst.dev);
1287 	net->ipv6.rt6_stats->fib_rt_cache--;
1288 
1289 	/* completely purge the exception to allow releasing the held resources:
1290 	 * some [sk] cache may keep the dst around for an unlimited time
1291 	 */
1292 	from = rcu_dereference_protected(rt6_ex->rt6i->from,
1293 					 lockdep_is_held(&rt6_exception_lock));
1294 	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
1295 	fib6_info_release(from);
1296 	dst_dev_put(&rt6_ex->rt6i->dst);
1297 
1298 	hlist_del_rcu(&rt6_ex->hlist);
1299 	dst_release(&rt6_ex->rt6i->dst);
1300 	kfree_rcu(rt6_ex, rcu);
1301 	WARN_ON_ONCE(!bucket->depth);
1302 	bucket->depth--;
1303 }
1304 
1305 /* Remove oldest rt6_ex in bucket and free the memory
1306  * Caller must hold rt6_exception_lock
1307  */
1308 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1309 {
1310 	struct rt6_exception *rt6_ex, *oldest = NULL;
1311 
1312 	if (!bucket)
1313 		return;
1314 
1315 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1316 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1317 			oldest = rt6_ex;
1318 	}
1319 	rt6_remove_exception(bucket, oldest);
1320 }
1321 
1322 static u32 rt6_exception_hash(const struct in6_addr *dst,
1323 			      const struct in6_addr *src)
1324 {
1325 	static u32 seed __read_mostly;
1326 	u32 val;
1327 
1328 	net_get_random_once(&seed, sizeof(seed));
1329 	val = jhash(dst, sizeof(*dst), seed);
1330 
1331 #ifdef CONFIG_IPV6_SUBTREES
1332 	if (src)
1333 		val = jhash(src, sizeof(*src), val);
1334 #endif
1335 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1336 }
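
/* The hash above is the common jhash-then-fold pattern: a boot-time random
 * seed defeats off-path bucket prediction, and hash_32() reduces the 32-bit
 * value to a bucket index. Callers add the result to the base of the
 * kcalloc'ed bucket array, e.g. (sketch):
 *
 *	bucket += rt6_exception_hash(daddr, saddr);
 *	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { ... }
 */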
1337 
1338 /* Helper function to find the cached rt in the hash table
1339  * and update bucket pointer to point to the bucket for this
1340  * (daddr, saddr) pair
1341  * Caller must hold rt6_exception_lock
1342  */
1343 static struct rt6_exception *
1344 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1345 			      const struct in6_addr *daddr,
1346 			      const struct in6_addr *saddr)
1347 {
1348 	struct rt6_exception *rt6_ex;
1349 	u32 hval;
1350 
1351 	if (!(*bucket) || !daddr)
1352 		return NULL;
1353 
1354 	hval = rt6_exception_hash(daddr, saddr);
1355 	*bucket += hval;
1356 
1357 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1358 		struct rt6_info *rt6 = rt6_ex->rt6i;
1359 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1360 
1361 #ifdef CONFIG_IPV6_SUBTREES
1362 		if (matched && saddr)
1363 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1364 #endif
1365 		if (matched)
1366 			return rt6_ex;
1367 	}
1368 	return NULL;
1369 }
1370 
1371 /* Helper function to find the cached rt in the hash table
1372  * and update bucket pointer to point to the bucket for this
1373  * (daddr, saddr) pair
1374  * Caller must hold rcu_read_lock()
1375  */
1376 static struct rt6_exception *
1377 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1378 			 const struct in6_addr *daddr,
1379 			 const struct in6_addr *saddr)
1380 {
1381 	struct rt6_exception *rt6_ex;
1382 	u32 hval;
1383 
1384 	WARN_ON_ONCE(!rcu_read_lock_held());
1385 
1386 	if (!(*bucket) || !daddr)
1387 		return NULL;
1388 
1389 	hval = rt6_exception_hash(daddr, saddr);
1390 	*bucket += hval;
1391 
1392 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1393 		struct rt6_info *rt6 = rt6_ex->rt6i;
1394 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1395 
1396 #ifdef CONFIG_IPV6_SUBTREES
1397 		if (matched && saddr)
1398 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1399 #endif
1400 		if (matched)
1401 			return rt6_ex;
1402 	}
1403 	return NULL;
1404 }
1405 
1406 static unsigned int fib6_mtu(const struct fib6_info *rt)
1407 {
1408 	unsigned int mtu;
1409 
1410 	if (rt->fib6_pmtu) {
1411 		mtu = rt->fib6_pmtu;
1412 	} else {
1413 		struct net_device *dev = fib6_info_nh_dev(rt);
1414 		struct inet6_dev *idev;
1415 
1416 		rcu_read_lock();
1417 		idev = __in6_dev_get(dev);
1418 		mtu = idev->cnf.mtu6;
1419 		rcu_read_unlock();
1420 	}
1421 
1422 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1423 
1424 	return mtu - lwtunnel_headroom(rt->fib6_nh.fib_nh_lws, mtu);
1425 }
1426 
1427 static int rt6_insert_exception(struct rt6_info *nrt,
1428 				struct fib6_info *ort)
1429 {
1430 	struct net *net = dev_net(nrt->dst.dev);
1431 	struct rt6_exception_bucket *bucket;
1432 	struct in6_addr *src_key = NULL;
1433 	struct rt6_exception *rt6_ex;
1434 	int err = 0;
1435 
1436 	spin_lock_bh(&rt6_exception_lock);
1437 
1438 	if (ort->exception_bucket_flushed) {
1439 		err = -EINVAL;
1440 		goto out;
1441 	}
1442 
1443 	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1444 					lockdep_is_held(&rt6_exception_lock));
1445 	if (!bucket) {
1446 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1447 				 GFP_ATOMIC);
1448 		if (!bucket) {
1449 			err = -ENOMEM;
1450 			goto out;
1451 		}
1452 		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1453 	}
1454 
1455 #ifdef CONFIG_IPV6_SUBTREES
1456 	/* rt6i_src.plen != 0 indicates ort is in subtree
1457 	 * and exception table is indexed by a hash of
1458 	 * both rt6i_dst and rt6i_src.
1459 	 * Otherwise, the exception table is indexed by
1460 	 * a hash of only rt6i_dst.
1461 	 */
1462 	if (ort->fib6_src.plen)
1463 		src_key = &nrt->rt6i_src.addr;
1464 #endif
1465 	/* rt6_mtu_change() might lower mtu on ort.
1466 	 * Only insert this exception route if its mtu
1467 	 * is less than ort's mtu value.
1468 	 */
1469 	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1470 		err = -EINVAL;
1471 		goto out;
1472 	}
1473 
1474 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1475 					       src_key);
1476 	if (rt6_ex)
1477 		rt6_remove_exception(bucket, rt6_ex);
1478 
1479 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1480 	if (!rt6_ex) {
1481 		err = -ENOMEM;
1482 		goto out;
1483 	}
1484 	rt6_ex->rt6i = nrt;
1485 	rt6_ex->stamp = jiffies;
1486 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1487 	bucket->depth++;
1488 	net->ipv6.rt6_stats->fib_rt_cache++;
1489 
1490 	if (bucket->depth > FIB6_MAX_DEPTH)
1491 		rt6_exception_remove_oldest(bucket);
1492 
1493 out:
1494 	spin_unlock_bh(&rt6_exception_lock);
1495 
1496 	/* Update fn->fn_sernum to invalidate all cached dst */
1497 	if (!err) {
1498 		spin_lock_bh(&ort->fib6_table->tb6_lock);
1499 		fib6_update_sernum(net, ort);
1500 		spin_unlock_bh(&ort->fib6_table->tb6_lock);
1501 		fib6_force_start_gc(net);
1502 	}
1503 
1504 	return err;
1505 }
1506 
1507 void rt6_flush_exceptions(struct fib6_info *rt)
1508 {
1509 	struct rt6_exception_bucket *bucket;
1510 	struct rt6_exception *rt6_ex;
1511 	struct hlist_node *tmp;
1512 	int i;
1513 
1514 	spin_lock_bh(&rt6_exception_lock);
1515 	/* Prevent rt6_insert_exception() to recreate the bucket list */
1516 	/* Prevent rt6_insert_exception() from recreating the bucket list */
1517 
1518 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1519 				    lockdep_is_held(&rt6_exception_lock));
1520 	if (!bucket)
1521 		goto out;
1522 
1523 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1524 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1525 			rt6_remove_exception(bucket, rt6_ex);
1526 		WARN_ON_ONCE(bucket->depth);
1527 		bucket++;
1528 	}
1529 
1530 out:
1531 	spin_unlock_bh(&rt6_exception_lock);
1532 }
1533 
1534 /* Find the cached rt in the hash table inside the passed-in rt
1535  * Caller has to hold rcu_read_lock()
1536  */
1537 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1538 					   struct in6_addr *daddr,
1539 					   struct in6_addr *saddr)
1540 {
1541 	struct rt6_exception_bucket *bucket;
1542 	struct in6_addr *src_key = NULL;
1543 	struct rt6_exception *rt6_ex;
1544 	struct rt6_info *res = NULL;
1545 
1546 	bucket = rcu_dereference(rt->rt6i_exception_bucket);
1547 
1548 #ifdef CONFIG_IPV6_SUBTREES
1549 	/* rt6i_src.plen != 0 indicates rt is in subtree
1550 	 * and exception table is indexed by a hash of
1551 	 * both rt6i_dst and rt6i_src.
1552 	 * Otherwise, the exception table is indexed by
1553 	 * a hash of only rt6i_dst.
1554 	 */
1555 	if (rt->fib6_src.plen)
1556 		src_key = saddr;
1557 #endif
1558 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1559 
1560 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1561 		res = rt6_ex->rt6i;
1562 
1563 	return res;
1564 }
1565 
1566 /* Remove the passed in cached rt from the hash table that contains it */
1567 static int rt6_remove_exception_rt(struct rt6_info *rt)
1568 {
1569 	struct rt6_exception_bucket *bucket;
1570 	struct in6_addr *src_key = NULL;
1571 	struct rt6_exception *rt6_ex;
1572 	struct fib6_info *from;
1573 	int err;
1574 
1575 	from = rcu_dereference(rt->from);
1576 	if (!from ||
1577 	    !(rt->rt6i_flags & RTF_CACHE))
1578 		return -EINVAL;
1579 
1580 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1581 		return -ENOENT;
1582 
1583 	spin_lock_bh(&rt6_exception_lock);
1584 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1585 				    lockdep_is_held(&rt6_exception_lock));
1586 #ifdef CONFIG_IPV6_SUBTREES
1587 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1588 	 * and exception table is indexed by a hash of
1589 	 * both rt6i_dst and rt6i_src.
1590 	 * Otherwise, the exception table is indexed by
1591 	 * a hash of only rt6i_dst.
1592 	 */
1593 	if (from->fib6_src.plen)
1594 		src_key = &rt->rt6i_src.addr;
1595 #endif
1596 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1597 					       &rt->rt6i_dst.addr,
1598 					       src_key);
1599 	if (rt6_ex) {
1600 		rt6_remove_exception(bucket, rt6_ex);
1601 		err = 0;
1602 	} else {
1603 		err = -ENOENT;
1604 	}
1605 
1606 	spin_unlock_bh(&rt6_exception_lock);
1607 	return err;
1608 }
1609 
1610 /* Find rt6_ex which contains the passed in rt cache and
1611  * refresh its stamp
1612  */
1613 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1614 {
1615 	struct rt6_exception_bucket *bucket;
1616 	struct in6_addr *src_key = NULL;
1617 	struct rt6_exception *rt6_ex;
1618 	struct fib6_info *from;
1619 
1620 	rcu_read_lock();
1621 	from = rcu_dereference(rt->from);
1622 	if (!from || !(rt->rt6i_flags & RTF_CACHE))
1623 		goto unlock;
1624 
1625 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1626 
1627 #ifdef CONFIG_IPV6_SUBTREES
1628 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1629 	 * and exception table is indexed by a hash of
1630 	 * both rt6i_dst and rt6i_src.
1631 	 * Otherwise, the exception table is indexed by
1632 	 * a hash of only rt6i_dst.
1633 	 */
1634 	if (from->fib6_src.plen)
1635 		src_key = &rt->rt6i_src.addr;
1636 #endif
1637 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1638 					  &rt->rt6i_dst.addr,
1639 					  src_key);
1640 	if (rt6_ex)
1641 		rt6_ex->stamp = jiffies;
1642 
1643 unlock:
1644 	rcu_read_unlock();
1645 }
1646 
1647 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1648 					 struct rt6_info *rt, int mtu)
1649 {
1650 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1651 	 * lowest MTU in the path: always allow updating the route PMTU to
1652 	 * reflect PMTU decreases.
1653 	 *
1654 	 * If the new MTU is higher, and the route PMTU is equal to the local
1655 	 * MTU, this means the old MTU is the lowest in the path, so allow
1656 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1657 	 * handle this.
1658 	 */
1659 
1660 	if (dst_mtu(&rt->dst) >= mtu)
1661 		return true;
1662 
1663 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1664 		return true;
1665 
1666 	return false;
1667 }
1668 
1669 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1670 				       struct fib6_info *rt, int mtu)
1671 {
1672 	struct rt6_exception_bucket *bucket;
1673 	struct rt6_exception *rt6_ex;
1674 	int i;
1675 
1676 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1677 					lockdep_is_held(&rt6_exception_lock));
1678 
1679 	if (!bucket)
1680 		return;
1681 
1682 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1683 		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1684 			struct rt6_info *entry = rt6_ex->rt6i;
1685 
1686 			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1687 			 * route), the metrics of its rt->from have already
1688 			 * been updated.
1689 			 */
1690 			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1691 			    rt6_mtu_change_route_allowed(idev, entry, mtu))
1692 				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1693 		}
1694 		bucket++;
1695 	}
1696 }
1697 
1698 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1699 
1700 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1701 					struct in6_addr *gateway)
1702 {
1703 	struct rt6_exception_bucket *bucket;
1704 	struct rt6_exception *rt6_ex;
1705 	struct hlist_node *tmp;
1706 	int i;
1707 
1708 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1709 		return;
1710 
1711 	spin_lock_bh(&rt6_exception_lock);
1712 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1713 				     lockdep_is_held(&rt6_exception_lock));
1714 
1715 	if (bucket) {
1716 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1717 			hlist_for_each_entry_safe(rt6_ex, tmp,
1718 						  &bucket->chain, hlist) {
1719 				struct rt6_info *entry = rt6_ex->rt6i;
1720 
1721 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1722 				    RTF_CACHE_GATEWAY &&
1723 				    ipv6_addr_equal(gateway,
1724 						    &entry->rt6i_gateway)) {
1725 					rt6_remove_exception(bucket, rt6_ex);
1726 				}
1727 			}
1728 			bucket++;
1729 		}
1730 	}
1731 
1732 	spin_unlock_bh(&rt6_exception_lock);
1733 }
1734 
1735 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1736 				      struct rt6_exception *rt6_ex,
1737 				      struct fib6_gc_args *gc_args,
1738 				      unsigned long now)
1739 {
1740 	struct rt6_info *rt = rt6_ex->rt6i;
1741 
1742 	/* we are pruning and obsoleting aged-out and non-gateway exceptions
1743 	 * even if others still hold references to them, so that on the next
1744 	 * dst_check() such references can be dropped.
1745 	 * RTF_EXPIRES exceptions (e.g. pmtu-generated ones) are pruned when
1746 	 * expired, independently of their age, as per RFC 8201 section 4
1747 	 */
1748 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1749 		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1750 			RT6_TRACE("aging clone %p\n", rt);
1751 			rt6_remove_exception(bucket, rt6_ex);
1752 			return;
1753 		}
1754 	} else if (time_after(jiffies, rt->dst.expires)) {
1755 		RT6_TRACE("purging expired route %p\n", rt);
1756 		rt6_remove_exception(bucket, rt6_ex);
1757 		return;
1758 	}
1759 
1760 	if (rt->rt6i_flags & RTF_GATEWAY) {
1761 		struct neighbour *neigh;
1762 		__u8 neigh_flags = 0;
1763 
1764 		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1765 		if (neigh)
1766 			neigh_flags = neigh->flags;
1767 
1768 		if (!(neigh_flags & NTF_ROUTER)) {
1769 			RT6_TRACE("purging route %p via non-router but gateway\n",
1770 				  rt);
1771 			rt6_remove_exception(bucket, rt6_ex);
1772 			return;
1773 		}
1774 	}
1775 
1776 	gc_args->more++;
1777 }
1778 
1779 void rt6_age_exceptions(struct fib6_info *rt,
1780 			struct fib6_gc_args *gc_args,
1781 			unsigned long now)
1782 {
1783 	struct rt6_exception_bucket *bucket;
1784 	struct rt6_exception *rt6_ex;
1785 	struct hlist_node *tmp;
1786 	int i;
1787 
1788 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1789 		return;
1790 
1791 	rcu_read_lock_bh();
1792 	spin_lock(&rt6_exception_lock);
1793 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1794 				    lockdep_is_held(&rt6_exception_lock));
1795 
1796 	if (bucket) {
1797 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1798 			hlist_for_each_entry_safe(rt6_ex, tmp,
1799 						  &bucket->chain, hlist) {
1800 				rt6_age_examine_exception(bucket, rt6_ex,
1801 							  gc_args, now);
1802 			}
1803 			bucket++;
1804 		}
1805 	}
1806 	spin_unlock(&rt6_exception_lock);
1807 	rcu_read_unlock_bh();
1808 }
1809 
1810 /* must be called with rcu_read_lock() held */
1811 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1812 				    int oif, struct flowi6 *fl6, int strict)
1813 {
1814 	struct fib6_node *fn, *saved_fn;
1815 	struct fib6_info *f6i;
1816 
1817 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1818 	saved_fn = fn;
1819 
1820 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1821 		oif = 0;
1822 
1823 redo_rt6_select:
1824 	f6i = rt6_select(net, fn, oif, strict);
1825 	if (f6i == net->ipv6.fib6_null_entry) {
1826 		fn = fib6_backtrack(fn, &fl6->saddr);
1827 		if (fn)
1828 			goto redo_rt6_select;
1829 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1830 			/* also consider unreachable route */
1831 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1832 			fn = saved_fn;
1833 			goto redo_rt6_select;
1834 		}
1835 	}
1836 
1837 	trace_fib6_table_lookup(net, f6i, table, fl6);
1838 
1839 	return f6i;
1840 }
1841 
1842 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1843 			       int oif, struct flowi6 *fl6,
1844 			       const struct sk_buff *skb, int flags)
1845 {
1846 	struct fib6_info *f6i;
1847 	struct rt6_info *rt;
1848 	int strict = 0;
1849 
1850 	strict |= flags & RT6_LOOKUP_F_IFACE;
1851 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1852 	if (net->ipv6.devconf_all->forwarding == 0)
1853 		strict |= RT6_LOOKUP_F_REACHABLE;
1854 
1855 	rcu_read_lock();
1856 
1857 	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1858 	if (f6i == net->ipv6.fib6_null_entry) {
1859 		rt = net->ipv6.ip6_null_entry;
1860 		rcu_read_unlock();
1861 		dst_hold(&rt->dst);
1862 		return rt;
1863 	}
1864 
1865 	if (f6i->fib6_nsiblings)
1866 		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1867 
1868 	/* Search through exception table */
1869 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1870 	if (rt) {
1871 		if (ip6_hold_safe(net, &rt))
1872 			dst_use_noref(&rt->dst, jiffies);
1873 
1874 		rcu_read_unlock();
1875 		return rt;
1876 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1877 			    !f6i->fib6_nh.fib_nh_gw_family)) {
1878 		/* Create a RTF_CACHE clone which will not be
1879 		 * owned by the fib6 tree.  It is for the special case where
1880 		 * the daddr in the skb during the neighbor look-up is different
1881 		 * from the fl6->daddr used to look up the route here.
1882 		 */
1883 		struct rt6_info *uncached_rt;
1884 
1885 		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1886 
1887 		rcu_read_unlock();
1888 
1889 		if (uncached_rt) {
1890 			/* uncached_rt's refcnt is taken during ip6_rt_cache_alloc(),
1891 			 * so no need for another dst_hold()
1892 			 */
1893 			rt6_uncached_list_add(uncached_rt);
1894 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1895 		} else {
1896 			uncached_rt = net->ipv6.ip6_null_entry;
1897 			dst_hold(&uncached_rt->dst);
1898 		}
1899 
1900 		return uncached_rt;
1901 	} else {
1902 		/* Get a percpu copy */
1903 
1904 		struct rt6_info *pcpu_rt;
1905 
1906 		local_bh_disable();
1907 		pcpu_rt = rt6_get_pcpu_route(f6i);
1908 
1909 		if (!pcpu_rt)
1910 			pcpu_rt = rt6_make_pcpu_route(net, f6i);
1911 
1912 		local_bh_enable();
1913 		rcu_read_unlock();
1914 
1915 		return pcpu_rt;
1916 	}
1917 }
1918 EXPORT_SYMBOL_GPL(ip6_pol_route);
1919 
1920 static struct rt6_info *ip6_pol_route_input(struct net *net,
1921 					    struct fib6_table *table,
1922 					    struct flowi6 *fl6,
1923 					    const struct sk_buff *skb,
1924 					    int flags)
1925 {
1926 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1927 }
1928 
1929 struct dst_entry *ip6_route_input_lookup(struct net *net,
1930 					 struct net_device *dev,
1931 					 struct flowi6 *fl6,
1932 					 const struct sk_buff *skb,
1933 					 int flags)
1934 {
1935 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1936 		flags |= RT6_LOOKUP_F_IFACE;
1937 
1938 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1939 }
1940 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1941 
1942 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1943 				  struct flow_keys *keys,
1944 				  struct flow_keys *flkeys)
1945 {
1946 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1947 	const struct ipv6hdr *key_iph = outer_iph;
1948 	struct flow_keys *_flkeys = flkeys;
1949 	const struct ipv6hdr *inner_iph;
1950 	const struct icmp6hdr *icmph;
1951 	struct ipv6hdr _inner_iph;
1952 	struct icmp6hdr _icmph;
1953 
1954 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1955 		goto out;
1956 
1957 	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1958 				   sizeof(_icmph), &_icmph);
1959 	if (!icmph)
1960 		goto out;
1961 
1962 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1963 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1964 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1965 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1966 		goto out;
1967 
1968 	inner_iph = skb_header_pointer(skb,
1969 				       skb_transport_offset(skb) + sizeof(*icmph),
1970 				       sizeof(_inner_iph), &_inner_iph);
1971 	if (!inner_iph)
1972 		goto out;
1973 
1974 	key_iph = inner_iph;
1975 	_flkeys = NULL;
1976 out:
1977 	if (_flkeys) {
1978 		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1979 		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1980 		keys->tags.flow_label = _flkeys->tags.flow_label;
1981 		keys->basic.ip_proto = _flkeys->basic.ip_proto;
1982 	} else {
1983 		keys->addrs.v6addrs.src = key_iph->saddr;
1984 		keys->addrs.v6addrs.dst = key_iph->daddr;
1985 		keys->tags.flow_label = ip6_flowlabel(key_iph);
1986 		keys->basic.ip_proto = key_iph->nexthdr;
1987 	}
1988 }
1989 
1990 /* if skb is set it will be used and fl6 can be NULL */
1991 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1992 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1993 {
1994 	struct flow_keys hash_keys;
1995 	u32 mhash;
1996 
1997 	switch (ip6_multipath_hash_policy(net)) {
1998 	case 0:
1999 		memset(&hash_keys, 0, sizeof(hash_keys));
2000 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2001 		if (skb) {
2002 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2003 		} else {
2004 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2005 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2006 			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2007 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2008 		}
2009 		break;
2010 	case 1:
2011 		if (skb) {
2012 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2013 			struct flow_keys keys;
2014 
2015 			/* short-circuit if an L4 hash is already present */
2016 			if (skb->l4_hash)
2017 				return skb_get_hash_raw(skb) >> 1;
2018 
2019 			memset(&hash_keys, 0, sizeof(hash_keys));
2020 
2021 			if (!flkeys) {
2022 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2023 				flkeys = &keys;
2024 			}
2025 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2026 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2027 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2028 			hash_keys.ports.src = flkeys->ports.src;
2029 			hash_keys.ports.dst = flkeys->ports.dst;
2030 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2031 		} else {
2032 			memset(&hash_keys, 0, sizeof(hash_keys));
2033 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2034 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2035 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2036 			hash_keys.ports.src = fl6->fl6_sport;
2037 			hash_keys.ports.dst = fl6->fl6_dport;
2038 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2039 		}
2040 		break;
2041 	}
2042 	mhash = flow_hash_from_keys(&hash_keys);
2043 
2044 	return mhash >> 1;
2045 }
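
/* Worked example (illustrative addresses, not from this file): under
 * policy 0 only the L3 triple (saddr, daddr, flow label, next header)
 * feeds the hash, so the TCP flows
 *	[2001:db8::1]:40000 -> [2001:db8::2]:80
 *	[2001:db8::1]:40001 -> [2001:db8::2]:80
 * always select the same sibling route; under policy 1 the ports are
 * mixed in and the two flows may land on different nexthops. The
 * final mhash >> 1 keeps the result in 31 bits so it is directly
 * comparable with the nexthop upper bounds computed by
 * rt6_upper_bound_set() below.
 */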
2046 
2047 void ip6_route_input(struct sk_buff *skb)
2048 {
2049 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2050 	struct net *net = dev_net(skb->dev);
2051 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2052 	struct ip_tunnel_info *tun_info;
2053 	struct flowi6 fl6 = {
2054 		.flowi6_iif = skb->dev->ifindex,
2055 		.daddr = iph->daddr,
2056 		.saddr = iph->saddr,
2057 		.flowlabel = ip6_flowinfo(iph),
2058 		.flowi6_mark = skb->mark,
2059 		.flowi6_proto = iph->nexthdr,
2060 	};
2061 	struct flow_keys *flkeys = NULL, _flkeys;
2062 
2063 	tun_info = skb_tunnel_info(skb);
2064 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2065 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2066 
2067 	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2068 		flkeys = &_flkeys;
2069 
2070 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2071 		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2072 	skb_dst_drop(skb);
2073 	skb_dst_set(skb,
2074 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2075 }
2076 
2077 static struct rt6_info *ip6_pol_route_output(struct net *net,
2078 					     struct fib6_table *table,
2079 					     struct flowi6 *fl6,
2080 					     const struct sk_buff *skb,
2081 					     int flags)
2082 {
2083 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2084 }
2085 
2086 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2087 					 struct flowi6 *fl6, int flags)
2088 {
2089 	bool any_src;
2090 
2091 	if (ipv6_addr_type(&fl6->daddr) &
2092 	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2093 		struct dst_entry *dst;
2094 
2095 		dst = l3mdev_link_scope_lookup(net, fl6);
2096 		if (dst)
2097 			return dst;
2098 	}
2099 
2100 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2101 
2102 	any_src = ipv6_addr_any(&fl6->saddr);
2103 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2104 	    (fl6->flowi6_oif && any_src))
2105 		flags |= RT6_LOOKUP_F_IFACE;
2106 
2107 	if (!any_src)
2108 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2109 	else if (sk)
2110 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2111 
2112 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2113 }
2114 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2115 
2116 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2117 {
2118 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2119 	struct net_device *loopback_dev = net->loopback_dev;
2120 	struct dst_entry *new = NULL;
2121 
2122 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2123 		       DST_OBSOLETE_DEAD, 0);
2124 	if (rt) {
2125 		rt6_info_init(rt);
2126 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2127 
2128 		new = &rt->dst;
2129 		new->__use = 1;
2130 		new->input = dst_discard;
2131 		new->output = dst_discard_out;
2132 
2133 		dst_copy_metrics(new, &ort->dst);
2134 
2135 		rt->rt6i_idev = in6_dev_get(loopback_dev);
2136 		rt->rt6i_gateway = ort->rt6i_gateway;
2137 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2138 
2139 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2140 #ifdef CONFIG_IPV6_SUBTREES
2141 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2142 #endif
2143 	}
2144 
2145 	dst_release(dst_orig);
2146 	return new ? new : ERR_PTR(-ENOMEM);
2147 }
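
/* Sketch of the intended use (an assumption based on the xfrm policy
 * hooks, not spelled out here): xfrm6_policy.c registers this function
 * as its blackhole_route handler, so while IPsec states are still
 * being negotiated xfrm_lookup() can swap the real dst for this
 * discarding copy and packets are silently dropped rather than sent
 * in the clear.
 */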
2148 
2149 /*
2150  *	Destination cache support functions
2151  */
2152 
2153 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2154 {
2155 	u32 rt_cookie = 0;
2156 
2157 	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2158 		return false;
2159 
2160 	if (fib6_check_expired(f6i))
2161 		return false;
2162 
2163 	return true;
2164 }
2165 
2166 static struct dst_entry *rt6_check(struct rt6_info *rt,
2167 				   struct fib6_info *from,
2168 				   u32 cookie)
2169 {
2170 	u32 rt_cookie = 0;
2171 
2172 	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2173 	    rt_cookie != cookie)
2174 		return NULL;
2175 
2176 	if (rt6_check_expired(rt))
2177 		return NULL;
2178 
2179 	return &rt->dst;
2180 }
2181 
2182 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2183 					    struct fib6_info *from,
2184 					    u32 cookie)
2185 {
2186 	if (!__rt6_check_expired(rt) &&
2187 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2188 	    fib6_check(from, cookie))
2189 		return &rt->dst;
2190 	else
2191 		return NULL;
2192 }
2193 
2194 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2195 {
2196 	struct dst_entry *dst_ret;
2197 	struct fib6_info *from;
2198 	struct rt6_info *rt;
2199 
2200 	rt = container_of(dst, struct rt6_info, dst);
2201 
2202 	rcu_read_lock();
2203 
2204 	/* All IPv6 dsts are created with ->obsolete set to
2205 	 * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
2206 	 * down into this function.
2207 	 */
2208 
2209 	from = rcu_dereference(rt->from);
2210 
2211 	if (from && (rt->rt6i_flags & RTF_PCPU ||
2212 	    unlikely(!list_empty(&rt->rt6i_uncached))))
2213 		dst_ret = rt6_dst_from_check(rt, from, cookie);
2214 	else
2215 		dst_ret = rt6_check(rt, from, cookie);
2216 
2217 	rcu_read_unlock();
2218 
2219 	return dst_ret;
2220 }
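
/* Caller sketch (illustrative): socket code revalidates a cached dst
 * against the cookie it saved when the dst was stored, e.g.
 *
 *	dst = __sk_dst_check(sk, np->dst_cookie);
 *	if (!dst)
 *		dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 *
 * __sk_dst_check() calls dst->ops->check(), i.e. this function; a
 * NULL return forces the caller down the re-route path.
 */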
2221 
2222 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2223 {
2224 	struct rt6_info *rt = (struct rt6_info *) dst;
2225 
2226 	if (rt) {
2227 		if (rt->rt6i_flags & RTF_CACHE) {
2228 			rcu_read_lock();
2229 			if (rt6_check_expired(rt)) {
2230 				rt6_remove_exception_rt(rt);
2231 				dst = NULL;
2232 			}
2233 			rcu_read_unlock();
2234 		} else {
2235 			dst_release(dst);
2236 			dst = NULL;
2237 		}
2238 	}
2239 	return dst;
2240 }
2241 
2242 static void ip6_link_failure(struct sk_buff *skb)
2243 {
2244 	struct rt6_info *rt;
2245 
2246 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2247 
2248 	rt = (struct rt6_info *) skb_dst(skb);
2249 	if (rt) {
2250 		rcu_read_lock();
2251 		if (rt->rt6i_flags & RTF_CACHE) {
2252 			rt6_remove_exception_rt(rt);
2253 		} else {
2254 			struct fib6_info *from;
2255 			struct fib6_node *fn;
2256 
2257 			from = rcu_dereference(rt->from);
2258 			if (from) {
2259 				fn = rcu_dereference(from->fib6_node);
2260 				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2261 					fn->fn_sernum = -1;
2262 			}
2263 		}
2264 		rcu_read_unlock();
2265 	}
2266 }
2267 
2268 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2269 {
2270 	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2271 		struct fib6_info *from;
2272 
2273 		rcu_read_lock();
2274 		from = rcu_dereference(rt0->from);
2275 		if (from)
2276 			rt0->dst.expires = from->expires;
2277 		rcu_read_unlock();
2278 	}
2279 
2280 	dst_set_expires(&rt0->dst, timeout);
2281 	rt0->rt6i_flags |= RTF_EXPIRES;
2282 }
2283 
2284 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2285 {
2286 	struct net *net = dev_net(rt->dst.dev);
2287 
2288 	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2289 	rt->rt6i_flags |= RTF_MODIFIED;
2290 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2291 }
2292 
2293 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2294 {
2295 	return !(rt->rt6i_flags & RTF_CACHE) &&
2296 		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2297 }
2298 
2299 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2300 				 const struct ipv6hdr *iph, u32 mtu)
2301 {
2302 	const struct in6_addr *daddr, *saddr;
2303 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2304 
2305 	if (dst_metric_locked(dst, RTAX_MTU))
2306 		return;
2307 
2308 	if (iph) {
2309 		daddr = &iph->daddr;
2310 		saddr = &iph->saddr;
2311 	} else if (sk) {
2312 		daddr = &sk->sk_v6_daddr;
2313 		saddr = &inet6_sk(sk)->saddr;
2314 	} else {
2315 		daddr = NULL;
2316 		saddr = NULL;
2317 	}
2318 	dst_confirm_neigh(dst, daddr);
2319 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2320 	if (mtu >= dst_mtu(dst))
2321 		return;
2322 
2323 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2324 		rt6_do_update_pmtu(rt6, mtu);
2325 		/* update rt6_ex->stamp for cache */
2326 		if (rt6->rt6i_flags & RTF_CACHE)
2327 			rt6_update_exception_stamp_rt(rt6);
2328 	} else if (daddr) {
2329 		struct fib6_info *from;
2330 		struct rt6_info *nrt6;
2331 
2332 		rcu_read_lock();
2333 		from = rcu_dereference(rt6->from);
2334 		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2335 		if (nrt6) {
2336 			rt6_do_update_pmtu(nrt6, mtu);
2337 			if (rt6_insert_exception(nrt6, from))
2338 				dst_release_immediate(&nrt6->dst);
2339 		}
2340 		rcu_read_unlock();
2341 	}
2342 }
2343 
2344 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2345 			       struct sk_buff *skb, u32 mtu)
2346 {
2347 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2348 }
2349 
2350 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2351 		     int oif, u32 mark, kuid_t uid)
2352 {
2353 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2354 	struct dst_entry *dst;
2355 	struct flowi6 fl6 = {
2356 		.flowi6_oif = oif,
2357 		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2358 		.daddr = iph->daddr,
2359 		.saddr = iph->saddr,
2360 		.flowlabel = ip6_flowinfo(iph),
2361 		.flowi6_uid = uid,
2362 	};
2363 
2364 	dst = ip6_route_output(net, NULL, &fl6);
2365 	if (!dst->error)
2366 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2367 	dst_release(dst);
2368 }
2369 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
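
/* Caller sketch (illustrative): tunnel error handlers react to an
 * ICMPV6_PKT_TOOBIG by feeding the reported MTU, still in network
 * byte order, back into the exception cache:
 *
 *	if (type == ICMPV6_PKT_TOOBIG)
 *		ip6_update_pmtu(skb, net, info, 0, 0,
 *				sock_net_uid(net, NULL));
 */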
2370 
2371 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2372 {
2373 	int oif = sk->sk_bound_dev_if;
2374 	struct dst_entry *dst;
2375 
2376 	if (!oif && skb->dev)
2377 		oif = l3mdev_master_ifindex(skb->dev);
2378 
2379 	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2380 
2381 	dst = __sk_dst_get(sk);
2382 	if (!dst || !dst->obsolete ||
2383 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2384 		return;
2385 
2386 	bh_lock_sock(sk);
2387 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2388 		ip6_datagram_dst_update(sk, false);
2389 	bh_unlock_sock(sk);
2390 }
2391 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2392 
2393 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2394 			   const struct flowi6 *fl6)
2395 {
2396 #ifdef CONFIG_IPV6_SUBTREES
2397 	struct ipv6_pinfo *np = inet6_sk(sk);
2398 #endif
2399 
2400 	ip6_dst_store(sk, dst,
2401 		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2402 		      &sk->sk_v6_daddr : NULL,
2403 #ifdef CONFIG_IPV6_SUBTREES
2404 		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2405 		      &np->saddr :
2406 #endif
2407 		      NULL);
2408 }
2409 
2410 static bool ip6_redirect_nh_match(struct fib6_info *f6i,
2411 				  struct fib6_nh *nh,
2412 				  struct flowi6 *fl6,
2413 				  const struct in6_addr *gw,
2414 				  struct rt6_info **ret)
2415 {
2416 	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
2417 	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
2418 		return false;
2419 
2420 	/* rt_cache's gateway might be different from its 'parent'
2421 	 * in the case of an ip redirect.
2422 	 * So we keep searching in the exception table if the gateway
2423 	 * is different.
2424 	 */
2425 	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
2426 		struct rt6_info *rt_cache;
2427 
2428 		rt_cache = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
2429 		if (rt_cache &&
2430 		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
2431 			*ret = rt_cache;
2432 			return true;
2433 		}
2434 		return false;
2435 	}
2436 	return true;
2437 }
2438 
2439 /* Handle redirects */
2440 struct ip6rd_flowi {
2441 	struct flowi6 fl6;
2442 	struct in6_addr gateway;
2443 };
2444 
2445 static struct rt6_info *__ip6_route_redirect(struct net *net,
2446 					     struct fib6_table *table,
2447 					     struct flowi6 *fl6,
2448 					     const struct sk_buff *skb,
2449 					     int flags)
2450 {
2451 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2452 	struct rt6_info *ret = NULL;
2453 	struct fib6_info *rt;
2454 	struct fib6_node *fn;
2455 
2456 	/* Get the "current" route for this destination and
2457 	 * check if the redirect has come from the appropriate router.
2458 	 *
2459 	 * RFC 4861 specifies that redirects should only be
2460 	 * accepted if they come from the nexthop to the target.
2461 	 * Due to the way the routes are chosen, this notion
2462 	 * is a bit fuzzy and one might need to check all possible
2463 	 * routes.
2464 	 */
2465 
2466 	rcu_read_lock();
2467 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2468 restart:
2469 	for_each_fib6_node_rt_rcu(fn) {
2470 		if (fib6_check_expired(rt))
2471 			continue;
2472 		if (rt->fib6_flags & RTF_REJECT)
2473 			break;
2474 		if (fl6->flowi6_oif != rt->fib6_nh.fib_nh_dev->ifindex)
2475 			continue;
2476 		if (ip6_redirect_nh_match(rt, &rt->fib6_nh, fl6,
2477 					  &rdfl->gateway, &ret))
2478 			goto out;
2479 	}
2480 
2481 	if (!rt)
2482 		rt = net->ipv6.fib6_null_entry;
2483 	else if (rt->fib6_flags & RTF_REJECT) {
2484 		ret = net->ipv6.ip6_null_entry;
2485 		goto out;
2486 	}
2487 
2488 	if (rt == net->ipv6.fib6_null_entry) {
2489 		fn = fib6_backtrack(fn, &fl6->saddr);
2490 		if (fn)
2491 			goto restart;
2492 	}
2493 
2494 out:
2495 	if (ret)
2496 		ip6_hold_safe(net, &ret);
2497 	else
2498 		ret = ip6_create_rt_rcu(rt);
2499 
2500 	rcu_read_unlock();
2501 
2502 	trace_fib6_table_lookup(net, rt, table, fl6);
2503 	return ret;
2504 }
2505 
2506 static struct dst_entry *ip6_route_redirect(struct net *net,
2507 					    const struct flowi6 *fl6,
2508 					    const struct sk_buff *skb,
2509 					    const struct in6_addr *gateway)
2510 {
2511 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2512 	struct ip6rd_flowi rdfl;
2513 
2514 	rdfl.fl6 = *fl6;
2515 	rdfl.gateway = *gateway;
2516 
2517 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2518 				flags, __ip6_route_redirect);
2519 }
2520 
2521 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2522 		  kuid_t uid)
2523 {
2524 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2525 	struct dst_entry *dst;
2526 	struct flowi6 fl6 = {
2527 		.flowi6_iif = LOOPBACK_IFINDEX,
2528 		.flowi6_oif = oif,
2529 		.flowi6_mark = mark,
2530 		.daddr = iph->daddr,
2531 		.saddr = iph->saddr,
2532 		.flowlabel = ip6_flowinfo(iph),
2533 		.flowi6_uid = uid,
2534 	};
2535 
2536 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2537 	rt6_do_redirect(dst, NULL, skb);
2538 	dst_release(dst);
2539 }
2540 EXPORT_SYMBOL_GPL(ip6_redirect);
2541 
2542 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2543 {
2544 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2545 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2546 	struct dst_entry *dst;
2547 	struct flowi6 fl6 = {
2548 		.flowi6_iif = LOOPBACK_IFINDEX,
2549 		.flowi6_oif = oif,
2550 		.daddr = msg->dest,
2551 		.saddr = iph->daddr,
2552 		.flowi6_uid = sock_net_uid(net, NULL),
2553 	};
2554 
2555 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2556 	rt6_do_redirect(dst, NULL, skb);
2557 	dst_release(dst);
2558 }
2559 
2560 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2561 {
2562 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2563 		     sk->sk_uid);
2564 }
2565 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2566 
2567 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2568 {
2569 	struct net_device *dev = dst->dev;
2570 	unsigned int mtu = dst_mtu(dst);
2571 	struct net *net = dev_net(dev);
2572 
2573 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2574 
2575 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2576 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2577 
2578 	/*
2579 	 * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN, and the
2580 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2581 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2582 	 * rely only on pmtu discovery".
2583 	 */
2584 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2585 		mtu = IPV6_MAXPLEN;
2586 	return mtu;
2587 }
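
/* Worked example: a standard 1500 byte Ethernet MTU advertises
 * 1500 - 40 (ipv6 hdr) - 20 (tcp hdr) = 1440 bytes; a 1280 byte
 * minimum-MTU link yields 1220 bytes. The IPV6_MAXPLEN clamp only
 * takes effect on jumbo-capable devices with an MTU above 65575.
 */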
2588 
2589 static unsigned int ip6_mtu(const struct dst_entry *dst)
2590 {
2591 	struct inet6_dev *idev;
2592 	unsigned int mtu;
2593 
2594 	mtu = dst_metric_raw(dst, RTAX_MTU);
2595 	if (mtu)
2596 		goto out;
2597 
2598 	mtu = IPV6_MIN_MTU;
2599 
2600 	rcu_read_lock();
2601 	idev = __in6_dev_get(dst->dev);
2602 	if (idev)
2603 		mtu = idev->cnf.mtu6;
2604 	rcu_read_unlock();
2605 
2606 out:
2607 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2608 
2609 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2610 }
2611 
2612 /* MTU selection:
2613  * 1. mtu on route is locked - use it
2614  * 2. mtu from nexthop exception
2615  * 3. mtu from egress device
2616  *
2617  * based on ip6_dst_mtu_forward and the exception logic of
2618  * rt6_find_cached_rt; called with rcu_read_lock held
2619  */
2620 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2621 		      struct in6_addr *saddr)
2622 {
2623 	struct rt6_exception_bucket *bucket;
2624 	struct rt6_exception *rt6_ex;
2625 	struct in6_addr *src_key;
2626 	struct inet6_dev *idev;
2627 	u32 mtu = 0;
2628 
2629 	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2630 		mtu = f6i->fib6_pmtu;
2631 		if (mtu)
2632 			goto out;
2633 	}
2634 
2635 	src_key = NULL;
2636 #ifdef CONFIG_IPV6_SUBTREES
2637 	if (f6i->fib6_src.plen)
2638 		src_key = saddr;
2639 #endif
2640 
2641 	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2642 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2643 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2644 		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2645 
2646 	if (likely(!mtu)) {
2647 		struct net_device *dev = fib6_info_nh_dev(f6i);
2648 
2649 		mtu = IPV6_MIN_MTU;
2650 		idev = __in6_dev_get(dev);
2651 		if (idev && idev->cnf.mtu6 > mtu)
2652 			mtu = idev->cnf.mtu6;
2653 	}
2654 
2655 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2656 out:
2657 	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2658 }
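
/* Usage sketch (illustrative; the surrounding lookup is assumed, not
 * prescribed): callers that already hold a fib6_info resolve its MTU
 * under RCU, e.g.
 *
 *	rcu_read_lock();
 *	f6i = fib6_table_lookup(net, table, oif, &fl6, strict);
 *	if (f6i != net->ipv6.fib6_null_entry)
 *		mtu = ip6_mtu_from_fib6(f6i, &fl6.daddr, &fl6.saddr);
 *	rcu_read_unlock();
 */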
2659 
2660 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2661 				  struct flowi6 *fl6)
2662 {
2663 	struct dst_entry *dst;
2664 	struct rt6_info *rt;
2665 	struct inet6_dev *idev = in6_dev_get(dev);
2666 	struct net *net = dev_net(dev);
2667 
2668 	if (unlikely(!idev))
2669 		return ERR_PTR(-ENODEV);
2670 
2671 	rt = ip6_dst_alloc(net, dev, 0);
2672 	if (unlikely(!rt)) {
2673 		in6_dev_put(idev);
2674 		dst = ERR_PTR(-ENOMEM);
2675 		goto out;
2676 	}
2677 
2678 	rt->dst.flags |= DST_HOST;
2679 	rt->dst.input = ip6_input;
2680 	rt->dst.output  = ip6_output;
2681 	rt->rt6i_gateway  = fl6->daddr;
2682 	rt->rt6i_dst.addr = fl6->daddr;
2683 	rt->rt6i_dst.plen = 128;
2684 	rt->rt6i_idev     = idev;
2685 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2686 
2687 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2688 	 * properly release the net_device.
2689 	 */
2690 	rt6_uncached_list_add(rt);
2691 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2692 
2693 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2694 
2695 out:
2696 	return dst;
2697 }
2698 
2699 static int ip6_dst_gc(struct dst_ops *ops)
2700 {
2701 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2702 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2703 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2704 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2705 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2706 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2707 	int entries;
2708 
2709 	entries = dst_entries_get_fast(ops);
2710 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2711 	    entries <= rt_max_size)
2712 		goto out;
2713 
2714 	net->ipv6.ip6_rt_gc_expire++;
2715 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2716 	entries = dst_entries_get_slow(ops);
2717 	if (entries < ops->gc_thresh)
2718 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2719 out:
2720 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2721 	return entries > rt_max_size;
2722 }
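
/* Worked numbers (illustrative, assuming the default sysctls): with
 * elasticity = 9, every pass through "out" decays ip6_rt_gc_expire by
 * expire >> 9, roughly 0.2%. Starting from rt_gc_timeout >> 1 (30s
 * with the 60s default), about 355 passes halve the threshold
 * ((511/512)^355 ~= 0.5), so sustained pressure makes fib6_run_gc()
 * expire entries progressively sooner until the table drops below
 * gc_thresh.
 */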
2723 
2724 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2725 					    struct fib6_config *cfg,
2726 					    const struct in6_addr *gw_addr,
2727 					    u32 tbid, int flags)
2728 {
2729 	struct flowi6 fl6 = {
2730 		.flowi6_oif = cfg->fc_ifindex,
2731 		.daddr = *gw_addr,
2732 		.saddr = cfg->fc_prefsrc,
2733 	};
2734 	struct fib6_table *table;
2735 	struct rt6_info *rt;
2736 
2737 	table = fib6_get_table(net, tbid);
2738 	if (!table)
2739 		return NULL;
2740 
2741 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2742 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2743 
2744 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2745 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2746 
2747 	/* if table lookup failed, fall back to full lookup */
2748 	if (rt == net->ipv6.ip6_null_entry) {
2749 		ip6_rt_put(rt);
2750 		rt = NULL;
2751 	}
2752 
2753 	return rt;
2754 }
2755 
2756 static int ip6_route_check_nh_onlink(struct net *net,
2757 				     struct fib6_config *cfg,
2758 				     const struct net_device *dev,
2759 				     struct netlink_ext_ack *extack)
2760 {
2761 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2762 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2763 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2764 	struct fib6_info *from;
2765 	struct rt6_info *grt;
2766 	int err;
2767 
2768 	err = 0;
2769 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2770 	if (grt) {
2771 		rcu_read_lock();
2772 		from = rcu_dereference(grt->from);
2773 		if (!grt->dst.error &&
2774 		    /* ignore match if it is the default route */
2775 		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2776 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2777 			NL_SET_ERR_MSG(extack,
2778 				       "Nexthop has invalid gateway or device mismatch");
2779 			err = -EINVAL;
2780 		}
2781 		rcu_read_unlock();
2782 
2783 		ip6_rt_put(grt);
2784 	}
2785 
2786 	return err;
2787 }
2788 
2789 static int ip6_route_check_nh(struct net *net,
2790 			      struct fib6_config *cfg,
2791 			      struct net_device **_dev,
2792 			      struct inet6_dev **idev)
2793 {
2794 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2795 	struct net_device *dev = _dev ? *_dev : NULL;
2796 	struct rt6_info *grt = NULL;
2797 	int err = -EHOSTUNREACH;
2798 
2799 	if (cfg->fc_table) {
2800 		int flags = RT6_LOOKUP_F_IFACE;
2801 
2802 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2803 					  cfg->fc_table, flags);
2804 		if (grt) {
2805 			if (grt->rt6i_flags & RTF_GATEWAY ||
2806 			    (dev && dev != grt->dst.dev)) {
2807 				ip6_rt_put(grt);
2808 				grt = NULL;
2809 			}
2810 		}
2811 	}
2812 
2813 	if (!grt)
2814 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2815 
2816 	if (!grt)
2817 		goto out;
2818 
2819 	if (dev) {
2820 		if (dev != grt->dst.dev) {
2821 			ip6_rt_put(grt);
2822 			goto out;
2823 		}
2824 	} else {
2825 		*_dev = dev = grt->dst.dev;
2826 		*idev = grt->rt6i_idev;
2827 		dev_hold(dev);
2828 		in6_dev_hold(grt->rt6i_idev);
2829 	}
2830 
2831 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2832 		err = 0;
2833 
2834 	ip6_rt_put(grt);
2835 
2836 out:
2837 	return err;
2838 }
2839 
2840 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2841 			   struct net_device **_dev, struct inet6_dev **idev,
2842 			   struct netlink_ext_ack *extack)
2843 {
2844 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2845 	int gwa_type = ipv6_addr_type(gw_addr);
2846 	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2847 	const struct net_device *dev = *_dev;
2848 	bool need_addr_check = !dev;
2849 	int err = -EINVAL;
2850 
2851 	/* If gw_addr is local we will fail to detect this in case the
2852 	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2853 	 * will return the already-added prefix route via the interface
2854 	 * that the prefix route was assigned to, which might be non-loopback.
2855 	 */
2856 	if (dev &&
2857 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2858 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2859 		goto out;
2860 	}
2861 
2862 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2863 		/* IPv6 strictly inhibits using non-link-local
2864 		 * addresses as nexthop addresses.
2865 		 * Otherwise, the router will not be able to send redirects.
2866 		 * That is very good, but in some (rare!) circumstances
2867 		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2868 		 * some exceptions. --ANK
2869 		 * We allow IPv4-mapped nexthops to support RFC 4798-type
2870 		 * addressing.
2871 		 */
2872 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2873 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2874 			goto out;
2875 		}
2876 
2877 		if (cfg->fc_flags & RTNH_F_ONLINK)
2878 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2879 		else
2880 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2881 
2882 		if (err)
2883 			goto out;
2884 	}
2885 
2886 	/* reload in case device was changed */
2887 	dev = *_dev;
2888 
2889 	err = -EINVAL;
2890 	if (!dev) {
2891 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2892 		goto out;
2893 	} else if (dev->flags & IFF_LOOPBACK) {
2894 		NL_SET_ERR_MSG(extack,
2895 			       "Egress device can not be loopback device for this route");
2896 		goto out;
2897 	}
2898 
2899 	/* if we did not check gw_addr above, do so now that the
2900 	 * egress device has been resolved.
2901 	 */
2902 	if (need_addr_check &&
2903 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2904 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2905 		goto out;
2906 	}
2907 
2908 	err = 0;
2909 out:
2910 	return err;
2911 }
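
/* Illustrative iproute2 invocations exercising the cases above
 * (addresses and interface name are examples):
 *
 *	ip -6 route add 2001:db8:1::/64 via fe80::1 dev eth0
 *		link-local gateway: accepted without a nexthop lookup
 *	ip -6 route add 2001:db8:1::/64 via 2001:db8::1
 *		global gateway: resolved via ip6_route_check_nh()
 *	ip -6 route add 2001:db8:1::/64 via 2001:db8::1 dev eth0 onlink
 *		RTNH_F_ONLINK: verified by ip6_route_check_nh_onlink()
 */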
2912 
2913 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2914 {
2915 	if ((flags & RTF_REJECT) ||
2916 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2917 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2918 	     !(flags & RTF_LOCAL)))
2919 		return true;
2920 
2921 	return false;
2922 }
2923 
2924 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
2925 		 struct fib6_config *cfg, gfp_t gfp_flags,
2926 		 struct netlink_ext_ack *extack)
2927 {
2928 	struct net_device *dev = NULL;
2929 	struct inet6_dev *idev = NULL;
2930 	int addr_type;
2931 	int err;
2932 
2933 	fib6_nh->fib_nh_family = AF_INET6;
2934 
2935 	err = -ENODEV;
2936 	if (cfg->fc_ifindex) {
2937 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2938 		if (!dev)
2939 			goto out;
2940 		idev = in6_dev_get(dev);
2941 		if (!idev)
2942 			goto out;
2943 	}
2944 
2945 	if (cfg->fc_flags & RTNH_F_ONLINK) {
2946 		if (!dev) {
2947 			NL_SET_ERR_MSG(extack,
2948 				       "Nexthop device required for onlink");
2949 			goto out;
2950 		}
2951 
2952 		if (!(dev->flags & IFF_UP)) {
2953 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2954 			err = -ENETDOWN;
2955 			goto out;
2956 		}
2957 
2958 		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
2959 	}
2960 
2961 	fib6_nh->fib_nh_weight = 1;
2962 
2963 	/* We cannot add true routes via loopback here; they would
2964 	 * result in kernel looping. Promote them to reject routes.
2965 	 */
2966 	addr_type = ipv6_addr_type(&cfg->fc_dst);
2967 	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
2968 		/* hold loopback dev/idev if we haven't done so. */
2969 		if (dev != net->loopback_dev) {
2970 			if (dev) {
2971 				dev_put(dev);
2972 				in6_dev_put(idev);
2973 			}
2974 			dev = net->loopback_dev;
2975 			dev_hold(dev);
2976 			idev = in6_dev_get(dev);
2977 			if (!idev) {
2978 				err = -ENODEV;
2979 				goto out;
2980 			}
2981 		}
2982 		goto set_dev;
2983 	}
2984 
2985 	if (cfg->fc_flags & RTF_GATEWAY) {
2986 		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2987 		if (err)
2988 			goto out;
2989 
2990 		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
2991 		fib6_nh->fib_nh_gw_family = AF_INET6;
2992 	}
2993 
2994 	err = -ENODEV;
2995 	if (!dev)
2996 		goto out;
2997 
2998 	if (idev->cnf.disable_ipv6) {
2999 		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3000 		err = -EACCES;
3001 		goto out;
3002 	}
3003 
3004 	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3005 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3006 		err = -ENETDOWN;
3007 		goto out;
3008 	}
3009 
3010 	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3011 	    !netif_carrier_ok(dev))
3012 		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3013 
3014 	err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
3015 				 cfg->fc_encap_type, cfg, gfp_flags, extack);
3016 	if (err)
3017 		goto out;
3018 set_dev:
3019 	fib6_nh->fib_nh_dev = dev;
3020 	fib6_nh->fib_nh_oif = dev->ifindex;
3021 	err = 0;
3022 out:
3023 	if (idev)
3024 		in6_dev_put(idev);
3025 
3026 	if (err) {
3027 		lwtstate_put(fib6_nh->fib_nh_lws);
3028 		fib6_nh->fib_nh_lws = NULL;
3029 		if (dev)
3030 			dev_put(dev);
3031 	}
3032 
3033 	return err;
3034 }
3035 
3036 void fib6_nh_release(struct fib6_nh *fib6_nh)
3037 {
3038 	fib_nh_common_release(&fib6_nh->nh_common);
3039 }
3040 
3041 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3042 					      gfp_t gfp_flags,
3043 					      struct netlink_ext_ack *extack)
3044 {
3045 	struct net *net = cfg->fc_nlinfo.nl_net;
3046 	struct fib6_info *rt = NULL;
3047 	struct fib6_table *table;
3048 	int err = -EINVAL;
3049 	int addr_type;
3050 
3051 	/* RTF_PCPU is an internal flag; can not be set by userspace */
3052 	if (cfg->fc_flags & RTF_PCPU) {
3053 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3054 		goto out;
3055 	}
3056 
3057 	/* RTF_CACHE is an internal flag; can not be set by userspace */
3058 	if (cfg->fc_flags & RTF_CACHE) {
3059 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3060 		goto out;
3061 	}
3062 
3063 	if (cfg->fc_type > RTN_MAX) {
3064 		NL_SET_ERR_MSG(extack, "Invalid route type");
3065 		goto out;
3066 	}
3067 
3068 	if (cfg->fc_dst_len > 128) {
3069 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
3070 		goto out;
3071 	}
3072 	if (cfg->fc_src_len > 128) {
3073 		NL_SET_ERR_MSG(extack, "Invalid source address length");
3074 		goto out;
3075 	}
3076 #ifndef CONFIG_IPV6_SUBTREES
3077 	if (cfg->fc_src_len) {
3078 		NL_SET_ERR_MSG(extack,
3079 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
3080 		goto out;
3081 	}
3082 #endif
3083 
3084 	err = -ENOBUFS;
3085 	if (cfg->fc_nlinfo.nlh &&
3086 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3087 		table = fib6_get_table(net, cfg->fc_table);
3088 		if (!table) {
3089 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3090 			table = fib6_new_table(net, cfg->fc_table);
3091 		}
3092 	} else {
3093 		table = fib6_new_table(net, cfg->fc_table);
3094 	}
3095 
3096 	if (!table)
3097 		goto out;
3098 
3099 	err = -ENOMEM;
3100 	rt = fib6_info_alloc(gfp_flags);
3101 	if (!rt)
3102 		goto out;
3103 
3104 	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3105 					       extack);
3106 	if (IS_ERR(rt->fib6_metrics)) {
3107 		err = PTR_ERR(rt->fib6_metrics);
3108 		/* Do not leave garbage there. */
3109 		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3110 		goto out;
3111 	}
3112 
3113 	if (cfg->fc_flags & RTF_ADDRCONF)
3114 		rt->dst_nocount = true;
3115 
3116 	if (cfg->fc_flags & RTF_EXPIRES)
3117 		fib6_set_expires(rt, jiffies +
3118 				clock_t_to_jiffies(cfg->fc_expires));
3119 	else
3120 		fib6_clean_expires(rt);
3121 
3122 	if (cfg->fc_protocol == RTPROT_UNSPEC)
3123 		cfg->fc_protocol = RTPROT_BOOT;
3124 	rt->fib6_protocol = cfg->fc_protocol;
3125 
3126 	rt->fib6_table = table;
3127 	rt->fib6_metric = cfg->fc_metric;
3128 	rt->fib6_type = cfg->fc_type;
3129 	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3130 
3131 	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3132 	rt->fib6_dst.plen = cfg->fc_dst_len;
3133 	if (rt->fib6_dst.plen == 128)
3134 		rt->dst_host = true;
3135 
3136 #ifdef CONFIG_IPV6_SUBTREES
3137 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3138 	rt->fib6_src.plen = cfg->fc_src_len;
3139 #endif
3140 	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
3141 	if (err)
3142 		goto out;
3143 
3144 	/* We cannot add true routes via loopback here; they would
3145 	 * result in kernel looping. Promote them to reject routes.
3146 	 */
3147 	addr_type = ipv6_addr_type(&cfg->fc_dst);
3148 	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
3149 		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3150 
3151 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3152 		struct net_device *dev = fib6_info_nh_dev(rt);
3153 
3154 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3155 			NL_SET_ERR_MSG(extack, "Invalid source address");
3156 			err = -EINVAL;
3157 			goto out;
3158 		}
3159 		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3160 		rt->fib6_prefsrc.plen = 128;
3161 	} else
3162 		rt->fib6_prefsrc.plen = 0;
3163 
3164 	return rt;
3165 out:
3166 	fib6_info_release(rt);
3167 	return ERR_PTR(err);
3168 }
3169 
3170 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3171 		  struct netlink_ext_ack *extack)
3172 {
3173 	struct fib6_info *rt;
3174 	int err;
3175 
3176 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3177 	if (IS_ERR(rt))
3178 		return PTR_ERR(rt);
3179 
3180 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3181 	fib6_info_release(rt);
3182 
3183 	return err;
3184 }
3185 
3186 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3187 {
3188 	struct net *net = info->nl_net;
3189 	struct fib6_table *table;
3190 	int err;
3191 
3192 	if (rt == net->ipv6.fib6_null_entry) {
3193 		err = -ENOENT;
3194 		goto out;
3195 	}
3196 
3197 	table = rt->fib6_table;
3198 	spin_lock_bh(&table->tb6_lock);
3199 	err = fib6_del(rt, info);
3200 	spin_unlock_bh(&table->tb6_lock);
3201 
3202 out:
3203 	fib6_info_release(rt);
3204 	return err;
3205 }
3206 
3207 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3208 {
3209 	struct nl_info info = { .nl_net = net };
3210 
3211 	return __ip6_del_rt(rt, &info);
3212 }
3213 
3214 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3215 {
3216 	struct nl_info *info = &cfg->fc_nlinfo;
3217 	struct net *net = info->nl_net;
3218 	struct sk_buff *skb = NULL;
3219 	struct fib6_table *table;
3220 	int err = -ENOENT;
3221 
3222 	if (rt == net->ipv6.fib6_null_entry)
3223 		goto out_put;
3224 	table = rt->fib6_table;
3225 	spin_lock_bh(&table->tb6_lock);
3226 
3227 	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3228 		struct fib6_info *sibling, *next_sibling;
3229 
3230 		/* prefer to send a single notification with all hops */
3231 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3232 		if (skb) {
3233 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3234 
3235 			if (rt6_fill_node(net, skb, rt, NULL,
3236 					  NULL, NULL, 0, RTM_DELROUTE,
3237 					  info->portid, seq, 0) < 0) {
3238 				kfree_skb(skb);
3239 				skb = NULL;
3240 			} else
3241 				info->skip_notify = 1;
3242 		}
3243 
3244 		list_for_each_entry_safe(sibling, next_sibling,
3245 					 &rt->fib6_siblings,
3246 					 fib6_siblings) {
3247 			err = fib6_del(sibling, info);
3248 			if (err)
3249 				goto out_unlock;
3250 		}
3251 	}
3252 
3253 	err = fib6_del(rt, info);
3254 out_unlock:
3255 	spin_unlock_bh(&table->tb6_lock);
3256 out_put:
3257 	fib6_info_release(rt);
3258 
3259 	if (skb) {
3260 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3261 			    info->nlh, gfp_any());
3262 	}
3263 	return err;
3264 }
3265 
3266 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3267 {
3268 	int rc = -ESRCH;
3269 
3270 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3271 		goto out;
3272 
3273 	if (cfg->fc_flags & RTF_GATEWAY &&
3274 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3275 		goto out;
3276 
3277 	rc = rt6_remove_exception_rt(rt);
3278 out:
3279 	return rc;
3280 }
3281 
3282 static int ip6_route_del(struct fib6_config *cfg,
3283 			 struct netlink_ext_ack *extack)
3284 {
3285 	struct rt6_info *rt_cache;
3286 	struct fib6_table *table;
3287 	struct fib6_info *rt;
3288 	struct fib6_node *fn;
3289 	int err = -ESRCH;
3290 
3291 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3292 	if (!table) {
3293 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3294 		return err;
3295 	}
3296 
3297 	rcu_read_lock();
3298 
3299 	fn = fib6_locate(&table->tb6_root,
3300 			 &cfg->fc_dst, cfg->fc_dst_len,
3301 			 &cfg->fc_src, cfg->fc_src_len,
3302 			 !(cfg->fc_flags & RTF_CACHE));
3303 
3304 	if (fn) {
3305 		for_each_fib6_node_rt_rcu(fn) {
3306 			struct fib6_nh *nh;
3307 
3308 			if (cfg->fc_flags & RTF_CACHE) {
3309 				int rc;
3310 
3311 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3312 							      &cfg->fc_src);
3313 				if (rt_cache) {
3314 					rc = ip6_del_cached_rt(rt_cache, cfg);
3315 					if (rc != -ESRCH) {
3316 						rcu_read_unlock();
3317 						return rc;
3318 					}
3319 				}
3320 				continue;
3321 			}
3322 
3323 			nh = &rt->fib6_nh;
3324 			if (cfg->fc_ifindex &&
3325 			    (!nh->fib_nh_dev ||
3326 			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3327 				continue;
3328 			if (cfg->fc_flags & RTF_GATEWAY &&
3329 			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
3330 				continue;
3331 			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3332 				continue;
3333 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3334 				continue;
3335 			if (!fib6_info_hold_safe(rt))
3336 				continue;
3337 			rcu_read_unlock();
3338 
3339 			/* if gateway was specified only delete the one hop */
3340 			if (cfg->fc_flags & RTF_GATEWAY)
3341 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3342 
3343 			return __ip6_del_rt_siblings(rt, cfg);
3344 		}
3345 	}
3346 	rcu_read_unlock();
3347 
3348 	return err;
3349 }
3350 
3351 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3352 {
3353 	struct netevent_redirect netevent;
3354 	struct rt6_info *rt, *nrt = NULL;
3355 	struct ndisc_options ndopts;
3356 	struct inet6_dev *in6_dev;
3357 	struct neighbour *neigh;
3358 	struct fib6_info *from;
3359 	struct rd_msg *msg;
3360 	int optlen, on_link;
3361 	u8 *lladdr;
3362 
3363 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3364 	optlen -= sizeof(*msg);
3365 
3366 	if (optlen < 0) {
3367 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3368 		return;
3369 	}
3370 
3371 	msg = (struct rd_msg *)icmp6_hdr(skb);
3372 
3373 	if (ipv6_addr_is_multicast(&msg->dest)) {
3374 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3375 		return;
3376 	}
3377 
3378 	on_link = 0;
3379 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3380 		on_link = 1;
3381 	} else if (ipv6_addr_type(&msg->target) !=
3382 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3383 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3384 		return;
3385 	}
3386 
3387 	in6_dev = __in6_dev_get(skb->dev);
3388 	if (!in6_dev)
3389 		return;
3390 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3391 		return;
3392 
3393 	/* RFC2461 8.1:
3394 	 *	The IP source address of the Redirect MUST be the same as the current
3395 	 *	first-hop router for the specified ICMP Destination Address.
3396 	 */
3397 
3398 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3399 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3400 		return;
3401 	}
3402 
3403 	lladdr = NULL;
3404 	if (ndopts.nd_opts_tgt_lladdr) {
3405 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3406 					     skb->dev);
3407 		if (!lladdr) {
3408 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3409 			return;
3410 		}
3411 	}
3412 
3413 	rt = (struct rt6_info *) dst;
3414 	if (rt->rt6i_flags & RTF_REJECT) {
3415 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3416 		return;
3417 	}
3418 
3419 	/* Redirect received -> path was valid.
3420 	 * Look, redirects are sent only in response to data packets,
3421 	 * so this nexthop is apparently reachable. --ANK
3422 	 */
3423 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3424 
3425 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3426 	if (!neigh)
3427 		return;
3428 
3429 	/*
3430 	 *	We have finally decided to accept it.
3431 	 */
3432 
3433 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3434 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3435 		     NEIGH_UPDATE_F_OVERRIDE|
3436 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3437 				     NEIGH_UPDATE_F_ISROUTER)),
3438 		     NDISC_REDIRECT, &ndopts);
3439 
3440 	rcu_read_lock();
3441 	from = rcu_dereference(rt->from);
3442 	/* This fib6_info_hold() is safe here because we hold a reference
3443 	 * to rt, and rt already holds a reference to the fib6_info.
3444 	 */
3445 	fib6_info_hold(from);
3446 	rcu_read_unlock();
3447 
3448 	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3449 	if (!nrt)
3450 		goto out;
3451 
3452 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3453 	if (on_link)
3454 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3455 
3456 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3457 
3458 	/* No need to remove rt from the exception table if rt is
3459 	 * a cached route because rt6_insert_exception() will
3460 	 * take care of it.
3461 	 */
3462 	if (rt6_insert_exception(nrt, from)) {
3463 		dst_release_immediate(&nrt->dst);
3464 		goto out;
3465 	}
3466 
3467 	netevent.old = &rt->dst;
3468 	netevent.new = &nrt->dst;
3469 	netevent.daddr = &msg->dest;
3470 	netevent.neigh = neigh;
3471 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3472 
3473 out:
3474 	fib6_info_release(from);
3475 	neigh_release(neigh);
3476 }
3477 
3478 #ifdef CONFIG_IPV6_ROUTE_INFO
3479 static struct fib6_info *rt6_get_route_info(struct net *net,
3480 					   const struct in6_addr *prefix, int prefixlen,
3481 					   const struct in6_addr *gwaddr,
3482 					   struct net_device *dev)
3483 {
3484 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3485 	int ifindex = dev->ifindex;
3486 	struct fib6_node *fn;
3487 	struct fib6_info *rt = NULL;
3488 	struct fib6_table *table;
3489 
3490 	table = fib6_get_table(net, tb_id);
3491 	if (!table)
3492 		return NULL;
3493 
3494 	rcu_read_lock();
3495 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3496 	if (!fn)
3497 		goto out;
3498 
3499 	for_each_fib6_node_rt_rcu(fn) {
3500 		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
3501 			continue;
3502 		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
3503 		    !rt->fib6_nh.fib_nh_gw_family)
3504 			continue;
3505 		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
3506 			continue;
3507 		if (!fib6_info_hold_safe(rt))
3508 			continue;
3509 		break;
3510 	}
3511 out:
3512 	rcu_read_unlock();
3513 	return rt;
3514 }
3515 
3516 static struct fib6_info *rt6_add_route_info(struct net *net,
3517 					   const struct in6_addr *prefix, int prefixlen,
3518 					   const struct in6_addr *gwaddr,
3519 					   struct net_device *dev,
3520 					   unsigned int pref)
3521 {
3522 	struct fib6_config cfg = {
3523 		.fc_metric	= IP6_RT_PRIO_USER,
3524 		.fc_ifindex	= dev->ifindex,
3525 		.fc_dst_len	= prefixlen,
3526 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3527 				  RTF_UP | RTF_PREF(pref),
3528 		.fc_protocol = RTPROT_RA,
3529 		.fc_type = RTN_UNICAST,
3530 		.fc_nlinfo.portid = 0,
3531 		.fc_nlinfo.nlh = NULL,
3532 		.fc_nlinfo.nl_net = net,
3533 	};
3534 
3535 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3536 	cfg.fc_dst = *prefix;
3537 	cfg.fc_gateway = *gwaddr;
3538 
3539 	/* We should treat it as a default route if prefix length is 0. */
3540 	if (!prefixlen)
3541 		cfg.fc_flags |= RTF_DEFAULT;
3542 
3543 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3544 
3545 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3546 }
3547 #endif
3548 
3549 struct fib6_info *rt6_get_dflt_router(struct net *net,
3550 				     const struct in6_addr *addr,
3551 				     struct net_device *dev)
3552 {
3553 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3554 	struct fib6_info *rt;
3555 	struct fib6_table *table;
3556 
3557 	table = fib6_get_table(net, tb_id);
3558 	if (!table)
3559 		return NULL;
3560 
3561 	rcu_read_lock();
3562 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3563 		struct fib6_nh *nh = &rt->fib6_nh;
3564 
3565 		if (dev == nh->fib_nh_dev &&
3566 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3567 		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
3568 			break;
3569 	}
3570 	if (rt && !fib6_info_hold_safe(rt))
3571 		rt = NULL;
3572 	rcu_read_unlock();
3573 	return rt;
3574 }
3575 
3576 struct fib6_info *rt6_add_dflt_router(struct net *net,
3577 				     const struct in6_addr *gwaddr,
3578 				     struct net_device *dev,
3579 				     unsigned int pref)
3580 {
3581 	struct fib6_config cfg = {
3582 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3583 		.fc_metric	= IP6_RT_PRIO_USER,
3584 		.fc_ifindex	= dev->ifindex,
3585 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3586 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3587 		.fc_protocol = RTPROT_RA,
3588 		.fc_type = RTN_UNICAST,
3589 		.fc_nlinfo.portid = 0,
3590 		.fc_nlinfo.nlh = NULL,
3591 		.fc_nlinfo.nl_net = net,
3592 	};
3593 
3594 	cfg.fc_gateway = *gwaddr;
3595 
3596 	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3597 		struct fib6_table *table;
3598 
3599 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3600 		if (table)
3601 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3602 	}
3603 
3604 	return rt6_get_dflt_router(net, gwaddr, dev);
3605 }
3606 
3607 static void __rt6_purge_dflt_routers(struct net *net,
3608 				     struct fib6_table *table)
3609 {
3610 	struct fib6_info *rt;
3611 
3612 restart:
3613 	rcu_read_lock();
3614 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3615 		struct net_device *dev = fib6_info_nh_dev(rt);
3616 		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3617 
3618 		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3619 		    (!idev || idev->cnf.accept_ra != 2) &&
3620 		    fib6_info_hold_safe(rt)) {
3621 			rcu_read_unlock();
3622 			ip6_del_rt(net, rt);
3623 			goto restart;
3624 		}
3625 	}
3626 	rcu_read_unlock();
3627 
3628 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3629 }
3630 
3631 void rt6_purge_dflt_routers(struct net *net)
3632 {
3633 	struct fib6_table *table;
3634 	struct hlist_head *head;
3635 	unsigned int h;
3636 
3637 	rcu_read_lock();
3638 
3639 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3640 		head = &net->ipv6.fib_table_hash[h];
3641 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3642 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3643 				__rt6_purge_dflt_routers(net, table);
3644 		}
3645 	}
3646 
3647 	rcu_read_unlock();
3648 }
3649 
3650 static void rtmsg_to_fib6_config(struct net *net,
3651 				 struct in6_rtmsg *rtmsg,
3652 				 struct fib6_config *cfg)
3653 {
3654 	*cfg = (struct fib6_config){
3655 		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3656 			 : RT6_TABLE_MAIN,
3657 		.fc_ifindex = rtmsg->rtmsg_ifindex,
3658 		.fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3659 		.fc_expires = rtmsg->rtmsg_info,
3660 		.fc_dst_len = rtmsg->rtmsg_dst_len,
3661 		.fc_src_len = rtmsg->rtmsg_src_len,
3662 		.fc_flags = rtmsg->rtmsg_flags,
3663 		.fc_type = rtmsg->rtmsg_type,
3664 
3665 		.fc_nlinfo.nl_net = net,
3666 
3667 		.fc_dst = rtmsg->rtmsg_dst,
3668 		.fc_src = rtmsg->rtmsg_src,
3669 		.fc_gateway = rtmsg->rtmsg_gateway,
3670 	};
3671 }
3672 
3673 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3674 {
3675 	struct fib6_config cfg;
3676 	struct in6_rtmsg rtmsg;
3677 	int err;
3678 
3679 	switch (cmd) {
3680 	case SIOCADDRT:		/* Add a route */
3681 	case SIOCDELRT:		/* Delete a route */
3682 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3683 			return -EPERM;
3684 		err = copy_from_user(&rtmsg, arg,
3685 				     sizeof(struct in6_rtmsg));
3686 		if (err)
3687 			return -EFAULT;
3688 
3689 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3690 
3691 		rtnl_lock();
3692 		switch (cmd) {
3693 		case SIOCADDRT:
3694 			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3695 			break;
3696 		case SIOCDELRT:
3697 			err = ip6_route_del(&cfg, NULL);
3698 			break;
3699 		default:
3700 			err = -EINVAL;
3701 		}
3702 		rtnl_unlock();
3703 
3704 		return err;
3705 	}
3706 
3707 	return -EINVAL;
3708 }
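
/* Userspace sketch of the legacy ioctl interface handled above
 * (illustrative; error handling omitted, interface name assumed):
 *
 *	struct in6_rtmsg rt;
 *	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *
 *	memset(&rt, 0, sizeof(rt));
 *	inet_pton(AF_INET6, "2001:db8::", &rt.rtmsg_dst);
 *	rt.rtmsg_dst_len = 64;
 *	inet_pton(AF_INET6, "fe80::1", &rt.rtmsg_gateway);
 *	rt.rtmsg_flags = RTF_UP | RTF_GATEWAY;
 *	rt.rtmsg_metric = 1;
 *	rt.rtmsg_ifindex = if_nametoindex("eth0");
 *	ioctl(fd, SIOCADDRT, &rt);	- requires CAP_NET_ADMIN
 */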
3709 
3710 /*
3711  *	Drop the packet on the floor
3712  */
3713 
3714 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3715 {
3716 	int type;
3717 	struct dst_entry *dst = skb_dst(skb);
3718 	switch (ipstats_mib_noroutes) {
3719 	case IPSTATS_MIB_INNOROUTES:
3720 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3721 		if (type == IPV6_ADDR_ANY) {
3722 			IP6_INC_STATS(dev_net(dst->dev),
3723 				      __in6_dev_get_safely(skb->dev),
3724 				      IPSTATS_MIB_INADDRERRORS);
3725 			break;
3726 		}
3727 		/* FALLTHROUGH */
3728 	case IPSTATS_MIB_OUTNOROUTES:
3729 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3730 			      ipstats_mib_noroutes);
3731 		break;
3732 	}
3733 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3734 	kfree_skb(skb);
3735 	return 0;
3736 }
3737 
3738 static int ip6_pkt_discard(struct sk_buff *skb)
3739 {
3740 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3741 }
3742 
3743 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3744 {
3745 	skb->dev = skb_dst(skb)->dev;
3746 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3747 }
3748 
3749 static int ip6_pkt_prohibit(struct sk_buff *skb)
3750 {
3751 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3752 }
3753 
3754 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3755 {
3756 	skb->dev = skb_dst(skb)->dev;
3757 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3758 }
3759 
3760 /*
3761  *	Allocate a dst for a local (unicast / anycast) address.
3762  */
3763 
3764 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3765 				     struct inet6_dev *idev,
3766 				     const struct in6_addr *addr,
3767 				     bool anycast, gfp_t gfp_flags)
3768 {
3769 	struct fib6_config cfg = {
3770 		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3771 		.fc_ifindex = idev->dev->ifindex,
3772 		.fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3773 		.fc_dst = *addr,
3774 		.fc_dst_len = 128,
3775 		.fc_protocol = RTPROT_KERNEL,
3776 		.fc_nlinfo.nl_net = net,
3777 		.fc_ignore_dev_down = true,
3778 	};
3779 
3780 	if (anycast) {
3781 		cfg.fc_type = RTN_ANYCAST;
3782 		cfg.fc_flags |= RTF_ANYCAST;
3783 	} else {
3784 		cfg.fc_type = RTN_LOCAL;
3785 		cfg.fc_flags |= RTF_LOCAL;
3786 	}
3787 
3788 	return ip6_route_info_create(&cfg, gfp_flags, NULL);
3789 }
3790 
3791 /* remove deleted ip from prefsrc entries */
3792 struct arg_dev_net_ip {
3793 	struct net_device *dev;
3794 	struct net *net;
3795 	struct in6_addr *addr;
3796 };
3797 
3798 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3799 {
3800 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3801 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3802 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3803 
3804 	if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3805 	    rt != net->ipv6.fib6_null_entry &&
3806 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3807 		spin_lock_bh(&rt6_exception_lock);
3808 		/* remove prefsrc entry */
3809 		rt->fib6_prefsrc.plen = 0;
3810 		spin_unlock_bh(&rt6_exception_lock);
3811 	}
3812 	return 0;
3813 }
3814 
3815 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3816 {
3817 	struct net *net = dev_net(ifp->idev->dev);
3818 	struct arg_dev_net_ip adni = {
3819 		.dev = ifp->idev->dev,
3820 		.net = net,
3821 		.addr = &ifp->addr,
3822 	};
3823 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3824 }
3825 
3826 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)
3827 
3828 /* Remove routers and update dst entries when a gateway turns into a host. */
3829 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3830 {
3831 	struct in6_addr *gateway = (struct in6_addr *)arg;
3832 
3833 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3834 	    rt->fib6_nh.fib_nh_gw_family &&
3835 	    ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3836 		return -1;
3837 	}
3838 
3839 	/* Further clean up cached routes in exception table.
3840 	 * This is needed because a cached route may have a different
3841 	 * gateway than its 'parent' in the case of an ip redirect.
3842 	 */
3843 	rt6_exceptions_clean_tohost(rt, gateway);
3844 
3845 	return 0;
3846 }
3847 
3848 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3849 {
3850 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3851 }
3852 
3853 struct arg_netdev_event {
3854 	const struct net_device *dev;
3855 	union {
3856 		unsigned int nh_flags;
3857 		unsigned long event;
3858 	};
3859 };
3860 
3861 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3862 {
3863 	struct fib6_info *iter;
3864 	struct fib6_node *fn;
3865 
3866 	fn = rcu_dereference_protected(rt->fib6_node,
3867 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3868 	iter = rcu_dereference_protected(fn->leaf,
3869 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3870 	while (iter) {
3871 		if (iter->fib6_metric == rt->fib6_metric &&
3872 		    rt6_qualify_for_ecmp(iter))
3873 			return iter;
3874 		iter = rcu_dereference_protected(iter->fib6_next,
3875 				lockdep_is_held(&rt->fib6_table->tb6_lock));
3876 	}
3877 
3878 	return NULL;
3879 }
3880 
3881 static bool rt6_is_dead(const struct fib6_info *rt)
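/* A nexthop is considered dead if it is marked RTNH_F_DEAD, or if it is
 * marked RTNH_F_LINKDOWN and its device asked for such routes to be
 * ignored via the ignore_routes_with_linkdown sysctl (ip6_ignore_linkdown()).
 */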
3882 {
3883 	if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3884 	    (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3885 	     ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3886 		return true;
3887 
3888 	return false;
3889 }
3890 
3891 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3892 {
3893 	struct fib6_info *iter;
3894 	int total = 0;
3895 
3896 	if (!rt6_is_dead(rt))
3897 		total += rt->fib6_nh.fib_nh_weight;
3898 
3899 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3900 		if (!rt6_is_dead(iter))
3901 			total += iter->fib6_nh.fib_nh_weight;
3902 	}
3903 
3904 	return total;
3905 }
3906 
3907 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3908 {
3909 	int upper_bound = -1;
3910 
3911 	if (!rt6_is_dead(rt)) {
3912 		*weight += rt->fib6_nh.fib_nh_weight;
3913 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3914 						    total) - 1;
3915 	}
3916 	atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3917 }
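
/* Worked example for the upper-bound math above (illustrative weights):
 * two live nexthops with weights 1 and 3 give total = 4.  The running
 * *weight is 1, then 4, so the bounds come out as
 *
 *	DIV_ROUND_CLOSEST_ULL(1ULL << 31, 4) - 1 = 0x1fffffff
 *	DIV_ROUND_CLOSEST_ULL(4ULL << 31, 4) - 1 = 0x7fffffff
 *
 * A flow whose 31-bit multipath hash is <= a nexthop's bound selects that
 * nexthop, i.e. roughly 25% / 75% of flows here; dead nexthops keep the
 * -1 bound and are never selected.
 */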
3918 
3919 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3920 {
3921 	struct fib6_info *iter;
3922 	int weight = 0;
3923 
3924 	rt6_upper_bound_set(rt, &weight, total);
3925 
3926 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3927 		rt6_upper_bound_set(iter, &weight, total);
3928 }
3929 
3930 void rt6_multipath_rebalance(struct fib6_info *rt)
3931 {
3932 	struct fib6_info *first;
3933 	int total;
3934 
3935 	/* If the entire multipath route was marked for flushing, there
3936 	 * is no need to rebalance upon the removal of each individual
3937 	 * sibling route.
3938 	 */
3939 	if (!rt->fib6_nsiblings || rt->should_flush)
3940 		return;
3941 
3942 	/* During lookup, routes are evaluated in order, so we need to
3943 	 * make sure upper bounds are assigned from the first sibling
3944 	 * onwards.
3945 	 */
3946 	first = rt6_multipath_first_sibling(rt);
3947 	if (WARN_ON_ONCE(!first))
3948 		return;
3949 
3950 	total = rt6_multipath_total_weight(first);
3951 	rt6_multipath_upper_bound_set(first, total);
3952 }
3953 
3954 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3955 {
3956 	const struct arg_netdev_event *arg = p_arg;
3957 	struct net *net = dev_net(arg->dev);
3958 
3959 	if (rt != net->ipv6.fib6_null_entry &&
3960 	    rt->fib6_nh.fib_nh_dev == arg->dev) {
3961 		rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
3962 		fib6_update_sernum_upto_root(net, rt);
3963 		rt6_multipath_rebalance(rt);
3964 	}
3965 
3966 	return 0;
3967 }
3968 
3969 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3970 {
3971 	struct arg_netdev_event arg = {
3972 		.dev = dev,
3973 		{
3974 			.nh_flags = nh_flags,
3975 		},
3976 	};
3977 
3978 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3979 		arg.nh_flags |= RTNH_F_LINKDOWN;
3980 
3981 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3982 }
3983 
3984 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3985 				   const struct net_device *dev)
3986 {
3987 	struct fib6_info *iter;
3988 
3989 	if (rt->fib6_nh.fib_nh_dev == dev)
3990 		return true;
3991 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3992 		if (iter->fib6_nh.fib_nh_dev == dev)
3993 			return true;
3994 
3995 	return false;
3996 }
3997 
3998 static void rt6_multipath_flush(struct fib6_info *rt)
3999 {
4000 	struct fib6_info *iter;
4001 
4002 	rt->should_flush = 1;
4003 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4004 		iter->should_flush = 1;
4005 }
4006 
4007 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4008 					     const struct net_device *down_dev)
4009 {
4010 	struct fib6_info *iter;
4011 	unsigned int dead = 0;
4012 
4013 	if (rt->fib6_nh.fib_nh_dev == down_dev ||
4014 	    rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4015 		dead++;
4016 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4017 		if (iter->fib6_nh.fib_nh_dev == down_dev ||
4018 		    iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4019 			dead++;
4020 
4021 	return dead;
4022 }
4023 
4024 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4025 				       const struct net_device *dev,
4026 				       unsigned int nh_flags)
4027 {
4028 	struct fib6_info *iter;
4029 
4030 	if (rt->fib6_nh.fib_nh_dev == dev)
4031 		rt->fib6_nh.fib_nh_flags |= nh_flags;
4032 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4033 		if (iter->fib6_nh.fib_nh_dev == dev)
4034 			iter->fib6_nh.fib_nh_flags |= nh_flags;
4035 }
4036 
4037 /* Called with the write lock held for the table containing rt. */
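/* Return values, as consumed by fib6_clean_node() at the time of writing:
 * 0 keeps the route, -1 deletes it, and -2 tells the walker that a whole
 * multipath group was handled here, so the remaining siblings are skipped.
 */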
4038 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4039 {
4040 	const struct arg_netdev_event *arg = p_arg;
4041 	const struct net_device *dev = arg->dev;
4042 	struct net *net = dev_net(dev);
4043 
4044 	if (rt == net->ipv6.fib6_null_entry)
4045 		return 0;
4046 
4047 	switch (arg->event) {
4048 	case NETDEV_UNREGISTER:
4049 		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4050 	case NETDEV_DOWN:
4051 		if (rt->should_flush)
4052 			return -1;
4053 		if (!rt->fib6_nsiblings)
4054 			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4055 		if (rt6_multipath_uses_dev(rt, dev)) {
4056 			unsigned int count;
4057 
4058 			count = rt6_multipath_dead_count(rt, dev);
4059 			if (rt->fib6_nsiblings + 1 == count) {
4060 				rt6_multipath_flush(rt);
4061 				return -1;
4062 			}
4063 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4064 						   RTNH_F_LINKDOWN);
4065 			fib6_update_sernum(net, rt);
4066 			rt6_multipath_rebalance(rt);
4067 		}
4068 		return -2;
4069 	case NETDEV_CHANGE:
4070 		if (rt->fib6_nh.fib_nh_dev != dev ||
4071 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4072 			break;
4073 		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4074 		rt6_multipath_rebalance(rt);
4075 		break;
4076 	}
4077 
4078 	return 0;
4079 }
4080 
4081 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4082 {
4083 	struct arg_netdev_event arg = {
4084 		.dev = dev,
4085 		{
4086 			.event = event,
4087 		},
4088 	};
4089 	struct net *net = dev_net(dev);
4090 
4091 	if (net->ipv6.sysctl.skip_notify_on_dev_down)
4092 		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4093 	else
4094 		fib6_clean_all(net, fib6_ifdown, &arg);
4095 }
4096 
4097 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4098 {
4099 	rt6_sync_down_dev(dev, event);
4100 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
4101 	neigh_ifdown(&nd_tbl, dev);
4102 }
4103 
4104 struct rt6_mtu_change_arg {
4105 	struct net_device *dev;
4106 	unsigned int mtu;
4107 };
4108 
4109 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4110 {
4111 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4112 	struct inet6_dev *idev;
4113 
4114 	/* In IPv6, PMTU discovery is not optional,
4115 	   so the RTAX_MTU metric lock cannot disable it.
4116 	   We still use this lock to block changes
4117 	   caused by addrconf/ndisc.
4118 	*/
4119 
4120 	idev = __in6_dev_get(arg->dev);
4121 	if (!idev)
4122 		return 0;
4123 
4124 	/* For an administrative MTU increase, there is no way to discover
4125 	   an IPv6 PMTU increase, so the PMTU must be updated here.
4126 	   Since RFC 1981 doesn't cover administrative MTU increases,
4127 	   updating the PMTU on an increase is a MUST (e.g. for jumbo frames).
4128 	 */
4129 	if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4130 	    !fib6_metric_locked(rt, RTAX_MTU)) {
4131 		u32 mtu = rt->fib6_pmtu;
4132 
4133 		if (mtu >= arg->mtu ||
4134 		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4135 			fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4136 
4137 		spin_lock_bh(&rt6_exception_lock);
4138 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4139 		spin_unlock_bh(&rt6_exception_lock);
4140 	}
4141 	return 0;
4142 }
4143 
4144 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4145 {
4146 	struct rt6_mtu_change_arg arg = {
4147 		.dev = dev,
4148 		.mtu = mtu,
4149 	};
4150 
4151 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4152 }
4153 
4154 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4155 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4156 	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
4157 	[RTA_OIF]               = { .type = NLA_U32 },
4158 	[RTA_IIF]		= { .type = NLA_U32 },
4159 	[RTA_PRIORITY]          = { .type = NLA_U32 },
4160 	[RTA_METRICS]           = { .type = NLA_NESTED },
4161 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
4162 	[RTA_PREF]              = { .type = NLA_U8 },
4163 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
4164 	[RTA_ENCAP]		= { .type = NLA_NESTED },
4165 	[RTA_EXPIRES]		= { .type = NLA_U32 },
4166 	[RTA_UID]		= { .type = NLA_U32 },
4167 	[RTA_MARK]		= { .type = NLA_U32 },
4168 	[RTA_TABLE]		= { .type = NLA_U32 },
4169 	[RTA_IP_PROTO]		= { .type = NLA_U8 },
4170 	[RTA_SPORT]		= { .type = NLA_U16 },
4171 	[RTA_DPORT]		= { .type = NLA_U16 },
4172 };
4173 
4174 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4175 			      struct fib6_config *cfg,
4176 			      struct netlink_ext_ack *extack)
4177 {
4178 	struct rtmsg *rtm;
4179 	struct nlattr *tb[RTA_MAX+1];
4180 	unsigned int pref;
4181 	int err;
4182 
4183 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4184 			  extack);
4185 	if (err < 0)
4186 		goto errout;
4187 
4188 	err = -EINVAL;
4189 	rtm = nlmsg_data(nlh);
4190 
4191 	*cfg = (struct fib6_config){
4192 		.fc_table = rtm->rtm_table,
4193 		.fc_dst_len = rtm->rtm_dst_len,
4194 		.fc_src_len = rtm->rtm_src_len,
4195 		.fc_flags = RTF_UP,
4196 		.fc_protocol = rtm->rtm_protocol,
4197 		.fc_type = rtm->rtm_type,
4198 
4199 		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
4200 		.fc_nlinfo.nlh = nlh,
4201 		.fc_nlinfo.nl_net = sock_net(skb->sk),
4202 	};
4203 
4204 	if (rtm->rtm_type == RTN_UNREACHABLE ||
4205 	    rtm->rtm_type == RTN_BLACKHOLE ||
4206 	    rtm->rtm_type == RTN_PROHIBIT ||
4207 	    rtm->rtm_type == RTN_THROW)
4208 		cfg->fc_flags |= RTF_REJECT;
4209 
4210 	if (rtm->rtm_type == RTN_LOCAL)
4211 		cfg->fc_flags |= RTF_LOCAL;
4212 
4213 	if (rtm->rtm_flags & RTM_F_CLONED)
4214 		cfg->fc_flags |= RTF_CACHE;
4215 
4216 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4217 
4218 	if (tb[RTA_GATEWAY]) {
4219 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4220 		cfg->fc_flags |= RTF_GATEWAY;
4221 	}
4222 	if (tb[RTA_VIA]) {
4223 		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4224 		goto errout;
4225 	}
4226 
4227 	if (tb[RTA_DST]) {
4228 		int plen = (rtm->rtm_dst_len + 7) >> 3;
4229 
4230 		if (nla_len(tb[RTA_DST]) < plen)
4231 			goto errout;
4232 
4233 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4234 	}
4235 
4236 	if (tb[RTA_SRC]) {
4237 		int plen = (rtm->rtm_src_len + 7) >> 3;
4238 
4239 		if (nla_len(tb[RTA_SRC]) < plen)
4240 			goto errout;
4241 
4242 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4243 	}
4244 
4245 	if (tb[RTA_PREFSRC])
4246 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4247 
4248 	if (tb[RTA_OIF])
4249 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4250 
4251 	if (tb[RTA_PRIORITY])
4252 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4253 
4254 	if (tb[RTA_METRICS]) {
4255 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4256 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4257 	}
4258 
4259 	if (tb[RTA_TABLE])
4260 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4261 
4262 	if (tb[RTA_MULTIPATH]) {
4263 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4264 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4265 
4266 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4267 						     cfg->fc_mp_len, extack);
4268 		if (err < 0)
4269 			goto errout;
4270 	}
4271 
4272 	if (tb[RTA_PREF]) {
4273 		pref = nla_get_u8(tb[RTA_PREF]);
4274 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
4275 		    pref != ICMPV6_ROUTER_PREF_HIGH)
4276 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
4277 		cfg->fc_flags |= RTF_PREF(pref);
4278 	}
4279 
4280 	if (tb[RTA_ENCAP])
4281 		cfg->fc_encap = tb[RTA_ENCAP];
4282 
4283 	if (tb[RTA_ENCAP_TYPE]) {
4284 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4285 
4286 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4287 		if (err < 0)
4288 			goto errout;
4289 	}
4290 
4291 	if (tb[RTA_EXPIRES]) {
4292 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4293 
4294 		if (addrconf_finite_timeout(timeout)) {
4295 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4296 			cfg->fc_flags |= RTF_EXPIRES;
4297 		}
4298 	}
4299 
4300 	err = 0;
4301 errout:
4302 	return err;
4303 }
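
/* Illustrative mapping (hypothetical values): a request generated by
 *
 *	# ip -6 route add 2001:db8::/64 via fe80::1 dev eth0 metric 1024
 *
 * arrives as RTM_NEWROUTE and leaves the parser above as roughly
 * fc_dst = 2001:db8::, fc_dst_len = 64, fc_gateway = fe80::1 (with
 * RTF_GATEWAY set), fc_ifindex = <ifindex of eth0> and fc_metric = 1024.
 */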
4304 
4305 struct rt6_nh {
4306 	struct fib6_info *fib6_info;
4307 	struct fib6_config r_cfg;
4308 	struct list_head next;
4309 };
4310 
4311 static int ip6_route_info_append(struct net *net,
4312 				 struct list_head *rt6_nh_list,
4313 				 struct fib6_info *rt,
4314 				 struct fib6_config *r_cfg)
4315 {
4316 	struct rt6_nh *nh;
4317 	int err = -EEXIST;
4318 
4319 	list_for_each_entry(nh, rt6_nh_list, next) {
4320 		/* check if fib6_info already exists */
4321 		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4322 			return err;
4323 	}
4324 
4325 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4326 	if (!nh)
4327 		return -ENOMEM;
4328 	nh->fib6_info = rt;
4329 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4330 	list_add_tail(&nh->next, rt6_nh_list);
4331 
4332 	return 0;
4333 }
4334 
4335 static void ip6_route_mpath_notify(struct fib6_info *rt,
4336 				   struct fib6_info *rt_last,
4337 				   struct nl_info *info,
4338 				   __u16 nlflags)
4339 {
4340 	/* If this is an APPEND route, then rt points to the first route
4341 	 * inserted and rt_last points to the last route inserted. Userspace
4342 	 * wants a consistent dump of the route which starts at the first
4343 	 * nexthop. Since sibling routes are always added at the end of
4344 	 * the list, find the first sibling of the last route appended.
4345 	 */
4346 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4347 		rt = list_first_entry(&rt_last->fib6_siblings,
4348 				      struct fib6_info,
4349 				      fib6_siblings);
4350 	}
4351 
4352 	if (rt)
4353 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4354 }
4355 
4356 static int ip6_route_multipath_add(struct fib6_config *cfg,
4357 				   struct netlink_ext_ack *extack)
4358 {
4359 	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4360 	struct nl_info *info = &cfg->fc_nlinfo;
4361 	struct fib6_config r_cfg;
4362 	struct rtnexthop *rtnh;
4363 	struct fib6_info *rt;
4364 	struct rt6_nh *err_nh;
4365 	struct rt6_nh *nh, *nh_safe;
4366 	__u16 nlflags;
4367 	int remaining;
4368 	int attrlen;
4369 	int err = 1;
4370 	int nhn = 0;
4371 	int replace = (cfg->fc_nlinfo.nlh &&
4372 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4373 	LIST_HEAD(rt6_nh_list);
4374 
4375 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4376 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4377 		nlflags |= NLM_F_APPEND;
4378 
4379 	remaining = cfg->fc_mp_len;
4380 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4381 
4382 	/* Parse a multipath entry and build a list (rt6_nh_list) of
4383 	 * fib6_info structs, one per nexthop.
4384 	 */
4385 	while (rtnh_ok(rtnh, remaining)) {
4386 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4387 		if (rtnh->rtnh_ifindex)
4388 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4389 
4390 		attrlen = rtnh_attrlen(rtnh);
4391 		if (attrlen > 0) {
4392 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4393 
4394 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4395 			if (nla) {
4396 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4397 				r_cfg.fc_flags |= RTF_GATEWAY;
4398 			}
4399 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4400 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4401 			if (nla)
4402 				r_cfg.fc_encap_type = nla_get_u16(nla);
4403 		}
4404 
4405 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4406 		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4407 		if (IS_ERR(rt)) {
4408 			err = PTR_ERR(rt);
4409 			rt = NULL;
4410 			goto cleanup;
4411 		}
4412 		if (!rt6_qualify_for_ecmp(rt)) {
4413 			err = -EINVAL;
4414 			NL_SET_ERR_MSG(extack,
4415 				       "Device-only routes cannot be added for IPv6 using the multipath API.");
4416 			fib6_info_release(rt);
4417 			goto cleanup;
4418 		}
4419 
4420 		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4421 
4422 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4423 					    rt, &r_cfg);
4424 		if (err) {
4425 			fib6_info_release(rt);
4426 			goto cleanup;
4427 		}
4428 
4429 		rtnh = rtnh_next(rtnh, &remaining);
4430 	}
4431 
4432 	/* For add and replace, send one notification with all nexthops.
4433 	 * Skip the notification in fib6_add_rt2node and send one with
4434 	 * the full route when done.
4435 	 */
4436 	info->skip_notify = 1;
4437 
4438 	err_nh = NULL;
4439 	list_for_each_entry(nh, &rt6_nh_list, next) {
4440 		err = __ip6_ins_rt(nh->fib6_info, info, extack);
4441 		fib6_info_release(nh->fib6_info);
4442 
4443 		if (!err) {
4444 			/* save reference to last route successfully inserted */
4445 			rt_last = nh->fib6_info;
4446 
4447 			/* save reference to first route for notification */
4448 			if (!rt_notif)
4449 				rt_notif = nh->fib6_info;
4450 		}
4451 
4452 		/* nh->fib6_info is used or freed at this point, reset to NULL */
4453 		nh->fib6_info = NULL;
4454 		if (err) {
4455 			if (replace && nhn)
4456 				NL_SET_ERR_MSG_MOD(extack,
4457 						   "multipath route replace failed (check consistency of installed routes)");
4458 			err_nh = nh;
4459 			goto add_errout;
4460 		}
4461 
4462 		/* Because each route is added like a single route, we remove
4463 		 * these flags after the first nexthop. If there is a collision,
4464 		 * we have already failed to add the first nexthop:
4465 		 * fib6_add_rt2node() has rejected it. When replacing, the old
4466 		 * nexthops have been replaced by the first new one, and the
4467 		 * remaining nexthops should be appended to it.
4468 		 */
4469 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4470 						     NLM_F_REPLACE);
4471 		nhn++;
4472 	}
4473 
4474 	/* success ... tell user about new route */
4475 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4476 	goto cleanup;
4477 
4478 add_errout:
4479 	/* Send a notification for the routes that were added so that
4480 	 * the delete notifications sent by ip6_route_del are
4481 	 * coherent.
4482 	 */
4483 	if (rt_notif)
4484 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4485 
4486 	/* Delete routes that were already added */
4487 	list_for_each_entry(nh, &rt6_nh_list, next) {
4488 		if (err_nh == nh)
4489 			break;
4490 		ip6_route_del(&nh->r_cfg, extack);
4491 	}
4492 
4493 cleanup:
4494 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4495 		if (nh->fib6_info)
4496 			fib6_info_release(nh->fib6_info);
4497 		list_del(&nh->next);
4498 		kfree(nh);
4499 	}
4500 
4501 	return err;
4502 }
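
/* Illustrative trigger (hypothetical interfaces): an ECMP route such as
 *
 *	# ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 weight 1 \
 *		nexthop via fe80::2 dev eth1 weight 3
 *
 * reaches the function above with two rtnexthop entries; note that the
 * netlink encoding carries rtnh_hops = weight - 1, undone by the + 1 when
 * fib_nh_weight is set.
 */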
4503 
4504 static int ip6_route_multipath_del(struct fib6_config *cfg,
4505 				   struct netlink_ext_ack *extack)
4506 {
4507 	struct fib6_config r_cfg;
4508 	struct rtnexthop *rtnh;
4509 	int remaining;
4510 	int attrlen;
4511 	int err = 1, last_err = 0;
4512 
4513 	remaining = cfg->fc_mp_len;
4514 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4515 
4516 	/* Parse a multipath entry. */
4517 	while (rtnh_ok(rtnh, remaining)) {
4518 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4519 		if (rtnh->rtnh_ifindex)
4520 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4521 
4522 		attrlen = rtnh_attrlen(rtnh);
4523 		if (attrlen > 0) {
4524 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4525 
4526 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4527 			if (nla) {
4528 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4529 				r_cfg.fc_flags |= RTF_GATEWAY;
4530 			}
4531 		}
4532 		err = ip6_route_del(&r_cfg, extack);
4533 		if (err)
4534 			last_err = err;
4535 
4536 		rtnh = rtnh_next(rtnh, &remaining);
4537 	}
4538 
4539 	return last_err;
4540 }
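
/* Note that nexthops are deleted one at a time: if some deletions fail,
 * the successful ones still take effect and only the last error is
 * returned to the caller.
 */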
4541 
4542 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4543 			      struct netlink_ext_ack *extack)
4544 {
4545 	struct fib6_config cfg;
4546 	int err;
4547 
4548 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4549 	if (err < 0)
4550 		return err;
4551 
4552 	if (cfg.fc_mp) {
4553 		return ip6_route_multipath_del(&cfg, extack);
4554 	} else {
4555 		cfg.fc_delete_all_nh = 1;
4556 		return ip6_route_del(&cfg, extack);
4557 	}
4558 }
4559 
4560 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4561 			      struct netlink_ext_ack *extack)
4562 {
4563 	struct fib6_config cfg;
4564 	int err;
4565 
4566 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4567 	if (err < 0)
4568 		return err;
4569 
4570 	if (cfg.fc_metric == 0)
4571 		cfg.fc_metric = IP6_RT_PRIO_USER;
4572 
4573 	if (cfg.fc_mp)
4574 		return ip6_route_multipath_add(&cfg, extack);
4575 	else
4576 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
4577 }
4578 
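/* Upper-bound estimate of the netlink message size needed to dump a route;
 * inet6_rt_notify() sizes its skb with this before calling rt6_fill_node(),
 * so an -EMSGSIZE from the latter indicates a bug here.
 */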
4579 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4580 {
4581 	int nexthop_len = 0;
4582 
4583 	if (rt->fib6_nsiblings) {
4584 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4585 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4586 			    + nla_total_size(16) /* RTA_GATEWAY */
4587 			    + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4588 
4589 		nexthop_len *= rt->fib6_nsiblings;
4590 	}
4591 
4592 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4593 	       + nla_total_size(16) /* RTA_SRC */
4594 	       + nla_total_size(16) /* RTA_DST */
4595 	       + nla_total_size(16) /* RTA_GATEWAY */
4596 	       + nla_total_size(16) /* RTA_PREFSRC */
4597 	       + nla_total_size(4) /* RTA_TABLE */
4598 	       + nla_total_size(4) /* RTA_IIF */
4599 	       + nla_total_size(4) /* RTA_OIF */
4600 	       + nla_total_size(4) /* RTA_PRIORITY */
4601 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4602 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4603 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4604 	       + nla_total_size(1) /* RTA_PREF */
4605 	       + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4606 	       + nexthop_len;
4607 }
4608 
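/* Fill one route message.  @dst is NULL when dumping a FIB entry (keys and
 * flags are then taken from @rt) and non-NULL for a cached clone (keys and
 * flags are then taken from the rt6_info itself).
 */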
4609 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4610 			 struct fib6_info *rt, struct dst_entry *dst,
4611 			 struct in6_addr *dest, struct in6_addr *src,
4612 			 int iif, int type, u32 portid, u32 seq,
4613 			 unsigned int flags)
4614 {
4615 	struct rt6_info *rt6 = (struct rt6_info *)dst;
4616 	struct rt6key *rt6_dst, *rt6_src;
4617 	u32 *pmetrics, table, rt6_flags;
4618 	struct nlmsghdr *nlh;
4619 	struct rtmsg *rtm;
4620 	long expires = 0;
4621 
4622 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4623 	if (!nlh)
4624 		return -EMSGSIZE;
4625 
4626 	if (rt6) {
4627 		rt6_dst = &rt6->rt6i_dst;
4628 		rt6_src = &rt6->rt6i_src;
4629 		rt6_flags = rt6->rt6i_flags;
4630 	} else {
4631 		rt6_dst = &rt->fib6_dst;
4632 		rt6_src = &rt->fib6_src;
4633 		rt6_flags = rt->fib6_flags;
4634 	}
4635 
4636 	rtm = nlmsg_data(nlh);
4637 	rtm->rtm_family = AF_INET6;
4638 	rtm->rtm_dst_len = rt6_dst->plen;
4639 	rtm->rtm_src_len = rt6_src->plen;
4640 	rtm->rtm_tos = 0;
4641 	if (rt->fib6_table)
4642 		table = rt->fib6_table->tb6_id;
4643 	else
4644 		table = RT6_TABLE_UNSPEC;
4645 	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4646 	if (nla_put_u32(skb, RTA_TABLE, table))
4647 		goto nla_put_failure;
4648 
4649 	rtm->rtm_type = rt->fib6_type;
4650 	rtm->rtm_flags = 0;
4651 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4652 	rtm->rtm_protocol = rt->fib6_protocol;
4653 
4654 	if (rt6_flags & RTF_CACHE)
4655 		rtm->rtm_flags |= RTM_F_CLONED;
4656 
4657 	if (dest) {
4658 		if (nla_put_in6_addr(skb, RTA_DST, dest))
4659 			goto nla_put_failure;
4660 		rtm->rtm_dst_len = 128;
4661 	} else if (rtm->rtm_dst_len)
4662 		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4663 			goto nla_put_failure;
4664 #ifdef CONFIG_IPV6_SUBTREES
4665 	if (src) {
4666 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4667 			goto nla_put_failure;
4668 		rtm->rtm_src_len = 128;
4669 	} else if (rtm->rtm_src_len &&
4670 		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4671 		goto nla_put_failure;
4672 #endif
4673 	if (iif) {
4674 #ifdef CONFIG_IPV6_MROUTE
4675 		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4676 			int err = ip6mr_get_route(net, skb, rtm, portid);
4677 
4678 			if (err == 0)
4679 				return 0;
4680 			if (err < 0)
4681 				goto nla_put_failure;
4682 		} else
4683 #endif
4684 			if (nla_put_u32(skb, RTA_IIF, iif))
4685 				goto nla_put_failure;
4686 	} else if (dest) {
4687 		struct in6_addr saddr_buf;
4688 		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4689 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4690 			goto nla_put_failure;
4691 	}
4692 
4693 	if (rt->fib6_prefsrc.plen) {
4694 		struct in6_addr saddr_buf;
4695 		saddr_buf = rt->fib6_prefsrc.addr;
4696 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4697 			goto nla_put_failure;
4698 	}
4699 
4700 	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4701 	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4702 		goto nla_put_failure;
4703 
4704 	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4705 		goto nla_put_failure;
4706 
4707 	/* For multipath routes, walk the siblings list and add
4708 	 * each as a nexthop within RTA_MULTIPATH.
4709 	 */
4710 	if (rt6) {
4711 		if (rt6_flags & RTF_GATEWAY &&
4712 		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4713 			goto nla_put_failure;
4714 
4715 		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4716 			goto nla_put_failure;
4717 	} else if (rt->fib6_nsiblings) {
4718 		struct fib6_info *sibling, *next_sibling;
4719 		struct nlattr *mp;
4720 
4721 		mp = nla_nest_start(skb, RTA_MULTIPATH);
4722 		if (!mp)
4723 			goto nla_put_failure;
4724 
4725 		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
4726 				    rt->fib6_nh.fib_nh_weight) < 0)
4727 			goto nla_put_failure;
4728 
4729 		list_for_each_entry_safe(sibling, next_sibling,
4730 					 &rt->fib6_siblings, fib6_siblings) {
4731 			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
4732 					    sibling->fib6_nh.fib_nh_weight) < 0)
4733 				goto nla_put_failure;
4734 		}
4735 
4736 		nla_nest_end(skb, mp);
4737 	} else {
4738 		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
4739 				     &rtm->rtm_flags, false) < 0)
4740 			goto nla_put_failure;
4741 	}
4742 
4743 	if (rt6_flags & RTF_EXPIRES) {
4744 		expires = dst ? dst->expires : rt->expires;
4745 		expires -= jiffies;
4746 	}
4747 
4748 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4749 		goto nla_put_failure;
4750 
4751 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4752 		goto nla_put_failure;
4753 
4755 	nlmsg_end(skb, nlh);
4756 	return 0;
4757 
4758 nla_put_failure:
4759 	nlmsg_cancel(skb, nlh);
4760 	return -EMSGSIZE;
4761 }
4762 
4763 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4764 			       const struct net_device *dev)
4765 {
4766 	if (f6i->fib6_nh.fib_nh_dev == dev)
4767 		return true;
4768 
4769 	if (f6i->fib6_nsiblings) {
4770 		struct fib6_info *sibling, *next_sibling;
4771 
4772 		list_for_each_entry_safe(sibling, next_sibling,
4773 					 &f6i->fib6_siblings, fib6_siblings) {
4774 			if (sibling->fib6_nh.fib_nh_dev == dev)
4775 				return true;
4776 		}
4777 	}
4778 
4779 	return false;
4780 }
4781 
4782 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4783 {
4784 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4785 	struct fib_dump_filter *filter = &arg->filter;
4786 	unsigned int flags = NLM_F_MULTI;
4787 	struct net *net = arg->net;
4788 
4789 	if (rt == net->ipv6.fib6_null_entry)
4790 		return 0;
4791 
4792 	if ((filter->flags & RTM_F_PREFIX) &&
4793 	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
4794 		/* success since this is not a prefix route */
4795 		return 1;
4796 	}
4797 	if (filter->filter_set) {
4798 		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4799 		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4800 		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4801 			return 1;
4802 		}
4803 		flags |= NLM_F_DUMP_FILTERED;
4804 	}
4805 
4806 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4807 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4808 			     arg->cb->nlh->nlmsg_seq, flags);
4809 }
4810 
4811 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4812 					const struct nlmsghdr *nlh,
4813 					struct nlattr **tb,
4814 					struct netlink_ext_ack *extack)
4815 {
4816 	struct rtmsg *rtm;
4817 	int i, err;
4818 
4819 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4820 		NL_SET_ERR_MSG_MOD(extack,
4821 				   "Invalid header for get route request");
4822 		return -EINVAL;
4823 	}
4824 
4825 	if (!netlink_strict_get_check(skb))
4826 		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
4827 				   rtm_ipv6_policy, extack);
4828 
4829 	rtm = nlmsg_data(nlh);
4830 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4831 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4832 	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4833 	    rtm->rtm_type) {
4834 		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4835 		return -EINVAL;
4836 	}
4837 	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4838 		NL_SET_ERR_MSG_MOD(extack,
4839 				   "Invalid flags for get route request");
4840 		return -EINVAL;
4841 	}
4842 
4843 	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4844 				 rtm_ipv6_policy, extack);
4845 	if (err)
4846 		return err;
4847 
4848 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4849 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4850 		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4851 		return -EINVAL;
4852 	}
4853 
4854 	for (i = 0; i <= RTA_MAX; i++) {
4855 		if (!tb[i])
4856 			continue;
4857 
4858 		switch (i) {
4859 		case RTA_SRC:
4860 		case RTA_DST:
4861 		case RTA_IIF:
4862 		case RTA_OIF:
4863 		case RTA_MARK:
4864 		case RTA_UID:
4865 		case RTA_SPORT:
4866 		case RTA_DPORT:
4867 		case RTA_IP_PROTO:
4868 			break;
4869 		default:
4870 			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4871 			return -EINVAL;
4872 		}
4873 	}
4874 
4875 	return 0;
4876 }
4877 
4878 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4879 			      struct netlink_ext_ack *extack)
4880 {
4881 	struct net *net = sock_net(in_skb->sk);
4882 	struct nlattr *tb[RTA_MAX+1];
4883 	int err, iif = 0, oif = 0;
4884 	struct fib6_info *from;
4885 	struct dst_entry *dst;
4886 	struct rt6_info *rt;
4887 	struct sk_buff *skb;
4888 	struct rtmsg *rtm;
4889 	struct flowi6 fl6 = {};
4890 	bool fibmatch;
4891 
4892 	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4893 	if (err < 0)
4894 		goto errout;
4895 
4896 	err = -EINVAL;
4897 	rtm = nlmsg_data(nlh);
4898 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4899 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4900 
4901 	if (tb[RTA_SRC]) {
4902 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4903 			goto errout;
4904 
4905 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4906 	}
4907 
4908 	if (tb[RTA_DST]) {
4909 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4910 			goto errout;
4911 
4912 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4913 	}
4914 
4915 	if (tb[RTA_IIF])
4916 		iif = nla_get_u32(tb[RTA_IIF]);
4917 
4918 	if (tb[RTA_OIF])
4919 		oif = nla_get_u32(tb[RTA_OIF]);
4920 
4921 	if (tb[RTA_MARK])
4922 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4923 
4924 	if (tb[RTA_UID])
4925 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4926 					   nla_get_u32(tb[RTA_UID]));
4927 	else
4928 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4929 
4930 	if (tb[RTA_SPORT])
4931 		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4932 
4933 	if (tb[RTA_DPORT])
4934 		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4935 
4936 	if (tb[RTA_IP_PROTO]) {
4937 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4938 						  &fl6.flowi6_proto, AF_INET6,
4939 						  extack);
4940 		if (err)
4941 			goto errout;
4942 	}
4943 
4944 	if (iif) {
4945 		struct net_device *dev;
4946 		int flags = 0;
4947 
4948 		rcu_read_lock();
4949 
4950 		dev = dev_get_by_index_rcu(net, iif);
4951 		if (!dev) {
4952 			rcu_read_unlock();
4953 			err = -ENODEV;
4954 			goto errout;
4955 		}
4956 
4957 		fl6.flowi6_iif = iif;
4958 
4959 		if (!ipv6_addr_any(&fl6.saddr))
4960 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4961 
4962 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4963 
4964 		rcu_read_unlock();
4965 	} else {
4966 		fl6.flowi6_oif = oif;
4967 
4968 		dst = ip6_route_output(net, NULL, &fl6);
4969 	}
4970 
4972 	rt = container_of(dst, struct rt6_info, dst);
4973 	if (rt->dst.error) {
4974 		err = rt->dst.error;
4975 		ip6_rt_put(rt);
4976 		goto errout;
4977 	}
4978 
4979 	if (rt == net->ipv6.ip6_null_entry) {
4980 		err = rt->dst.error;
4981 		ip6_rt_put(rt);
4982 		goto errout;
4983 	}
4984 
4985 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4986 	if (!skb) {
4987 		ip6_rt_put(rt);
4988 		err = -ENOBUFS;
4989 		goto errout;
4990 	}
4991 
4992 	skb_dst_set(skb, &rt->dst);
4993 
4994 	rcu_read_lock();
4995 	from = rcu_dereference(rt->from);
4996 
4997 	if (fibmatch)
4998 		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4999 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
5000 				    nlh->nlmsg_seq, 0);
5001 	else
5002 		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5003 				    &fl6.saddr, iif, RTM_NEWROUTE,
5004 				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
5005 				    0);
5006 	rcu_read_unlock();
5007 
5008 	if (err < 0) {
5009 		kfree_skb(skb);
5010 		goto errout;
5011 	}
5012 
5013 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5014 errout:
5015 	return err;
5016 }
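
/* Illustrative usage (hypothetical address): this is the handler behind
 *
 *	# ip -6 route get 2001:db8::1
 *	# ip -6 route get fibmatch 2001:db8::1
 *
 * where "fibmatch" sets RTM_F_FIB_MATCH and returns the matching FIB entry
 * rather than the resolved dst.
 */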
5017 
5018 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5019 		     unsigned int nlm_flags)
5020 {
5021 	struct sk_buff *skb;
5022 	struct net *net = info->nl_net;
5023 	u32 seq;
5024 	int err;
5025 
5026 	err = -ENOBUFS;
5027 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5028 
5029 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5030 	if (!skb)
5031 		goto errout;
5032 
5033 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5034 			    event, info->portid, seq, nlm_flags);
5035 	if (err < 0) {
5036 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5037 		WARN_ON(err == -EMSGSIZE);
5038 		kfree_skb(skb);
5039 		goto errout;
5040 	}
5041 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5042 		    info->nlh, gfp_any());
5043 	return;
5044 errout:
5045 	if (err < 0)
5046 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5047 }
5048 
5049 static int ip6_route_dev_notify(struct notifier_block *this,
5050 				unsigned long event, void *ptr)
5051 {
5052 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5053 	struct net *net = dev_net(dev);
5054 
5055 	if (!(dev->flags & IFF_LOOPBACK))
5056 		return NOTIFY_OK;
5057 
5058 	if (event == NETDEV_REGISTER) {
5059 		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5060 		net->ipv6.ip6_null_entry->dst.dev = dev;
5061 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5062 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5063 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5064 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5065 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5066 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5067 #endif
5068 	 } else if (event == NETDEV_UNREGISTER &&
5069 		    dev->reg_state != NETREG_UNREGISTERED) {
5070 		/* NETDEV_UNREGISTER can be fired multiple times by
5071 		 * netdev_wait_allrefs(). Make sure we only call this once.
5072 		 */
5073 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5074 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5075 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5076 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5077 #endif
5078 	}
5079 
5080 	return NOTIFY_OK;
5081 }
5082 
5083 /*
5084  *	/proc
5085  */
5086 
5087 #ifdef CONFIG_PROC_FS
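/* /proc/net/rt6_stats: seven hex fields, in order: fib_nodes,
 * fib_route_nodes, fib_rt_alloc, fib_rt_entries, fib_rt_cache, the
 * current dst entry count and fib_discarded_routes.
 */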
5088 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5089 {
5090 	struct net *net = (struct net *)seq->private;
5091 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5092 		   net->ipv6.rt6_stats->fib_nodes,
5093 		   net->ipv6.rt6_stats->fib_route_nodes,
5094 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5095 		   net->ipv6.rt6_stats->fib_rt_entries,
5096 		   net->ipv6.rt6_stats->fib_rt_cache,
5097 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5098 		   net->ipv6.rt6_stats->fib_discarded_routes);
5099 
5100 	return 0;
5101 }
5102 #endif	/* CONFIG_PROC_FS */
5103 
5104 #ifdef CONFIG_SYSCTL
5105 
5106 static
5107 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5108 			      void __user *buffer, size_t *lenp, loff_t *ppos)
5109 {
5110 	struct net *net;
5111 	int delay;
5112 	int ret;
5113 	if (!write)
5114 		return -EINVAL;
5115 
5116 	net = (struct net *)ctl->extra1;
5117 	delay = net->ipv6.sysctl.flush_delay;
5118 	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5119 	if (ret)
5120 		return ret;
5121 
5122 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5123 	return 0;
5124 }
5125 
5126 static int zero;
5127 static int one = 1;
5128 
5129 static struct ctl_table ipv6_route_table_template[] = {
5130 	{
5131 		.procname	=	"flush",
5132 		.data		=	&init_net.ipv6.sysctl.flush_delay,
5133 		.maxlen		=	sizeof(int),
5134 		.mode		=	0200,
5135 		.proc_handler	=	ipv6_sysctl_rtcache_flush
5136 	},
5137 	{
5138 		.procname	=	"gc_thresh",
5139 		.data		=	&ip6_dst_ops_template.gc_thresh,
5140 		.maxlen		=	sizeof(int),
5141 		.mode		=	0644,
5142 		.proc_handler	=	proc_dointvec,
5143 	},
5144 	{
5145 		.procname	=	"max_size",
5146 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
5147 		.maxlen		=	sizeof(int),
5148 		.mode		=	0644,
5149 		.proc_handler	=	proc_dointvec,
5150 	},
5151 	{
5152 		.procname	=	"gc_min_interval",
5153 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5154 		.maxlen		=	sizeof(int),
5155 		.mode		=	0644,
5156 		.proc_handler	=	proc_dointvec_jiffies,
5157 	},
5158 	{
5159 		.procname	=	"gc_timeout",
5160 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5161 		.maxlen		=	sizeof(int),
5162 		.mode		=	0644,
5163 		.proc_handler	=	proc_dointvec_jiffies,
5164 	},
5165 	{
5166 		.procname	=	"gc_interval",
5167 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
5168 		.maxlen		=	sizeof(int),
5169 		.mode		=	0644,
5170 		.proc_handler	=	proc_dointvec_jiffies,
5171 	},
5172 	{
5173 		.procname	=	"gc_elasticity",
5174 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5175 		.maxlen		=	sizeof(int),
5176 		.mode		=	0644,
5177 		.proc_handler	=	proc_dointvec,
5178 	},
5179 	{
5180 		.procname	=	"mtu_expires",
5181 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5182 		.maxlen		=	sizeof(int),
5183 		.mode		=	0644,
5184 		.proc_handler	=	proc_dointvec_jiffies,
5185 	},
5186 	{
5187 		.procname	=	"min_adv_mss",
5188 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
5189 		.maxlen		=	sizeof(int),
5190 		.mode		=	0644,
5191 		.proc_handler	=	proc_dointvec,
5192 	},
5193 	{
5194 		.procname	=	"gc_min_interval_ms",
5195 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5196 		.maxlen		=	sizeof(int),
5197 		.mode		=	0644,
5198 		.proc_handler	=	proc_dointvec_ms_jiffies,
5199 	},
5200 	{
5201 		.procname	=	"skip_notify_on_dev_down",
5202 		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
5203 		.maxlen		=	sizeof(int),
5204 		.mode		=	0644,
5205 		.proc_handler	=	proc_dointvec,
5206 		.extra1		=	&zero,
5207 		.extra2		=	&one,
5208 	},
5209 	{ }
5210 };
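
/* NOTE: ipv6_route_sysctl_init() below rewires the .data pointers per
 * netns by table index, so the order of the entries above must stay in
 * sync with those table[N].data assignments.
 */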
5211 
5212 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5213 {
5214 	struct ctl_table *table;
5215 
5216 	table = kmemdup(ipv6_route_table_template,
5217 			sizeof(ipv6_route_table_template),
5218 			GFP_KERNEL);
5219 
5220 	if (table) {
5221 		table[0].data = &net->ipv6.sysctl.flush_delay;
5222 		table[0].extra1 = net;
5223 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5224 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5225 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5226 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5227 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5228 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5229 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5230 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5231 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5232 		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5233 
5234 		/* Don't export sysctls to unprivileged users */
5235 		if (net->user_ns != &init_user_ns)
5236 			table[0].procname = NULL;
5237 	}
5238 
5239 	return table;
5240 }
5241 #endif
5242 
5243 static int __net_init ip6_route_net_init(struct net *net)
5244 {
5245 	int ret = -ENOMEM;
5246 
5247 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5248 	       sizeof(net->ipv6.ip6_dst_ops));
5249 
5250 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5251 		goto out_ip6_dst_ops;
5252 
5253 	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5254 					    sizeof(*net->ipv6.fib6_null_entry),
5255 					    GFP_KERNEL);
5256 	if (!net->ipv6.fib6_null_entry)
5257 		goto out_ip6_dst_entries;
5258 
5259 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5260 					   sizeof(*net->ipv6.ip6_null_entry),
5261 					   GFP_KERNEL);
5262 	if (!net->ipv6.ip6_null_entry)
5263 		goto out_fib6_null_entry;
5264 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5265 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5266 			 ip6_template_metrics, true);
5267 
5268 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5269 	net->ipv6.fib6_has_custom_rules = false;
5270 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5271 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5272 					       GFP_KERNEL);
5273 	if (!net->ipv6.ip6_prohibit_entry)
5274 		goto out_ip6_null_entry;
5275 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5276 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5277 			 ip6_template_metrics, true);
5278 
5279 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5280 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5281 					       GFP_KERNEL);
5282 	if (!net->ipv6.ip6_blk_hole_entry)
5283 		goto out_ip6_prohibit_entry;
5284 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5285 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5286 			 ip6_template_metrics, true);
5287 #endif
5288 
5289 	net->ipv6.sysctl.flush_delay = 0;
5290 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5291 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5292 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5293 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5294 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5295 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5296 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5297 	net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5298 
5299 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5300 
5301 	ret = 0;
5302 out:
5303 	return ret;
5304 
5305 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5306 out_ip6_prohibit_entry:
5307 	kfree(net->ipv6.ip6_prohibit_entry);
5308 out_ip6_null_entry:
5309 	kfree(net->ipv6.ip6_null_entry);
5310 #endif
5311 out_fib6_null_entry:
5312 	kfree(net->ipv6.fib6_null_entry);
5313 out_ip6_dst_entries:
5314 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5315 out_ip6_dst_ops:
5316 	goto out;
5317 }
5318 
5319 static void __net_exit ip6_route_net_exit(struct net *net)
5320 {
5321 	kfree(net->ipv6.fib6_null_entry);
5322 	kfree(net->ipv6.ip6_null_entry);
5323 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5324 	kfree(net->ipv6.ip6_prohibit_entry);
5325 	kfree(net->ipv6.ip6_blk_hole_entry);
5326 #endif
5327 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5328 }
5329 
5330 static int __net_init ip6_route_net_init_late(struct net *net)
5331 {
5332 #ifdef CONFIG_PROC_FS
5333 	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5334 			sizeof(struct ipv6_route_iter));
5335 	proc_create_net_single("rt6_stats", 0444, net->proc_net,
5336 			rt6_stats_seq_show, NULL);
5337 #endif
5338 	return 0;
5339 }
5340 
5341 static void __net_exit ip6_route_net_exit_late(struct net *net)
5342 {
5343 #ifdef CONFIG_PROC_FS
5344 	remove_proc_entry("ipv6_route", net->proc_net);
5345 	remove_proc_entry("rt6_stats", net->proc_net);
5346 #endif
5347 }
5348 
5349 static struct pernet_operations ip6_route_net_ops = {
5350 	.init = ip6_route_net_init,
5351 	.exit = ip6_route_net_exit,
5352 };
5353 
5354 static int __net_init ipv6_inetpeer_init(struct net *net)
5355 {
5356 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5357 
5358 	if (!bp)
5359 		return -ENOMEM;
5360 	inet_peer_base_init(bp);
5361 	net->ipv6.peers = bp;
5362 	return 0;
5363 }
5364 
5365 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5366 {
5367 	struct inet_peer_base *bp = net->ipv6.peers;
5368 
5369 	net->ipv6.peers = NULL;
5370 	inetpeer_invalidate_tree(bp);
5371 	kfree(bp);
5372 }
5373 
5374 static struct pernet_operations ipv6_inetpeer_ops = {
5375 	.init	=	ipv6_inetpeer_init,
5376 	.exit	=	ipv6_inetpeer_exit,
5377 };
5378 
5379 static struct pernet_operations ip6_route_net_late_ops = {
5380 	.init = ip6_route_net_init_late,
5381 	.exit = ip6_route_net_exit_late,
5382 };
5383 
5384 static struct notifier_block ip6_route_dev_notifier = {
5385 	.notifier_call = ip6_route_dev_notify,
5386 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5387 };
5388 
5389 void __init ip6_route_init_special_entries(void)
5390 {
5391 	/* The loopback device is registered before this code runs, so the
5392 	 * loopback reference in rt6_info will not have been taken; take it
5393 	 * manually for init_net. */
5394 	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
5395 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5396 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5397 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5398 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5399 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5400 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5401 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5402 #endif
5403 }
5404 
5405 int __init ip6_route_init(void)
5406 {
5407 	int ret;
5408 	int cpu;
5409 
5410 	ret = -ENOMEM;
5411 	ip6_dst_ops_template.kmem_cachep =
5412 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5413 				  SLAB_HWCACHE_ALIGN, NULL);
5414 	if (!ip6_dst_ops_template.kmem_cachep)
5415 		goto out;
5416 
5417 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5418 	if (ret)
5419 		goto out_kmem_cache;
5420 
5421 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5422 	if (ret)
5423 		goto out_dst_entries;
5424 
5425 	ret = register_pernet_subsys(&ip6_route_net_ops);
5426 	if (ret)
5427 		goto out_register_inetpeer;
5428 
5429 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5430 
5431 	ret = fib6_init();
5432 	if (ret)
5433 		goto out_register_subsys;
5434 
5435 	ret = xfrm6_init();
5436 	if (ret)
5437 		goto out_fib6_init;
5438 
5439 	ret = fib6_rules_init();
5440 	if (ret)
5441 		goto xfrm6_init;
5442 
5443 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5444 	if (ret)
5445 		goto fib6_rules_init;
5446 
5447 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5448 				   inet6_rtm_newroute, NULL, 0);
5449 	if (ret < 0)
5450 		goto out_register_late_subsys;
5451 
5452 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5453 				   inet6_rtm_delroute, NULL, 0);
5454 	if (ret < 0)
5455 		goto out_register_late_subsys;
5456 
5457 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5458 				   inet6_rtm_getroute, NULL,
5459 				   RTNL_FLAG_DOIT_UNLOCKED);
5460 	if (ret < 0)
5461 		goto out_register_late_subsys;
5462 
5463 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5464 	if (ret)
5465 		goto out_register_late_subsys;
5466 
5467 	for_each_possible_cpu(cpu) {
5468 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5469 
5470 		INIT_LIST_HEAD(&ul->head);
5471 		spin_lock_init(&ul->lock);
5472 	}
5473 
5474 out:
5475 	return ret;
5476 
5477 out_register_late_subsys:
5478 	rtnl_unregister_all(PF_INET6);
5479 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5480 fib6_rules_init:
5481 	fib6_rules_cleanup();
5482 xfrm6_init:
5483 	xfrm6_fini();
5484 out_fib6_init:
5485 	fib6_gc_cleanup();
5486 out_register_subsys:
5487 	unregister_pernet_subsys(&ip6_route_net_ops);
5488 out_register_inetpeer:
5489 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5490 out_dst_entries:
5491 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5492 out_kmem_cache:
5493 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5494 	goto out;
5495 }
5496 
5497 void ip6_route_cleanup(void)
5498 {
5499 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5500 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5501 	fib6_rules_cleanup();
5502 	xfrm6_fini();
5503 	fib6_gc_cleanup();
5504 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5505 	unregister_pernet_subsys(&ip6_route_net_ops);
5506 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5507 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5508 }
5509