xref: /linux/net/ipv6/route.c (revision cad4977344b35ea116ec5fefe91a76b1dfa113f5)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68 
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72 
73 static int ip6_rt_type_to_error(u8 fib6_type);
74 
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79 
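/* Scores used by rt6_check_neigh()/rt6_score_route(): the negative
 * values are failure states.  find_match() below treats FAIL_DO_RR as
 * the lowest valid score and requests a round-robin of the default
 * router list, lets FAIL_PROBE through as a (poor) negative score, and
 * disqualifies the route entirely on FAIL_HARD.
 */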
80 enum rt6_nud_state {
81 	RT6_NUD_FAIL_HARD = -3,
82 	RT6_NUD_FAIL_PROBE = -2,
83 	RT6_NUD_FAIL_DO_RR = -1,
84 	RT6_NUD_SUCCEED = 1
85 };
86 
87 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void		ip6_dst_destroy(struct dst_entry *);
92 static void		ip6_dst_ifdown(struct dst_entry *,
93 				       struct net_device *dev, int how);
94 static int		 ip6_dst_gc(struct dst_ops *ops);
95 
96 static int		ip6_pkt_discard(struct sk_buff *skb);
97 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int		ip6_pkt_prohibit(struct sk_buff *skb);
99 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void		ip6_link_failure(struct sk_buff *skb);
101 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102 					   struct sk_buff *skb, u32 mtu);
103 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104 					struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108 			 struct fib6_info *rt, struct dst_entry *dst,
109 			 struct in6_addr *dest, struct in6_addr *src,
110 			 int iif, int type, u32 portid, u32 seq,
111 			 unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113 					   struct in6_addr *daddr,
114 					   struct in6_addr *saddr);
115 
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118 					   const struct in6_addr *prefix, int prefixlen,
119 					   const struct in6_addr *gwaddr,
120 					   struct net_device *dev,
121 					   unsigned int pref);
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123 					   const struct in6_addr *prefix, int prefixlen,
124 					   const struct in6_addr *gwaddr,
125 					   struct net_device *dev);
126 #endif
127 
128 struct uncached_list {
129 	spinlock_t		lock;
130 	struct list_head	head;
131 };
132 
133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
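/* Per-cpu lists of "uncached" rt6_info, i.e. dst entries that are not
 * owned by the FIB tree (such as the RTF_CACHE clones created in
 * ip6_pol_route()).  rt6_uncached_list_flush_dev() walks these lists on
 * device teardown and re-points idev/dev at the loopback device so the
 * dsts can safely outlive the device.
 */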
134 
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138 
139 	rt->rt6i_uncached_list = ul;
140 
141 	spin_lock_bh(&ul->lock);
142 	list_add_tail(&rt->rt6i_uncached, &ul->head);
143 	spin_unlock_bh(&ul->lock);
144 }
145 
146 void rt6_uncached_list_del(struct rt6_info *rt)
147 {
148 	if (!list_empty(&rt->rt6i_uncached)) {
149 		struct uncached_list *ul = rt->rt6i_uncached_list;
150 		struct net *net = dev_net(rt->dst.dev);
151 
152 		spin_lock_bh(&ul->lock);
153 		list_del(&rt->rt6i_uncached);
154 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155 		spin_unlock_bh(&ul->lock);
156 	}
157 }
158 
159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
160 {
161 	struct net_device *loopback_dev = net->loopback_dev;
162 	int cpu;
163 
164 	if (dev == loopback_dev)
165 		return;
166 
167 	for_each_possible_cpu(cpu) {
168 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
169 		struct rt6_info *rt;
170 
171 		spin_lock_bh(&ul->lock);
172 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
173 			struct inet6_dev *rt_idev = rt->rt6i_idev;
174 			struct net_device *rt_dev = rt->dst.dev;
175 
176 			if (rt_idev->dev == dev) {
177 				rt->rt6i_idev = in6_dev_get(loopback_dev);
178 				in6_dev_put(rt_idev);
179 			}
180 
181 			if (rt_dev == dev) {
182 				rt->dst.dev = loopback_dev;
183 				dev_hold(rt->dst.dev);
184 				dev_put(rt_dev);
185 			}
186 		}
187 		spin_unlock_bh(&ul->lock);
188 	}
189 }
190 
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192 					     struct sk_buff *skb,
193 					     const void *daddr)
194 {
195 	if (!ipv6_addr_any(p))
196 		return (const void *) p;
197 	else if (skb)
198 		return &ipv6_hdr(skb)->daddr;
199 	return daddr;
200 }
201 
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203 				   struct net_device *dev,
204 				   struct sk_buff *skb,
205 				   const void *daddr)
206 {
207 	struct neighbour *n;
208 
209 	daddr = choose_neigh_daddr(gw, skb, daddr);
210 	n = __ipv6_neigh_lookup(dev, daddr);
211 	if (n)
212 		return n;
213 	return neigh_create(&nd_tbl, daddr, dev);
214 }
215 
216 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
217 					      struct sk_buff *skb,
218 					      const void *daddr)
219 {
220 	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
221 
222 	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
223 }
224 
225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
226 {
227 	struct net_device *dev = dst->dev;
228 	struct rt6_info *rt = (struct rt6_info *)dst;
229 
230 	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
231 	if (!daddr)
232 		return;
233 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
234 		return;
235 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
236 		return;
237 	__ipv6_confirm_neigh(dev, daddr);
238 }
239 
240 static struct dst_ops ip6_dst_ops_template = {
241 	.family			=	AF_INET6,
242 	.gc			=	ip6_dst_gc,
243 	.gc_thresh		=	1024,
244 	.check			=	ip6_dst_check,
245 	.default_advmss		=	ip6_default_advmss,
246 	.mtu			=	ip6_mtu,
247 	.cow_metrics		=	dst_cow_metrics_generic,
248 	.destroy		=	ip6_dst_destroy,
249 	.ifdown			=	ip6_dst_ifdown,
250 	.negative_advice	=	ip6_negative_advice,
251 	.link_failure		=	ip6_link_failure,
252 	.update_pmtu		=	ip6_rt_update_pmtu,
253 	.redirect		=	rt6_do_redirect,
254 	.local_out		=	__ip6_local_out,
255 	.neigh_lookup		=	ip6_dst_neigh_lookup,
256 	.confirm_neigh		=	ip6_confirm_neigh,
257 };
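/* Template for the per-netns dst_ops (net->ipv6.ip6_dst_ops, see
 * ip6_dst_alloc() below); these callbacks hook the generic dst layer
 * into the IPv6 routing code.
 */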
258 
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
260 {
261 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
262 
263 	return mtu ? : dst->dev->mtu;
264 }
265 
266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
267 					 struct sk_buff *skb, u32 mtu)
268 {
269 }
270 
271 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
272 				      struct sk_buff *skb)
273 {
274 }
275 
276 static struct dst_ops ip6_dst_blackhole_ops = {
277 	.family			=	AF_INET6,
278 	.destroy		=	ip6_dst_destroy,
279 	.check			=	ip6_dst_check,
280 	.mtu			=	ip6_blackhole_mtu,
281 	.default_advmss		=	ip6_default_advmss,
282 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
283 	.redirect		=	ip6_rt_blackhole_redirect,
284 	.cow_metrics		=	dst_cow_metrics_generic,
285 	.neigh_lookup		=	ip6_dst_neigh_lookup,
286 };
287 
288 static const u32 ip6_template_metrics[RTAX_MAX] = {
289 	[RTAX_HOPLIMIT - 1] = 0,
290 };
291 
292 static const struct fib6_info fib6_null_entry_template = {
293 	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
294 	.fib6_protocol  = RTPROT_KERNEL,
295 	.fib6_metric	= ~(u32)0,
296 	.fib6_ref	= ATOMIC_INIT(1),
297 	.fib6_type	= RTN_UNREACHABLE,
298 	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
299 };
300 
301 static const struct rt6_info ip6_null_entry_template = {
302 	.dst = {
303 		.__refcnt	= ATOMIC_INIT(1),
304 		.__use		= 1,
305 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
306 		.error		= -ENETUNREACH,
307 		.input		= ip6_pkt_discard,
308 		.output		= ip6_pkt_discard_out,
309 	},
310 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
311 };
312 
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314 
315 static const struct rt6_info ip6_prohibit_entry_template = {
316 	.dst = {
317 		.__refcnt	= ATOMIC_INIT(1),
318 		.__use		= 1,
319 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
320 		.error		= -EACCES,
321 		.input		= ip6_pkt_prohibit,
322 		.output		= ip6_pkt_prohibit_out,
323 	},
324 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
325 };
326 
327 static const struct rt6_info ip6_blk_hole_entry_template = {
328 	.dst = {
329 		.__refcnt	= ATOMIC_INIT(1),
330 		.__use		= 1,
331 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
332 		.error		= -EINVAL,
333 		.input		= dst_discard,
334 		.output		= dst_discard_out,
335 	},
336 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
337 };
338 
339 #endif
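/* The templates above back the special per-netns entries returned when a
 * lookup resolves to no real route: ip6_null_entry (-ENETUNREACH) in the
 * default case and, with CONFIG_IPV6_MULTIPLE_TABLES, ip6_prohibit_entry
 * (-EACCES) and ip6_blk_hole_entry (-EINVAL) for policy-routing actions.
 */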
340 
341 static void rt6_info_init(struct rt6_info *rt)
342 {
343 	struct dst_entry *dst = &rt->dst;
344 
345 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
346 	INIT_LIST_HEAD(&rt->rt6i_uncached);
347 }
348 
349 /* allocate dst with ip6_dst_ops */
350 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
351 			       int flags)
352 {
353 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
354 					1, DST_OBSOLETE_FORCE_CHK, flags);
355 
356 	if (rt) {
357 		rt6_info_init(rt);
358 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
359 	}
360 
361 	return rt;
362 }
363 EXPORT_SYMBOL(ip6_dst_alloc);
364 
365 static void ip6_dst_destroy(struct dst_entry *dst)
366 {
367 	struct rt6_info *rt = (struct rt6_info *)dst;
368 	struct fib6_info *from;
369 	struct inet6_dev *idev;
370 
371 	ip_dst_metrics_put(dst);
372 	rt6_uncached_list_del(rt);
373 
374 	idev = rt->rt6i_idev;
375 	if (idev) {
376 		rt->rt6i_idev = NULL;
377 		in6_dev_put(idev);
378 	}
379 
380 	rcu_read_lock();
381 	from = rcu_dereference(rt->from);
382 	rcu_assign_pointer(rt->from, NULL);
383 	fib6_info_release(from);
384 	rcu_read_unlock();
385 }
386 
387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
388 			   int how)
389 {
390 	struct rt6_info *rt = (struct rt6_info *)dst;
391 	struct inet6_dev *idev = rt->rt6i_idev;
392 	struct net_device *loopback_dev =
393 		dev_net(dev)->loopback_dev;
394 
395 	if (idev && idev->dev != loopback_dev) {
396 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
397 		if (loopback_idev) {
398 			rt->rt6i_idev = loopback_idev;
399 			in6_dev_put(idev);
400 		}
401 	}
402 }
403 
404 static bool __rt6_check_expired(const struct rt6_info *rt)
405 {
406 	if (rt->rt6i_flags & RTF_EXPIRES)
407 		return time_after(jiffies, rt->dst.expires);
408 	else
409 		return false;
410 }
411 
412 static bool rt6_check_expired(const struct rt6_info *rt)
413 {
414 	struct fib6_info *from;
415 
416 	from = rcu_dereference(rt->from);
417 
418 	if (rt->rt6i_flags & RTF_EXPIRES) {
419 		if (time_after(jiffies, rt->dst.expires))
420 			return true;
421 	} else if (from) {
422 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
423 			fib6_check_expired(from);
424 	}
425 	return false;
426 }
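/* Note the asymmetry between the two helpers above: __rt6_check_expired()
 * only honours an explicit RTF_EXPIRES timeout on the clone itself, while
 * rt6_check_expired() also treats a cached route as expired when its
 * originating fib6_info has gone stale (dst obsoleted or 'from' expired).
 */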
427 
428 struct fib6_info *fib6_multipath_select(const struct net *net,
429 					struct fib6_info *match,
430 					struct flowi6 *fl6, int oif,
431 					const struct sk_buff *skb,
432 					int strict)
433 {
434 	struct fib6_info *sibling, *next_sibling;
435 
436 	/* We might have already computed the hash for ICMPv6 errors. In such
437 	 * a case it will always be non-zero. Otherwise now is the time to do it.
438 	 */
439 	if (!fl6->mp_hash)
440 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
441 
442 	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
443 		return match;
444 
445 	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
446 				 fib6_siblings) {
447 		int nh_upper_bound;
448 
449 		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
450 		if (fl6->mp_hash > nh_upper_bound)
451 			continue;
452 		if (rt6_score_route(sibling, oif, strict) < 0)
453 			break;
454 		match = sibling;
455 		break;
456 	}
457 
458 	return match;
459 }
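/* Sibling selection above is hash-threshold based: each nexthop owns the
 * hash range up to its nh_upper_bound, and the first sibling whose bound
 * covers fl6->mp_hash (and whose score is acceptable) is chosen.  This is
 * the usual hash-threshold multipath scheme (cf. RFC 2992).
 */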
460 
461 /*
462  *	Route lookup. rcu_read_lock() should be held.
463  */
464 
465 static inline struct fib6_info *rt6_device_match(struct net *net,
466 						 struct fib6_info *rt,
467 						 const struct in6_addr *saddr,
468 						 int oif,
469 						 int flags)
470 {
471 	struct fib6_info *sprt;
472 
473 	if (!oif && ipv6_addr_any(saddr) &&
474 	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
475 		return rt;
476 
477 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
478 		const struct net_device *dev = sprt->fib6_nh.nh_dev;
479 
480 		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
481 			continue;
482 
483 		if (oif) {
484 			if (dev->ifindex == oif)
485 				return sprt;
486 		} else {
487 			if (ipv6_chk_addr(net, saddr, dev,
488 					  flags & RT6_LOOKUP_F_IFACE))
489 				return sprt;
490 		}
491 	}
492 
493 	if (oif && flags & RT6_LOOKUP_F_IFACE)
494 		return net->ipv6.fib6_null_entry;
495 
496 	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
497 }
498 
499 #ifdef CONFIG_IPV6_ROUTER_PREF
500 struct __rt6_probe_work {
501 	struct work_struct work;
502 	struct in6_addr target;
503 	struct net_device *dev;
504 };
505 
506 static void rt6_probe_deferred(struct work_struct *w)
507 {
508 	struct in6_addr mcaddr;
509 	struct __rt6_probe_work *work =
510 		container_of(w, struct __rt6_probe_work, work);
511 
512 	addrconf_addr_solict_mult(&work->target, &mcaddr);
513 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
514 	dev_put(work->dev);
515 	kfree(work);
516 }
517 
518 static void rt6_probe(struct fib6_info *rt)
519 {
520 	struct __rt6_probe_work *work = NULL;
521 	const struct in6_addr *nh_gw;
522 	struct neighbour *neigh;
523 	struct net_device *dev;
524 	struct inet6_dev *idev;
525 
526 	/*
527 	 * Okay, this does not seem to be appropriate
528 	 * for now, however, we need to check if it
529 	 * is really so; aka Router Reachability Probing.
530 	 *
531 	 * Router Reachability Probe MUST be rate-limited
532 	 * to no more than one per minute.
533 	 */
534 	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
535 		return;
536 
537 	nh_gw = &rt->fib6_nh.nh_gw;
538 	dev = rt->fib6_nh.nh_dev;
539 	rcu_read_lock_bh();
540 	idev = __in6_dev_get(dev);
541 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
542 	if (neigh) {
543 		if (neigh->nud_state & NUD_VALID)
544 			goto out;
545 
546 		write_lock(&neigh->lock);
547 		if (!(neigh->nud_state & NUD_VALID) &&
548 		    time_after(jiffies,
549 			       neigh->updated + idev->cnf.rtr_probe_interval)) {
550 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
551 			if (work)
552 				__neigh_set_probe_once(neigh);
553 		}
554 		write_unlock(&neigh->lock);
555 	} else if (time_after(jiffies, rt->last_probe +
556 				       idev->cnf.rtr_probe_interval)) {
557 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
558 	}
559 
560 	if (work) {
561 		rt->last_probe = jiffies;
562 		INIT_WORK(&work->work, rt6_probe_deferred);
563 		work->target = *nh_gw;
564 		dev_hold(dev);
565 		work->dev = dev;
566 		schedule_work(&work->work);
567 	}
568 
569 out:
570 	rcu_read_unlock_bh();
571 }
572 #else
573 static inline void rt6_probe(struct fib6_info *rt)
574 {
575 }
576 #endif
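/* rt6_probe() runs in the lookup fast path under rcu_read_lock_bh(), so
 * the actual neighbour solicitation is deferred to process context via
 * schedule_work()/rt6_probe_deferred(), and probes are rate-limited by
 * idev->cnf.rtr_probe_interval as required by the comment above.
 */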
577 
578 /*
579  * Default Router Selection (RFC 2461 6.3.6)
580  */
581 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
582 {
583 	const struct net_device *dev = rt->fib6_nh.nh_dev;
584 
585 	if (!oif || dev->ifindex == oif)
586 		return 2;
587 	return 0;
588 }
589 
590 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
591 {
592 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
593 	struct neighbour *neigh;
594 
595 	if (rt->fib6_flags & RTF_NONEXTHOP ||
596 	    !(rt->fib6_flags & RTF_GATEWAY))
597 		return RT6_NUD_SUCCEED;
598 
599 	rcu_read_lock_bh();
600 	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
601 					  &rt->fib6_nh.nh_gw);
602 	if (neigh) {
603 		read_lock(&neigh->lock);
604 		if (neigh->nud_state & NUD_VALID)
605 			ret = RT6_NUD_SUCCEED;
606 #ifdef CONFIG_IPV6_ROUTER_PREF
607 		else if (!(neigh->nud_state & NUD_FAILED))
608 			ret = RT6_NUD_SUCCEED;
609 		else
610 			ret = RT6_NUD_FAIL_PROBE;
611 #endif
612 		read_unlock(&neigh->lock);
613 	} else {
614 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
615 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
616 	}
617 	rcu_read_unlock_bh();
618 
619 	return ret;
620 }
621 
622 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
623 {
624 	int m;
625 
626 	m = rt6_check_dev(rt, oif);
627 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
628 		return RT6_NUD_FAIL_HARD;
629 #ifdef CONFIG_IPV6_ROUTER_PREF
630 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
631 #endif
632 	if (strict & RT6_LOOKUP_F_REACHABLE) {
633 		int n = rt6_check_neigh(rt);
634 		if (n < 0)
635 			return n;
636 	}
637 	return m;
638 }
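/* Score layout in rt6_score_route(): a device match contributes 2, and
 * with CONFIG_IPV6_ROUTER_PREF the decoded router preference is OR'ed in
 * at bit 2 and above, so preference differences dominate the device bit.
 * Reachability never raises the score; it can only veto a route via the
 * negative RT6_NUD_* codes.
 */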
639 
640 /* called with rcu_read_lock held */
641 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
642 {
643 	const struct net_device *dev = fib6_info_nh_dev(f6i);
644 	bool rc = false;
645 
646 	if (dev) {
647 		const struct inet6_dev *idev = __in6_dev_get(dev);
648 
649 		rc = !!idev->cnf.ignore_routes_with_linkdown;
650 	}
651 
652 	return rc;
653 }
654 
655 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
656 				   int *mpri, struct fib6_info *match,
657 				   bool *do_rr)
658 {
659 	int m;
660 	bool match_do_rr = false;
661 
662 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
663 		goto out;
664 
665 	if (fib6_ignore_linkdown(rt) &&
666 	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
667 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
668 		goto out;
669 
670 	if (fib6_check_expired(rt))
671 		goto out;
672 
673 	m = rt6_score_route(rt, oif, strict);
674 	if (m == RT6_NUD_FAIL_DO_RR) {
675 		match_do_rr = true;
676 		m = 0; /* lowest valid score */
677 	} else if (m == RT6_NUD_FAIL_HARD) {
678 		goto out;
679 	}
680 
681 	if (strict & RT6_LOOKUP_F_REACHABLE)
682 		rt6_probe(rt);
683 
684 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
685 	if (m > *mpri) {
686 		*do_rr = match_do_rr;
687 		*mpri = m;
688 		match = rt;
689 	}
690 out:
691 	return match;
692 }
693 
694 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
695 				     struct fib6_info *leaf,
696 				     struct fib6_info *rr_head,
697 				     u32 metric, int oif, int strict,
698 				     bool *do_rr)
699 {
700 	struct fib6_info *rt, *match, *cont;
701 	int mpri = -1;
702 
703 	match = NULL;
704 	cont = NULL;
705 	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
706 		if (rt->fib6_metric != metric) {
707 			cont = rt;
708 			break;
709 		}
710 
711 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
712 	}
713 
714 	for (rt = leaf; rt && rt != rr_head;
715 	     rt = rcu_dereference(rt->fib6_next)) {
716 		if (rt->fib6_metric != metric) {
717 			cont = rt;
718 			break;
719 		}
720 
721 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
722 	}
723 
724 	if (match || !cont)
725 		return match;
726 
727 	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
728 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
729 
730 	return match;
731 }
732 
733 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
734 				   int oif, int strict)
735 {
736 	struct fib6_info *leaf = rcu_dereference(fn->leaf);
737 	struct fib6_info *match, *rt0;
738 	bool do_rr = false;
739 	int key_plen;
740 
741 	if (!leaf || leaf == net->ipv6.fib6_null_entry)
742 		return net->ipv6.fib6_null_entry;
743 
744 	rt0 = rcu_dereference(fn->rr_ptr);
745 	if (!rt0)
746 		rt0 = leaf;
747 
748 	/* Double check to make sure fn is not an intermediate node
749 	 * and fn->leaf does not point to its child's leaf
750 	 * (This might happen if all routes under fn are deleted from
751 	 * the tree and fib6_repair_tree() is called on the node.)
752 	 */
753 	key_plen = rt0->fib6_dst.plen;
754 #ifdef CONFIG_IPV6_SUBTREES
755 	if (rt0->fib6_src.plen)
756 		key_plen = rt0->fib6_src.plen;
757 #endif
758 	if (fn->fn_bit != key_plen)
759 		return net->ipv6.fib6_null_entry;
760 
761 	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
762 			     &do_rr);
763 
764 	if (do_rr) {
765 		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
766 
767 		/* no entries matched; do round-robin */
768 		if (!next || next->fib6_metric != rt0->fib6_metric)
769 			next = leaf;
770 
771 		if (next != rt0) {
772 			spin_lock_bh(&leaf->fib6_table->tb6_lock);
773 			/* make sure next is not being deleted from the tree */
774 			if (next->fib6_node)
775 				rcu_assign_pointer(fn->rr_ptr, next);
776 			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
777 		}
778 	}
779 
780 	return match ? match : net->ipv6.fib6_null_entry;
781 }
782 
783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
784 {
785 	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
786 }
787 
788 #ifdef CONFIG_IPV6_ROUTE_INFO
789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
790 		  const struct in6_addr *gwaddr)
791 {
792 	struct net *net = dev_net(dev);
793 	struct route_info *rinfo = (struct route_info *) opt;
794 	struct in6_addr prefix_buf, *prefix;
795 	unsigned int pref;
796 	unsigned long lifetime;
797 	struct fib6_info *rt;
798 
799 	if (len < sizeof(struct route_info)) {
800 		return -EINVAL;
801 	}
802 
803 	/* Sanity check for prefix_len and length */
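	/* Per RFC 4191 the option length is in units of 8 octets: length 1
	 * carries no prefix octets, length 2 carries 8 (enough for
	 * prefix_len <= 64), and length 3 carries the full 16.
	 */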
804 	if (rinfo->length > 3) {
805 		return -EINVAL;
806 	} else if (rinfo->prefix_len > 128) {
807 		return -EINVAL;
808 	} else if (rinfo->prefix_len > 64) {
809 		if (rinfo->length < 2) {
810 			return -EINVAL;
811 		}
812 	} else if (rinfo->prefix_len > 0) {
813 		if (rinfo->length < 1) {
814 			return -EINVAL;
815 		}
816 	}
817 
818 	pref = rinfo->route_pref;
819 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
820 		return -EINVAL;
821 
822 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
823 
824 	if (rinfo->length == 3)
825 		prefix = (struct in6_addr *)rinfo->prefix;
826 	else {
827 		/* ipv6_addr_prefix() is safe here: prefix_len was bounds-checked above */
828 		ipv6_addr_prefix(&prefix_buf,
829 				 (struct in6_addr *)rinfo->prefix,
830 				 rinfo->prefix_len);
831 		prefix = &prefix_buf;
832 	}
833 
834 	if (rinfo->prefix_len == 0)
835 		rt = rt6_get_dflt_router(net, gwaddr, dev);
836 	else
837 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
838 					gwaddr, dev);
839 
840 	if (rt && !lifetime) {
841 		ip6_del_rt(net, rt);
842 		rt = NULL;
843 	}
844 
845 	if (!rt && lifetime)
846 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
847 					dev, pref);
848 	else if (rt)
849 		rt->fib6_flags = RTF_ROUTEINFO |
850 				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
851 
852 	if (rt) {
853 		if (!addrconf_finite_timeout(lifetime))
854 			fib6_clean_expires(rt);
855 		else
856 			fib6_set_expires(rt, jiffies + HZ * lifetime);
857 
858 		fib6_info_release(rt);
859 	}
860 	return 0;
861 }
862 #endif
863 
864 /*
865  *	Misc support functions
866  */
867 
868 /* called with rcu_read_lock held */
869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
870 {
871 	struct net_device *dev = rt->fib6_nh.nh_dev;
872 
873 	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
874 		/* for copies of local routes, dst->dev needs to be the
875 		 * device itself if it is a master device, the master device
876 		 * if the device is enslaved, and the loopback as the default
877 		 */
878 		if (netif_is_l3_slave(dev) &&
879 		    !rt6_need_strict(&rt->fib6_dst.addr))
880 			dev = l3mdev_master_dev_rcu(dev);
881 		else if (!netif_is_l3_master(dev))
882 			dev = dev_net(dev)->loopback_dev;
883 		/* last case is netif_is_l3_master(dev) is true in which
884 		 * case we want dev returned to be dev
885 		 */
886 	}
887 
888 	return dev;
889 }
890 
891 static const int fib6_prop[RTN_MAX + 1] = {
892 	[RTN_UNSPEC]	= 0,
893 	[RTN_UNICAST]	= 0,
894 	[RTN_LOCAL]	= 0,
895 	[RTN_BROADCAST]	= 0,
896 	[RTN_ANYCAST]	= 0,
897 	[RTN_MULTICAST]	= 0,
898 	[RTN_BLACKHOLE]	= -EINVAL,
899 	[RTN_UNREACHABLE] = -EHOSTUNREACH,
900 	[RTN_PROHIBIT]	= -EACCES,
901 	[RTN_THROW]	= -EAGAIN,
902 	[RTN_NAT]	= -EINVAL,
903 	[RTN_XRESOLVE]	= -EINVAL,
904 };
905 
906 static int ip6_rt_type_to_error(u8 fib6_type)
907 {
908 	return fib6_prop[fib6_type];
909 }
910 
911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
912 {
913 	unsigned short flags = 0;
914 
915 	if (rt->dst_nocount)
916 		flags |= DST_NOCOUNT;
917 	if (rt->dst_nopolicy)
918 		flags |= DST_NOPOLICY;
919 	if (rt->dst_host)
920 		flags |= DST_HOST;
921 
922 	return flags;
923 }
924 
925 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
926 {
927 	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
928 
929 	switch (ort->fib6_type) {
930 	case RTN_BLACKHOLE:
931 		rt->dst.output = dst_discard_out;
932 		rt->dst.input = dst_discard;
933 		break;
934 	case RTN_PROHIBIT:
935 		rt->dst.output = ip6_pkt_prohibit_out;
936 		rt->dst.input = ip6_pkt_prohibit;
937 		break;
938 	case RTN_THROW:
939 	case RTN_UNREACHABLE:
940 	default:
941 		rt->dst.output = ip6_pkt_discard_out;
942 		rt->dst.input = ip6_pkt_discard;
943 		break;
944 	}
945 }
946 
947 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
948 {
949 	if (ort->fib6_flags & RTF_REJECT) {
950 		ip6_rt_init_dst_reject(rt, ort);
951 		return;
952 	}
953 
954 	rt->dst.error = 0;
955 	rt->dst.output = ip6_output;
956 
957 	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
958 		rt->dst.input = ip6_input;
959 	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
960 		rt->dst.input = ip6_mc_input;
961 	} else {
962 		rt->dst.input = ip6_forward;
963 	}
964 
965 	if (ort->fib6_nh.nh_lwtstate) {
966 		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
967 		lwtunnel_set_redirect(&rt->dst);
968 	}
969 
970 	rt->dst.lastuse = jiffies;
971 }
972 
973 /* Caller must already hold reference to @from */
974 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
975 {
976 	rt->rt6i_flags &= ~RTF_EXPIRES;
977 	rcu_assign_pointer(rt->from, from);
978 	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
979 }
980 
981 /* Caller must already hold reference to @ort */
982 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
983 {
984 	struct net_device *dev = fib6_info_nh_dev(ort);
985 
986 	ip6_rt_init_dst(rt, ort);
987 
988 	rt->rt6i_dst = ort->fib6_dst;
989 	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
990 	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
991 	rt->rt6i_flags = ort->fib6_flags;
992 	rt6_set_from(rt, ort);
993 #ifdef CONFIG_IPV6_SUBTREES
994 	rt->rt6i_src = ort->fib6_src;
995 #endif
996 }
997 
998 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
999 					struct in6_addr *saddr)
1000 {
1001 	struct fib6_node *pn, *sn;
1002 	while (1) {
1003 		if (fn->fn_flags & RTN_TL_ROOT)
1004 			return NULL;
1005 		pn = rcu_dereference(fn->parent);
1006 		sn = FIB6_SUBTREE(pn);
1007 		if (sn && sn != fn)
1008 			fn = fib6_node_lookup(sn, NULL, saddr);
1009 		else
1010 			fn = pn;
1011 		if (fn->fn_flags & RTN_RTINFO)
1012 			return fn;
1013 	}
1014 }
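/* fib6_backtrack() walks back up the trie when a leaf did not match:
 * climb to the parent and, with CONFIG_IPV6_SUBTREES, descend into the
 * parent's source-address subtree keyed by saddr; give up once the
 * top-level root is reached without finding an RTN_RTINFO node.
 */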
1015 
1016 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1017 			  bool null_fallback)
1018 {
1019 	struct rt6_info *rt = *prt;
1020 
1021 	if (dst_hold_safe(&rt->dst))
1022 		return true;
1023 	if (null_fallback) {
1024 		rt = net->ipv6.ip6_null_entry;
1025 		dst_hold(&rt->dst);
1026 	} else {
1027 		rt = NULL;
1028 	}
1029 	*prt = rt;
1030 	return false;
1031 }
1032 
1033 /* called with rcu_read_lock held */
1034 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1035 {
1036 	unsigned short flags = fib6_info_dst_flags(rt);
1037 	struct net_device *dev = rt->fib6_nh.nh_dev;
1038 	struct rt6_info *nrt;
1039 
1040 	if (!fib6_info_hold_safe(rt))
1041 		return NULL;
1042 
1043 	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1044 	if (nrt)
1045 		ip6_rt_copy_init(nrt, rt);
1046 	else
1047 		fib6_info_release(rt);
1048 
1049 	return nrt;
1050 }
1051 
1052 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1053 					     struct fib6_table *table,
1054 					     struct flowi6 *fl6,
1055 					     const struct sk_buff *skb,
1056 					     int flags)
1057 {
1058 	struct fib6_info *f6i;
1059 	struct fib6_node *fn;
1060 	struct rt6_info *rt;
1061 
1062 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1063 		flags &= ~RT6_LOOKUP_F_IFACE;
1064 
1065 	rcu_read_lock();
1066 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1067 restart:
1068 	f6i = rcu_dereference(fn->leaf);
1069 	if (!f6i) {
1070 		f6i = net->ipv6.fib6_null_entry;
1071 	} else {
1072 		f6i = rt6_device_match(net, f6i, &fl6->saddr,
1073 				      fl6->flowi6_oif, flags);
1074 		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1075 			f6i = fib6_multipath_select(net, f6i, fl6,
1076 						    fl6->flowi6_oif, skb,
1077 						    flags);
1078 	}
1079 	if (f6i == net->ipv6.fib6_null_entry) {
1080 		fn = fib6_backtrack(fn, &fl6->saddr);
1081 		if (fn)
1082 			goto restart;
1083 	}
1084 
1085 	trace_fib6_table_lookup(net, f6i, table, fl6);
1086 
1087 	/* Search through exception table */
1088 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1089 	if (rt) {
1090 		if (ip6_hold_safe(net, &rt, true))
1091 			dst_use_noref(&rt->dst, jiffies);
1092 	} else if (f6i == net->ipv6.fib6_null_entry) {
1093 		rt = net->ipv6.ip6_null_entry;
1094 		dst_hold(&rt->dst);
1095 	} else {
1096 		rt = ip6_create_rt_rcu(f6i);
1097 		if (!rt) {
1098 			rt = net->ipv6.ip6_null_entry;
1099 			dst_hold(&rt->dst);
1100 		}
1101 	}
1102 
1103 	rcu_read_unlock();
1104 
1105 	return rt;
1106 }
1107 
1108 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1109 				   const struct sk_buff *skb, int flags)
1110 {
1111 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1112 }
1113 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1114 
1115 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1116 			    const struct in6_addr *saddr, int oif,
1117 			    const struct sk_buff *skb, int strict)
1118 {
1119 	struct flowi6 fl6 = {
1120 		.flowi6_oif = oif,
1121 		.daddr = *daddr,
1122 	};
1123 	struct dst_entry *dst;
1124 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1125 
1126 	if (saddr) {
1127 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1128 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1129 	}
1130 
1131 	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1132 	if (dst->error == 0)
1133 		return (struct rt6_info *) dst;
1134 
1135 	dst_release(dst);
1136 
1137 	return NULL;
1138 }
1139 EXPORT_SYMBOL(rt6_lookup);
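/* Example (a sketch only; 'net' and 'daddr' are assumed to come from the
 * caller's context): rt6_lookup() returns a referenced rt6_info or NULL,
 * so a non-NULL result must be released with ip6_rt_put():
 *
 *	struct rt6_info *rt = rt6_lookup(net, &daddr, NULL, 0, NULL, 0);
 *
 *	if (rt) {
 *		... use rt->dst ...
 *		ip6_rt_put(rt);
 *	}
 */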
1140 
1141 /* ip6_ins_rt is called with table->tb6_lock NOT held, i.e. it takes
1142  * the lock itself.  It takes a new route entry; if the addition fails
1143  * for any reason, the route is released.
1144  * The caller must hold a reference on the route before calling it.
1145  */
1146 
1147 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1148 			struct netlink_ext_ack *extack)
1149 {
1150 	int err;
1151 	struct fib6_table *table;
1152 
1153 	table = rt->fib6_table;
1154 	spin_lock_bh(&table->tb6_lock);
1155 	err = fib6_add(&table->tb6_root, rt, info, extack);
1156 	spin_unlock_bh(&table->tb6_lock);
1157 
1158 	return err;
1159 }
1160 
1161 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1162 {
1163 	struct nl_info info = {	.nl_net = net, };
1164 
1165 	return __ip6_ins_rt(rt, &info, NULL);
1166 }
1167 
1168 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1169 					   const struct in6_addr *daddr,
1170 					   const struct in6_addr *saddr)
1171 {
1172 	struct net_device *dev;
1173 	struct rt6_info *rt;
1174 
1175 	/*
1176 	 *	Clone the route.
1177 	 */
1178 
1179 	if (!fib6_info_hold_safe(ort))
1180 		return NULL;
1181 
1182 	dev = ip6_rt_get_dev_rcu(ort);
1183 	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1184 	if (!rt) {
1185 		fib6_info_release(ort);
1186 		return NULL;
1187 	}
1188 
1189 	ip6_rt_copy_init(rt, ort);
1190 	rt->rt6i_flags |= RTF_CACHE;
1191 	rt->dst.flags |= DST_HOST;
1192 	rt->rt6i_dst.addr = *daddr;
1193 	rt->rt6i_dst.plen = 128;
1194 
1195 	if (!rt6_is_gw_or_nonexthop(ort)) {
1196 		if (ort->fib6_dst.plen != 128 &&
1197 		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1198 			rt->rt6i_flags |= RTF_ANYCAST;
1199 #ifdef CONFIG_IPV6_SUBTREES
1200 		if (rt->rt6i_src.plen && saddr) {
1201 			rt->rt6i_src.addr = *saddr;
1202 			rt->rt6i_src.plen = 128;
1203 		}
1204 #endif
1205 	}
1206 
1207 	return rt;
1208 }
1209 
1210 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1211 {
1212 	unsigned short flags = fib6_info_dst_flags(rt);
1213 	struct net_device *dev;
1214 	struct rt6_info *pcpu_rt;
1215 
1216 	if (!fib6_info_hold_safe(rt))
1217 		return NULL;
1218 
1219 	rcu_read_lock();
1220 	dev = ip6_rt_get_dev_rcu(rt);
1221 	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1222 	rcu_read_unlock();
1223 	if (!pcpu_rt) {
1224 		fib6_info_release(rt);
1225 		return NULL;
1226 	}
1227 	ip6_rt_copy_init(pcpu_rt, rt);
1228 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1229 	return pcpu_rt;
1230 }
1231 
1232 /* It should be called with rcu_read_lock() acquired */
1233 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1234 {
1235 	struct rt6_info *pcpu_rt, **p;
1236 
1237 	p = this_cpu_ptr(rt->rt6i_pcpu);
1238 	pcpu_rt = *p;
1239 
1240 	if (pcpu_rt)
1241 		ip6_hold_safe(NULL, &pcpu_rt, false);
1242 
1243 	return pcpu_rt;
1244 }
1245 
1246 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1247 					    struct fib6_info *rt)
1248 {
1249 	struct rt6_info *pcpu_rt, *prev, **p;
1250 
1251 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1252 	if (!pcpu_rt) {
1253 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1254 		return net->ipv6.ip6_null_entry;
1255 	}
1256 
1257 	dst_hold(&pcpu_rt->dst);
1258 	p = this_cpu_ptr(rt->rt6i_pcpu);
1259 	prev = cmpxchg(p, NULL, pcpu_rt);
1260 	BUG_ON(prev);
1261 
1262 	return pcpu_rt;
1263 }
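/* The per-cpu slot is claimed with cmpxchg(): the caller (ip6_pol_route)
 * runs with BHs disabled on this CPU and only calls in here after
 * rt6_get_pcpu_route() saw the slot empty, hence the BUG_ON(prev).
 */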
1264 
1265 /* exception hash table implementation
1266  */
1267 static DEFINE_SPINLOCK(rt6_exception_lock);
1268 
1269 /* Remove rt6_ex from hash table and free the memory
1270  * Caller must hold rt6_exception_lock
1271  */
1272 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1273 				 struct rt6_exception *rt6_ex)
1274 {
1275 	struct net *net;
1276 
1277 	if (!bucket || !rt6_ex)
1278 		return;
1279 
1280 	net = dev_net(rt6_ex->rt6i->dst.dev);
1281 	hlist_del_rcu(&rt6_ex->hlist);
1282 	dst_release(&rt6_ex->rt6i->dst);
1283 	kfree_rcu(rt6_ex, rcu);
1284 	WARN_ON_ONCE(!bucket->depth);
1285 	bucket->depth--;
1286 	net->ipv6.rt6_stats->fib_rt_cache--;
1287 }
1288 
1289 /* Remove oldest rt6_ex in bucket and free the memory
1290  * Caller must hold rt6_exception_lock
1291  */
1292 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1293 {
1294 	struct rt6_exception *rt6_ex, *oldest = NULL;
1295 
1296 	if (!bucket)
1297 		return;
1298 
1299 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1300 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1301 			oldest = rt6_ex;
1302 	}
1303 	rt6_remove_exception(bucket, oldest);
1304 }
1305 
1306 static u32 rt6_exception_hash(const struct in6_addr *dst,
1307 			      const struct in6_addr *src)
1308 {
1309 	static u32 seed __read_mostly;
1310 	u32 val;
1311 
1312 	net_get_random_once(&seed, sizeof(seed));
1313 	val = jhash(dst, sizeof(*dst), seed);
1314 
1315 #ifdef CONFIG_IPV6_SUBTREES
1316 	if (src)
1317 		val = jhash(src, sizeof(*src), val);
1318 #endif
1319 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1320 }
1321 
1322 /* Helper function to find the cached rt in the hash table
1323  * and update bucket pointer to point to the bucket for this
1324  * (daddr, saddr) pair
1325  * Caller must hold rt6_exception_lock
1326  */
1327 static struct rt6_exception *
1328 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1329 			      const struct in6_addr *daddr,
1330 			      const struct in6_addr *saddr)
1331 {
1332 	struct rt6_exception *rt6_ex;
1333 	u32 hval;
1334 
1335 	if (!(*bucket) || !daddr)
1336 		return NULL;
1337 
1338 	hval = rt6_exception_hash(daddr, saddr);
1339 	*bucket += hval;
1340 
1341 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1342 		struct rt6_info *rt6 = rt6_ex->rt6i;
1343 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1344 
1345 #ifdef CONFIG_IPV6_SUBTREES
1346 		if (matched && saddr)
1347 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1348 #endif
1349 		if (matched)
1350 			return rt6_ex;
1351 	}
1352 	return NULL;
1353 }
1354 
1355 /* Helper function to find the cached rt in the hash table
1356  * and update bucket pointer to point to the bucket for this
1357  * (daddr, saddr) pair
1358  * Caller must hold rcu_read_lock()
1359  */
1360 static struct rt6_exception *
1361 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1362 			 const struct in6_addr *daddr,
1363 			 const struct in6_addr *saddr)
1364 {
1365 	struct rt6_exception *rt6_ex;
1366 	u32 hval;
1367 
1368 	WARN_ON_ONCE(!rcu_read_lock_held());
1369 
1370 	if (!(*bucket) || !daddr)
1371 		return NULL;
1372 
1373 	hval = rt6_exception_hash(daddr, saddr);
1374 	*bucket += hval;
1375 
1376 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1377 		struct rt6_info *rt6 = rt6_ex->rt6i;
1378 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1379 
1380 #ifdef CONFIG_IPV6_SUBTREES
1381 		if (matched && saddr)
1382 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1383 #endif
1384 		if (matched)
1385 			return rt6_ex;
1386 	}
1387 	return NULL;
1388 }
1389 
1390 static unsigned int fib6_mtu(const struct fib6_info *rt)
1391 {
1392 	unsigned int mtu;
1393 
1394 	if (rt->fib6_pmtu) {
1395 		mtu = rt->fib6_pmtu;
1396 	} else {
1397 		struct net_device *dev = fib6_info_nh_dev(rt);
1398 		struct inet6_dev *idev;
1399 
1400 		rcu_read_lock();
1401 		idev = __in6_dev_get(dev);
1402 		mtu = idev->cnf.mtu6;
1403 		rcu_read_unlock();
1404 	}
1405 
1406 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1407 
1408 	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1409 }
1410 
1411 static int rt6_insert_exception(struct rt6_info *nrt,
1412 				struct fib6_info *ort)
1413 {
1414 	struct net *net = dev_net(nrt->dst.dev);
1415 	struct rt6_exception_bucket *bucket;
1416 	struct in6_addr *src_key = NULL;
1417 	struct rt6_exception *rt6_ex;
1418 	int err = 0;
1419 
1420 	spin_lock_bh(&rt6_exception_lock);
1421 
1422 	if (ort->exception_bucket_flushed) {
1423 		err = -EINVAL;
1424 		goto out;
1425 	}
1426 
1427 	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1428 					lockdep_is_held(&rt6_exception_lock));
1429 	if (!bucket) {
1430 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1431 				 GFP_ATOMIC);
1432 		if (!bucket) {
1433 			err = -ENOMEM;
1434 			goto out;
1435 		}
1436 		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1437 	}
1438 
1439 #ifdef CONFIG_IPV6_SUBTREES
1440 	/* rt6i_src.plen != 0 indicates ort is in subtree
1441 	 * and exception table is indexed by a hash of
1442 	 * both rt6i_dst and rt6i_src.
1443 	 * Otherwise, the exception table is indexed by
1444 	 * a hash of only rt6i_dst.
1445 	 */
1446 	if (ort->fib6_src.plen)
1447 		src_key = &nrt->rt6i_src.addr;
1448 #endif
1449 	/* rt6_mtu_change() might lower mtu on ort.
1450 	 * Only insert this exception route if its mtu
1451 	 * is less than ort's mtu value.
1452 	 */
1453 	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1454 		err = -EINVAL;
1455 		goto out;
1456 	}
1457 
1458 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1459 					       src_key);
1460 	if (rt6_ex)
1461 		rt6_remove_exception(bucket, rt6_ex);
1462 
1463 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1464 	if (!rt6_ex) {
1465 		err = -ENOMEM;
1466 		goto out;
1467 	}
1468 	rt6_ex->rt6i = nrt;
1469 	rt6_ex->stamp = jiffies;
1470 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1471 	bucket->depth++;
1472 	net->ipv6.rt6_stats->fib_rt_cache++;
1473 
1474 	if (bucket->depth > FIB6_MAX_DEPTH)
1475 		rt6_exception_remove_oldest(bucket);
1476 
1477 out:
1478 	spin_unlock_bh(&rt6_exception_lock);
1479 
1480 	/* Update fn->fn_sernum to invalidate all cached dst */
1481 	if (!err) {
1482 		spin_lock_bh(&ort->fib6_table->tb6_lock);
1483 		fib6_update_sernum(net, ort);
1484 		spin_unlock_bh(&ort->fib6_table->tb6_lock);
1485 		fib6_force_start_gc(net);
1486 	}
1487 
1488 	return err;
1489 }
1490 
1491 void rt6_flush_exceptions(struct fib6_info *rt)
1492 {
1493 	struct rt6_exception_bucket *bucket;
1494 	struct rt6_exception *rt6_ex;
1495 	struct hlist_node *tmp;
1496 	int i;
1497 
1498 	spin_lock_bh(&rt6_exception_lock);
1499 	/* Prevent rt6_insert_exception() from recreating the bucket list */
1500 	rt->exception_bucket_flushed = 1;
1501 
1502 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1503 				    lockdep_is_held(&rt6_exception_lock));
1504 	if (!bucket)
1505 		goto out;
1506 
1507 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1508 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1509 			rt6_remove_exception(bucket, rt6_ex);
1510 		WARN_ON_ONCE(bucket->depth);
1511 		bucket++;
1512 	}
1513 
1514 out:
1515 	spin_unlock_bh(&rt6_exception_lock);
1516 }
1517 
1518 /* Find cached rt in the hash table inside passed in rt
1519  * Caller has to hold rcu_read_lock()
1520  */
1521 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1522 					   struct in6_addr *daddr,
1523 					   struct in6_addr *saddr)
1524 {
1525 	struct rt6_exception_bucket *bucket;
1526 	struct in6_addr *src_key = NULL;
1527 	struct rt6_exception *rt6_ex;
1528 	struct rt6_info *res = NULL;
1529 
1530 	bucket = rcu_dereference(rt->rt6i_exception_bucket);
1531 
1532 #ifdef CONFIG_IPV6_SUBTREES
1533 	/* rt6i_src.plen != 0 indicates rt is in subtree
1534 	 * and exception table is indexed by a hash of
1535 	 * both rt6i_dst and rt6i_src.
1536 	 * Otherwise, the exception table is indexed by
1537 	 * a hash of only rt6i_dst.
1538 	 */
1539 	if (rt->fib6_src.plen)
1540 		src_key = saddr;
1541 #endif
1542 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1543 
1544 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1545 		res = rt6_ex->rt6i;
1546 
1547 	return res;
1548 }
1549 
1550 /* Remove the passed in cached rt from the hash table that contains it */
1551 static int rt6_remove_exception_rt(struct rt6_info *rt)
1552 {
1553 	struct rt6_exception_bucket *bucket;
1554 	struct in6_addr *src_key = NULL;
1555 	struct rt6_exception *rt6_ex;
1556 	struct fib6_info *from;
1557 	int err;
1558 
1559 	from = rcu_dereference(rt->from);
1560 	if (!from ||
1561 	    !(rt->rt6i_flags & RTF_CACHE))
1562 		return -EINVAL;
1563 
1564 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1565 		return -ENOENT;
1566 
1567 	spin_lock_bh(&rt6_exception_lock);
1568 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1569 				    lockdep_is_held(&rt6_exception_lock));
1570 #ifdef CONFIG_IPV6_SUBTREES
1571 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1572 	 * and exception table is indexed by a hash of
1573 	 * both rt6i_dst and rt6i_src.
1574 	 * Otherwise, the exception table is indexed by
1575 	 * a hash of only rt6i_dst.
1576 	 */
1577 	if (from->fib6_src.plen)
1578 		src_key = &rt->rt6i_src.addr;
1579 #endif
1580 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1581 					       &rt->rt6i_dst.addr,
1582 					       src_key);
1583 	if (rt6_ex) {
1584 		rt6_remove_exception(bucket, rt6_ex);
1585 		err = 0;
1586 	} else {
1587 		err = -ENOENT;
1588 	}
1589 
1590 	spin_unlock_bh(&rt6_exception_lock);
1591 	return err;
1592 }
1593 
1594 /* Find rt6_ex which contains the passed in rt cache and
1595  * refresh its stamp
1596  */
1597 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1598 {
1599 	struct rt6_exception_bucket *bucket;
1600 	struct fib6_info *from = rt->from;
1601 	struct in6_addr *src_key = NULL;
1602 	struct rt6_exception *rt6_ex;
1603 
1604 	if (!from ||
1605 	    !(rt->rt6i_flags & RTF_CACHE))
1606 		return;
1607 
1608 	rcu_read_lock();
1609 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1610 
1611 #ifdef CONFIG_IPV6_SUBTREES
1612 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1613 	 * and exception table is indexed by a hash of
1614 	 * both rt6i_dst and rt6i_src.
1615 	 * Otherwise, the exception table is indexed by
1616 	 * a hash of only rt6i_dst.
1617 	 */
1618 	if (from->fib6_src.plen)
1619 		src_key = &rt->rt6i_src.addr;
1620 #endif
1621 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1622 					  &rt->rt6i_dst.addr,
1623 					  src_key);
1624 	if (rt6_ex)
1625 		rt6_ex->stamp = jiffies;
1626 
1627 	rcu_read_unlock();
1628 }
1629 
1630 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1631 					 struct rt6_info *rt, int mtu)
1632 {
1633 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1634 	 * lowest MTU in the path: always allow updating the route PMTU to
1635 	 * reflect PMTU decreases.
1636 	 *
1637 	 * If the new MTU is higher, and the route PMTU is equal to the local
1638 	 * MTU, this means the old MTU is the lowest in the path, so allow
1639 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1640 	 * handle this.
1641 	 */
1642 
1643 	if (dst_mtu(&rt->dst) >= mtu)
1644 		return true;
1645 
1646 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1647 		return true;
1648 
1649 	return false;
1650 }
1651 
1652 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1653 				       struct fib6_info *rt, int mtu)
1654 {
1655 	struct rt6_exception_bucket *bucket;
1656 	struct rt6_exception *rt6_ex;
1657 	int i;
1658 
1659 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1660 					lockdep_is_held(&rt6_exception_lock));
1661 
1662 	if (!bucket)
1663 		return;
1664 
1665 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1666 		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1667 			struct rt6_info *entry = rt6_ex->rt6i;
1668 
1669 			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1670 			 * route), the metrics of its rt->from have already
1671 			 * been updated.
1672 			 */
1673 			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1674 			    rt6_mtu_change_route_allowed(idev, entry, mtu))
1675 				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1676 		}
1677 		bucket++;
1678 	}
1679 }
1680 
1681 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1682 
1683 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1684 					struct in6_addr *gateway)
1685 {
1686 	struct rt6_exception_bucket *bucket;
1687 	struct rt6_exception *rt6_ex;
1688 	struct hlist_node *tmp;
1689 	int i;
1690 
1691 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1692 		return;
1693 
1694 	spin_lock_bh(&rt6_exception_lock);
1695 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1696 				     lockdep_is_held(&rt6_exception_lock));
1697 
1698 	if (bucket) {
1699 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1700 			hlist_for_each_entry_safe(rt6_ex, tmp,
1701 						  &bucket->chain, hlist) {
1702 				struct rt6_info *entry = rt6_ex->rt6i;
1703 
1704 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1705 				    RTF_CACHE_GATEWAY &&
1706 				    ipv6_addr_equal(gateway,
1707 						    &entry->rt6i_gateway)) {
1708 					rt6_remove_exception(bucket, rt6_ex);
1709 				}
1710 			}
1711 			bucket++;
1712 		}
1713 	}
1714 
1715 	spin_unlock_bh(&rt6_exception_lock);
1716 }
1717 
1718 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1719 				      struct rt6_exception *rt6_ex,
1720 				      struct fib6_gc_args *gc_args,
1721 				      unsigned long now)
1722 {
1723 	struct rt6_info *rt = rt6_ex->rt6i;
1724 
1725 	/* we are pruning and obsoleting aged-out and non-gateway exceptions
1726 	 * even if others still have references to them, so that on the next
1727 	 * dst_check() such references can be dropped.
1728 	 * RTF_EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
1729 	 * expired, independently of their aging, as per RFC 8201 section 4.
1730 	 */
1731 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1732 		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1733 			RT6_TRACE("aging clone %p\n", rt);
1734 			rt6_remove_exception(bucket, rt6_ex);
1735 			return;
1736 		}
1737 	} else if (time_after(jiffies, rt->dst.expires)) {
1738 		RT6_TRACE("purging expired route %p\n", rt);
1739 		rt6_remove_exception(bucket, rt6_ex);
1740 		return;
1741 	}
1742 
1743 	if (rt->rt6i_flags & RTF_GATEWAY) {
1744 		struct neighbour *neigh;
1745 		__u8 neigh_flags = 0;
1746 
1747 		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1748 		if (neigh)
1749 			neigh_flags = neigh->flags;
1750 
1751 		if (!(neigh_flags & NTF_ROUTER)) {
1752 			RT6_TRACE("purging route %p via non-router but gateway\n",
1753 				  rt);
1754 			rt6_remove_exception(bucket, rt6_ex);
1755 			return;
1756 		}
1757 	}
1758 
1759 	gc_args->more++;
1760 }
1761 
1762 void rt6_age_exceptions(struct fib6_info *rt,
1763 			struct fib6_gc_args *gc_args,
1764 			unsigned long now)
1765 {
1766 	struct rt6_exception_bucket *bucket;
1767 	struct rt6_exception *rt6_ex;
1768 	struct hlist_node *tmp;
1769 	int i;
1770 
1771 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1772 		return;
1773 
1774 	rcu_read_lock_bh();
1775 	spin_lock(&rt6_exception_lock);
1776 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1777 				    lockdep_is_held(&rt6_exception_lock));
1778 
1779 	if (bucket) {
1780 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1781 			hlist_for_each_entry_safe(rt6_ex, tmp,
1782 						  &bucket->chain, hlist) {
1783 				rt6_age_examine_exception(bucket, rt6_ex,
1784 							  gc_args, now);
1785 			}
1786 			bucket++;
1787 		}
1788 	}
1789 	spin_unlock(&rt6_exception_lock);
1790 	rcu_read_unlock_bh();
1791 }
1792 
1793 /* must be called with rcu_read_lock held */
1794 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1795 				    int oif, struct flowi6 *fl6, int strict)
1796 {
1797 	struct fib6_node *fn, *saved_fn;
1798 	struct fib6_info *f6i;
1799 
1800 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1801 	saved_fn = fn;
1802 
1803 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1804 		oif = 0;
1805 
1806 redo_rt6_select:
1807 	f6i = rt6_select(net, fn, oif, strict);
1808 	if (f6i == net->ipv6.fib6_null_entry) {
1809 		fn = fib6_backtrack(fn, &fl6->saddr);
1810 		if (fn)
1811 			goto redo_rt6_select;
1812 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1813 			/* also consider unreachable route */
1814 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1815 			fn = saved_fn;
1816 			goto redo_rt6_select;
1817 		}
1818 	}
1819 
1820 	trace_fib6_table_lookup(net, f6i, table, fl6);
1821 
1822 	return f6i;
1823 }
1824 
1825 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1826 			       int oif, struct flowi6 *fl6,
1827 			       const struct sk_buff *skb, int flags)
1828 {
1829 	struct fib6_info *f6i;
1830 	struct rt6_info *rt;
1831 	int strict = 0;
1832 
1833 	strict |= flags & RT6_LOOKUP_F_IFACE;
1834 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1835 	if (net->ipv6.devconf_all->forwarding == 0)
1836 		strict |= RT6_LOOKUP_F_REACHABLE;
1837 
1838 	rcu_read_lock();
1839 
1840 	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1841 	if (f6i->fib6_nsiblings)
1842 		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1843 
1844 	if (f6i == net->ipv6.fib6_null_entry) {
1845 		rt = net->ipv6.ip6_null_entry;
1846 		rcu_read_unlock();
1847 		dst_hold(&rt->dst);
1848 		return rt;
1849 	}
1850 
1851 	/* Search through the exception table */
1852 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1853 	if (rt) {
1854 		if (ip6_hold_safe(net, &rt, true))
1855 			dst_use_noref(&rt->dst, jiffies);
1856 
1857 		rcu_read_unlock();
1858 		return rt;
1859 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1860 			    !(f6i->fib6_flags & RTF_GATEWAY))) {
1861 		/* Create a RTF_CACHE clone which will not be
1862 		 * owned by the fib6 tree.  It is for the special case where
1863 		 * the daddr in the skb during the neighbor look-up is different
1864 		 * from the fl6->daddr used to look-up route here.
1865 		 */
1866 		struct rt6_info *uncached_rt;
1867 
1868 		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1869 
1870 		rcu_read_unlock();
1871 
1872 		if (uncached_rt) {
1873 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1874 			 * No need for another dst_hold()
1875 			 */
1876 			rt6_uncached_list_add(uncached_rt);
1877 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1878 		} else {
1879 			uncached_rt = net->ipv6.ip6_null_entry;
1880 			dst_hold(&uncached_rt->dst);
1881 		}
1882 
1883 		return uncached_rt;
1884 	} else {
1885 		/* Get a percpu copy */
1886 
1887 		struct rt6_info *pcpu_rt;
1888 
1889 		local_bh_disable();
1890 		pcpu_rt = rt6_get_pcpu_route(f6i);
1891 
1892 		if (!pcpu_rt)
1893 			pcpu_rt = rt6_make_pcpu_route(net, f6i);
1894 
1895 		local_bh_enable();
1896 		rcu_read_unlock();
1897 
1898 		return pcpu_rt;
1899 	}
1900 }
1901 EXPORT_SYMBOL_GPL(ip6_pol_route);
1902 
1903 static struct rt6_info *ip6_pol_route_input(struct net *net,
1904 					    struct fib6_table *table,
1905 					    struct flowi6 *fl6,
1906 					    const struct sk_buff *skb,
1907 					    int flags)
1908 {
1909 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1910 }
1911 
1912 struct dst_entry *ip6_route_input_lookup(struct net *net,
1913 					 struct net_device *dev,
1914 					 struct flowi6 *fl6,
1915 					 const struct sk_buff *skb,
1916 					 int flags)
1917 {
1918 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1919 		flags |= RT6_LOOKUP_F_IFACE;
1920 
1921 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1922 }
1923 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1924 
1925 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1926 				  struct flow_keys *keys,
1927 				  struct flow_keys *flkeys)
1928 {
1929 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1930 	const struct ipv6hdr *key_iph = outer_iph;
1931 	struct flow_keys *_flkeys = flkeys;
1932 	const struct ipv6hdr *inner_iph;
1933 	const struct icmp6hdr *icmph;
1934 	struct ipv6hdr _inner_iph;
1935 	struct icmp6hdr _icmph;
1936 
1937 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1938 		goto out;
1939 
1940 	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1941 				   sizeof(_icmph), &_icmph);
1942 	if (!icmph)
1943 		goto out;
1944 
1945 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1946 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1947 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1948 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1949 		goto out;
1950 
1951 	inner_iph = skb_header_pointer(skb,
1952 				       skb_transport_offset(skb) + sizeof(*icmph),
1953 				       sizeof(_inner_iph), &_inner_iph);
1954 	if (!inner_iph)
1955 		goto out;
1956 
1957 	key_iph = inner_iph;
1958 	_flkeys = NULL;
1959 out:
1960 	if (_flkeys) {
1961 		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1962 		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1963 		keys->tags.flow_label = _flkeys->tags.flow_label;
1964 		keys->basic.ip_proto = _flkeys->basic.ip_proto;
1965 	} else {
1966 		keys->addrs.v6addrs.src = key_iph->saddr;
1967 		keys->addrs.v6addrs.dst = key_iph->daddr;
1968 		keys->tags.flow_label = ip6_flowlabel(key_iph);
1969 		keys->basic.ip_proto = key_iph->nexthdr;
1970 	}
1971 }
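
/* For ICMPv6 errors, hash on the inner (offending) header rather than
 * the outer one, so that errors are routed onto the same multipath
 * nexthop as the flow that triggered them; any precomputed flkeys are
 * discarded in that case.
 */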
1972 
1973 /* if skb is set it will be used and fl6 can be NULL */
1974 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1975 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1976 {
1977 	struct flow_keys hash_keys;
1978 	u32 mhash;
1979 
1980 	switch (ip6_multipath_hash_policy(net)) {
1981 	case 0:
1982 		memset(&hash_keys, 0, sizeof(hash_keys));
1983 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1984 		if (skb) {
1985 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1986 		} else {
1987 			hash_keys.addrs.v6addrs.src = fl6->saddr;
1988 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
1989 			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
1990 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
1991 		}
1992 		break;
1993 	case 1:
1994 		if (skb) {
1995 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1996 			struct flow_keys keys;
1997 
1998 			/* short-circuit if we already have L4 hash present */
1999 			if (skb->l4_hash)
2000 				return skb_get_hash_raw(skb) >> 1;
2001 
2002 			memset(&hash_keys, 0, sizeof(hash_keys));
2003 
2004 			if (!flkeys) {
2005 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2006 				flkeys = &keys;
2007 			}
2008 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2009 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2010 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2011 			hash_keys.ports.src = flkeys->ports.src;
2012 			hash_keys.ports.dst = flkeys->ports.dst;
2013 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2014 		} else {
2015 			memset(&hash_keys, 0, sizeof(hash_keys));
2016 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2017 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2018 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2019 			hash_keys.ports.src = fl6->fl6_sport;
2020 			hash_keys.ports.dst = fl6->fl6_dport;
2021 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2022 		}
2023 		break;
2024 	}
2025 	mhash = flow_hash_from_keys(&hash_keys);
2026 
2027 	return mhash >> 1;
2028 }
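
/* Policy 0 hashes on L3 data (addresses, flow label, next header),
 * policy 1 on the L4 five-tuple; the policy is selected by the
 * net.ipv6.fib_multipath_hash_policy sysctl. The final ">> 1" keeps
 * the hash within 31 bits, matching the nexthop upper bounds set by
 * rt6_upper_bound_set() below.
 */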
2029 
2030 void ip6_route_input(struct sk_buff *skb)
2031 {
2032 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2033 	struct net *net = dev_net(skb->dev);
2034 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2035 	struct ip_tunnel_info *tun_info;
2036 	struct flowi6 fl6 = {
2037 		.flowi6_iif = skb->dev->ifindex,
2038 		.daddr = iph->daddr,
2039 		.saddr = iph->saddr,
2040 		.flowlabel = ip6_flowinfo(iph),
2041 		.flowi6_mark = skb->mark,
2042 		.flowi6_proto = iph->nexthdr,
2043 	};
2044 	struct flow_keys *flkeys = NULL, _flkeys;
2045 
2046 	tun_info = skb_tunnel_info(skb);
2047 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2048 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2049 
2050 	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2051 		flkeys = &_flkeys;
2052 
2053 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2054 		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2055 	skb_dst_drop(skb);
2056 	skb_dst_set(skb,
2057 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2058 }
2059 
2060 static struct rt6_info *ip6_pol_route_output(struct net *net,
2061 					     struct fib6_table *table,
2062 					     struct flowi6 *fl6,
2063 					     const struct sk_buff *skb,
2064 					     int flags)
2065 {
2066 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2067 }
2068 
2069 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2070 					 struct flowi6 *fl6, int flags)
2071 {
2072 	bool any_src;
2073 
2074 	if (ipv6_addr_type(&fl6->daddr) &
2075 	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2076 		struct dst_entry *dst;
2077 
2078 		dst = l3mdev_link_scope_lookup(net, fl6);
2079 		if (dst)
2080 			return dst;
2081 	}
2082 
2083 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2084 
2085 	any_src = ipv6_addr_any(&fl6->saddr);
2086 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2087 	    (fl6->flowi6_oif && any_src))
2088 		flags |= RT6_LOOKUP_F_IFACE;
2089 
2090 	if (!any_src)
2091 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2092 	else if (sk)
2093 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2094 
2095 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2096 }
2097 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
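
/* Flag selection above: RT6_LOOKUP_F_IFACE forces a device match when
 * the socket is bound to a device, when the destination needs a strict
 * scope match, or when an oif was given without a source address.
 * RT6_LOOKUP_F_HAS_SADDR is set once the source address is known;
 * otherwise the socket's source-address selection preferences are
 * folded into the flags.
 */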
2098 
2099 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2100 {
2101 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2102 	struct net_device *loopback_dev = net->loopback_dev;
2103 	struct dst_entry *new = NULL;
2104 
2105 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2106 		       DST_OBSOLETE_DEAD, 0);
2107 	if (rt) {
2108 		rt6_info_init(rt);
2109 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2110 
2111 		new = &rt->dst;
2112 		new->__use = 1;
2113 		new->input = dst_discard;
2114 		new->output = dst_discard_out;
2115 
2116 		dst_copy_metrics(new, &ort->dst);
2117 
2118 		rt->rt6i_idev = in6_dev_get(loopback_dev);
2119 		rt->rt6i_gateway = ort->rt6i_gateway;
2120 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2121 
2122 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2123 #ifdef CONFIG_IPV6_SUBTREES
2124 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2125 #endif
2126 	}
2127 
2128 	dst_release(dst_orig);
2129 	return new ? new : ERR_PTR(-ENOMEM);
2130 }
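
/* The blackhole dst preserves the original route's metrics but
 * discards all traffic via dst_discard/dst_discard_out. It stands in
 * for dst_orig (released above) when a flow must be silently dropped,
 * e.g. by xfrm while IPsec states are still being resolved.
 */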
2131 
2132 /*
2133  *	Destination cache support functions
2134  */
2135 
2136 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2137 {
2138 	u32 rt_cookie = 0;
2139 
2140 	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2141 		return false;
2142 
2143 	if (fib6_check_expired(f6i))
2144 		return false;
2145 
2146 	return true;
2147 }
2148 
2149 static struct dst_entry *rt6_check(struct rt6_info *rt,
2150 				   struct fib6_info *from,
2151 				   u32 cookie)
2152 {
2153 	u32 rt_cookie = 0;
2154 
2155 	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2156 	    rt_cookie != cookie)
2157 		return NULL;
2158 
2159 	if (rt6_check_expired(rt))
2160 		return NULL;
2161 
2162 	return &rt->dst;
2163 }
2164 
2165 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2166 					    struct fib6_info *from,
2167 					    u32 cookie)
2168 {
2169 	if (!__rt6_check_expired(rt) &&
2170 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2171 	    fib6_check(from, cookie))
2172 		return &rt->dst;
2173 	else
2174 		return NULL;
2175 }
2176 
2177 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2178 {
2179 	struct dst_entry *dst_ret;
2180 	struct fib6_info *from;
2181 	struct rt6_info *rt;
2182 
2183 	rt = container_of(dst, struct rt6_info, dst);
2184 
2185 	rcu_read_lock();
2186 
2187 	/* All IPv6 dsts are created with ->obsolete set to
2188 	 * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
2189 	 * down into this function.
2190 	 */
2191 
2192 	from = rcu_dereference(rt->from);
2193 
2194 	if (from && (rt->rt6i_flags & RTF_PCPU ||
2195 	    unlikely(!list_empty(&rt->rt6i_uncached))))
2196 		dst_ret = rt6_dst_from_check(rt, from, cookie);
2197 	else
2198 		dst_ret = rt6_check(rt, from, cookie);
2199 
2200 	rcu_read_unlock();
2201 
2202 	return dst_ret;
2203 }
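
/* A NULL return from ip6_dst_check() tells the caller the dst is
 * stale and a new route lookup is required. The cookie is derived
 * from the fib6 node's serial number, so any change to the routing
 * tree invalidates previously handed-out dsts (see ip6_link_failure(),
 * which forces this by poisoning fn_sernum).
 */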
2204 
2205 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2206 {
2207 	struct rt6_info *rt = (struct rt6_info *) dst;
2208 
2209 	if (rt) {
2210 		if (rt->rt6i_flags & RTF_CACHE) {
2211 			rcu_read_lock();
2212 			if (rt6_check_expired(rt)) {
2213 				rt6_remove_exception_rt(rt);
2214 				dst = NULL;
2215 			}
2216 			rcu_read_unlock();
2217 		} else {
2218 			dst_release(dst);
2219 			dst = NULL;
2220 		}
2221 	}
2222 	return dst;
2223 }
2224 
2225 static void ip6_link_failure(struct sk_buff *skb)
2226 {
2227 	struct rt6_info *rt;
2228 
2229 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2230 
2231 	rt = (struct rt6_info *) skb_dst(skb);
2232 	if (rt) {
2233 		rcu_read_lock();
2234 		if (rt->rt6i_flags & RTF_CACHE) {
2235 			rt6_remove_exception_rt(rt);
2236 		} else {
2237 			struct fib6_info *from;
2238 			struct fib6_node *fn;
2239 
2240 			from = rcu_dereference(rt->from);
2241 			if (from) {
2242 				fn = rcu_dereference(from->fib6_node);
2243 				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2244 					fn->fn_sernum = -1;
2245 			}
2246 		}
2247 		rcu_read_unlock();
2248 	}
2249 }
2250 
2251 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2252 {
2253 	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2254 		struct fib6_info *from;
2255 
2256 		rcu_read_lock();
2257 		from = rcu_dereference(rt0->from);
2258 		if (from)
2259 			rt0->dst.expires = from->expires;
2260 		rcu_read_unlock();
2261 	}
2262 
2263 	dst_set_expires(&rt0->dst, timeout);
2264 	rt0->rt6i_flags |= RTF_EXPIRES;
2265 }
2266 
2267 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2268 {
2269 	struct net *net = dev_net(rt->dst.dev);
2270 
2271 	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2272 	rt->rt6i_flags |= RTF_MODIFIED;
2273 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2274 }
2275 
2276 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2277 {
2278 	bool from_set;
2279 
2280 	rcu_read_lock();
2281 	from_set = !!rcu_dereference(rt->from);
2282 	rcu_read_unlock();
2283 
2284 	return !(rt->rt6i_flags & RTF_CACHE) &&
2285 		(rt->rt6i_flags & RTF_PCPU || from_set);
2286 }
2287 
2288 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2289 				 const struct ipv6hdr *iph, u32 mtu)
2290 {
2291 	const struct in6_addr *daddr, *saddr;
2292 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2293 
2294 	if (dst_metric_locked(dst, RTAX_MTU))
2295 		return;
2296 
2297 	if (iph) {
2298 		daddr = &iph->daddr;
2299 		saddr = &iph->saddr;
2300 	} else if (sk) {
2301 		daddr = &sk->sk_v6_daddr;
2302 		saddr = &inet6_sk(sk)->saddr;
2303 	} else {
2304 		daddr = NULL;
2305 		saddr = NULL;
2306 	}
2307 	dst_confirm_neigh(dst, daddr);
2308 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2309 	if (mtu >= dst_mtu(dst))
2310 		return;
2311 
2312 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2313 		rt6_do_update_pmtu(rt6, mtu);
2314 		/* update rt6_ex->stamp for cache */
2315 		if (rt6->rt6i_flags & RTF_CACHE)
2316 			rt6_update_exception_stamp_rt(rt6);
2317 	} else if (daddr) {
2318 		struct fib6_info *from;
2319 		struct rt6_info *nrt6;
2320 
2321 		rcu_read_lock();
2322 		from = rcu_dereference(rt6->from);
2323 		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2324 		if (nrt6) {
2325 			rt6_do_update_pmtu(nrt6, mtu);
2326 			if (rt6_insert_exception(nrt6, from))
2327 				dst_release_immediate(&nrt6->dst);
2328 		}
2329 		rcu_read_unlock();
2330 	}
2331 }
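
/* PMTU updates never raise the MTU and are clamped to at least
 * IPV6_MIN_MTU (1280). RTF_CACHE entries are updated in place (and
 * their exception timestamp refreshed); for dsts still tied to the
 * tree (per-cpu copies or routes with a valid ->from), a private
 * RTF_CACHE exception is created so the reduced MTU affects only
 * this destination.
 */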
2332 
2333 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2334 			       struct sk_buff *skb, u32 mtu)
2335 {
2336 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2337 }
2338 
2339 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2340 		     int oif, u32 mark, kuid_t uid)
2341 {
2342 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2343 	struct dst_entry *dst;
2344 	struct flowi6 fl6 = {
2345 		.flowi6_oif = oif,
2346 		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2347 		.daddr = iph->daddr,
2348 		.saddr = iph->saddr,
2349 		.flowlabel = ip6_flowinfo(iph),
2350 		.flowi6_uid = uid,
2351 	};
2352 
2353 	dst = ip6_route_output(net, NULL, &fl6);
2354 	if (!dst->error)
2355 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2356 	dst_release(dst);
2357 }
2358 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2359 
2360 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2361 {
2362 	int oif = sk->sk_bound_dev_if;
2363 	struct dst_entry *dst;
2364 
2365 	if (!oif && skb->dev)
2366 		oif = l3mdev_master_ifindex(skb->dev);
2367 
2368 	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2369 
2370 	dst = __sk_dst_get(sk);
2371 	if (!dst || !dst->obsolete ||
2372 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2373 		return;
2374 
2375 	bh_lock_sock(sk);
2376 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2377 		ip6_datagram_dst_update(sk, false);
2378 	bh_unlock_sock(sk);
2379 }
2380 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2381 
2382 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2383 			   const struct flowi6 *fl6)
2384 {
2385 #ifdef CONFIG_IPV6_SUBTREES
2386 	struct ipv6_pinfo *np = inet6_sk(sk);
2387 #endif
2388 
2389 	ip6_dst_store(sk, dst,
2390 		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2391 		      &sk->sk_v6_daddr : NULL,
2392 #ifdef CONFIG_IPV6_SUBTREES
2393 		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2394 		      &np->saddr :
2395 #endif
2396 		      NULL);
2397 }
2398 
2399 /* Handle redirects */
2400 struct ip6rd_flowi {
2401 	struct flowi6 fl6;
2402 	struct in6_addr gateway;
2403 };
2404 
2405 static struct rt6_info *__ip6_route_redirect(struct net *net,
2406 					     struct fib6_table *table,
2407 					     struct flowi6 *fl6,
2408 					     const struct sk_buff *skb,
2409 					     int flags)
2410 {
2411 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2412 	struct rt6_info *ret = NULL, *rt_cache;
2413 	struct fib6_info *rt;
2414 	struct fib6_node *fn;
2415 
2416 	/* Get the "current" route for this destination and
2417 	 * check if the redirect has come from the appropriate router.
2418 	 *
2419 	 * RFC 4861 specifies that redirects should only be
2420 	 * accepted if they come from the nexthop to the target.
2421 	 * Due to the way the routes are chosen, this notion
2422 	 * is a bit fuzzy and one might need to check all possible
2423 	 * routes.
2424 	 */
2425 
2426 	rcu_read_lock();
2427 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2428 restart:
2429 	for_each_fib6_node_rt_rcu(fn) {
2430 		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2431 			continue;
2432 		if (fib6_check_expired(rt))
2433 			continue;
2434 		if (rt->fib6_flags & RTF_REJECT)
2435 			break;
2436 		if (!(rt->fib6_flags & RTF_GATEWAY))
2437 			continue;
2438 		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2439 			continue;
2440 		/* rt_cache's gateway might be different from its 'parent'
2441 		 * in the case of an ip redirect.
2442 		 * So we keep searching in the exception table if the gateway
2443 		 * is different.
2444 		 */
2445 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2446 			rt_cache = rt6_find_cached_rt(rt,
2447 						      &fl6->daddr,
2448 						      &fl6->saddr);
2449 			if (rt_cache &&
2450 			    ipv6_addr_equal(&rdfl->gateway,
2451 					    &rt_cache->rt6i_gateway)) {
2452 				ret = rt_cache;
2453 				break;
2454 			}
2455 			continue;
2456 		}
2457 		break;
2458 	}
2459 
2460 	if (!rt)
2461 		rt = net->ipv6.fib6_null_entry;
2462 	else if (rt->fib6_flags & RTF_REJECT) {
2463 		ret = net->ipv6.ip6_null_entry;
2464 		goto out;
2465 	}
2466 
2467 	if (rt == net->ipv6.fib6_null_entry) {
2468 		fn = fib6_backtrack(fn, &fl6->saddr);
2469 		if (fn)
2470 			goto restart;
2471 	}
2472 
2473 out:
2474 	if (ret)
2475 		ip6_hold_safe(net, &ret, true);
2476 	else
2477 		ret = ip6_create_rt_rcu(rt);
2478 
2479 	rcu_read_unlock();
2480 
2481 	trace_fib6_table_lookup(net, rt, table, fl6);
2482 	return ret;
2483 }
2484 
2485 static struct dst_entry *ip6_route_redirect(struct net *net,
2486 					    const struct flowi6 *fl6,
2487 					    const struct sk_buff *skb,
2488 					    const struct in6_addr *gateway)
2489 {
2490 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2491 	struct ip6rd_flowi rdfl;
2492 
2493 	rdfl.fl6 = *fl6;
2494 	rdfl.gateway = *gateway;
2495 
2496 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2497 				flags, __ip6_route_redirect);
2498 }
2499 
2500 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2501 		  kuid_t uid)
2502 {
2503 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2504 	struct dst_entry *dst;
2505 	struct flowi6 fl6 = {
2506 		.flowi6_iif = LOOPBACK_IFINDEX,
2507 		.flowi6_oif = oif,
2508 		.flowi6_mark = mark,
2509 		.daddr = iph->daddr,
2510 		.saddr = iph->saddr,
2511 		.flowlabel = ip6_flowinfo(iph),
2512 		.flowi6_uid = uid,
2513 	};
2514 
2515 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2516 	rt6_do_redirect(dst, NULL, skb);
2517 	dst_release(dst);
2518 }
2519 EXPORT_SYMBOL_GPL(ip6_redirect);
2520 
2521 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2522 {
2523 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2524 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2525 	struct dst_entry *dst;
2526 	struct flowi6 fl6 = {
2527 		.flowi6_iif = LOOPBACK_IFINDEX,
2528 		.flowi6_oif = oif,
2529 		.daddr = msg->dest,
2530 		.saddr = iph->daddr,
2531 		.flowi6_uid = sock_net_uid(net, NULL),
2532 	};
2533 
2534 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2535 	rt6_do_redirect(dst, NULL, skb);
2536 	dst_release(dst);
2537 }
2538 
2539 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2540 {
2541 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2542 		     sk->sk_uid);
2543 }
2544 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2545 
2546 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2547 {
2548 	struct net_device *dev = dst->dev;
2549 	unsigned int mtu = dst_mtu(dst);
2550 	struct net *net = dev_net(dev);
2551 
2552 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2553 
2554 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2555 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2556 
2557 	/*
2558 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2559 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2560 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2561 	 * rely only on pmtu discovery"
2562 	 */
2563 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2564 		mtu = IPV6_MAXPLEN;
2565 	return mtu;
2566 }
2567 
2568 static unsigned int ip6_mtu(const struct dst_entry *dst)
2569 {
2570 	struct inet6_dev *idev;
2571 	unsigned int mtu;
2572 
2573 	mtu = dst_metric_raw(dst, RTAX_MTU);
2574 	if (mtu)
2575 		goto out;
2576 
2577 	mtu = IPV6_MIN_MTU;
2578 
2579 	rcu_read_lock();
2580 	idev = __in6_dev_get(dst->dev);
2581 	if (idev)
2582 		mtu = idev->cnf.mtu6;
2583 	rcu_read_unlock();
2584 
2585 out:
2586 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2587 
2588 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2589 }
2590 
2591 /* MTU selection:
2592  * 1. mtu on route is locked - use it
2593  * 2. mtu from nexthop exception
2594  * 3. mtu from egress device
2595  *
2596  * based on ip6_dst_mtu_forward and exception logic of
2597  * rt6_find_cached_rt; called with rcu_read_lock held
2598  */
2599 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2600 		      struct in6_addr *saddr)
2601 {
2602 	struct rt6_exception_bucket *bucket;
2603 	struct rt6_exception *rt6_ex;
2604 	struct in6_addr *src_key;
2605 	struct inet6_dev *idev;
2606 	u32 mtu = 0;
2607 
2608 	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2609 		mtu = f6i->fib6_pmtu;
2610 		if (mtu)
2611 			goto out;
2612 	}
2613 
2614 	src_key = NULL;
2615 #ifdef CONFIG_IPV6_SUBTREES
2616 	if (f6i->fib6_src.plen)
2617 		src_key = saddr;
2618 #endif
2619 
2620 	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2621 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2622 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2623 		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2624 
2625 	if (likely(!mtu)) {
2626 		struct net_device *dev = fib6_info_nh_dev(f6i);
2627 
2628 		mtu = IPV6_MIN_MTU;
2629 		idev = __in6_dev_get(dev);
2630 		if (idev && idev->cnf.mtu6 > mtu)
2631 			mtu = idev->cnf.mtu6;
2632 	}
2633 
2634 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2635 out:
2636 	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2637 }
2638 
2639 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2640 				  struct flowi6 *fl6)
2641 {
2642 	struct dst_entry *dst;
2643 	struct rt6_info *rt;
2644 	struct inet6_dev *idev = in6_dev_get(dev);
2645 	struct net *net = dev_net(dev);
2646 
2647 	if (unlikely(!idev))
2648 		return ERR_PTR(-ENODEV);
2649 
2650 	rt = ip6_dst_alloc(net, dev, 0);
2651 	if (unlikely(!rt)) {
2652 		in6_dev_put(idev);
2653 		dst = ERR_PTR(-ENOMEM);
2654 		goto out;
2655 	}
2656 
2657 	rt->dst.flags |= DST_HOST;
2658 	rt->dst.input = ip6_input;
2659 	rt->dst.output  = ip6_output;
2660 	rt->rt6i_gateway  = fl6->daddr;
2661 	rt->rt6i_dst.addr = fl6->daddr;
2662 	rt->rt6i_dst.plen = 128;
2663 	rt->rt6i_idev     = idev;
2664 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2665 
2666 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2667 	 * properly release the net_device.
2668 	 */
2669 	rt6_uncached_list_add(rt);
2670 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2671 
2672 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2673 
2674 out:
2675 	return dst;
2676 }
2677 
2678 static int ip6_dst_gc(struct dst_ops *ops)
2679 {
2680 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2681 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2682 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2683 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2684 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2685 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2686 	int entries;
2687 
2688 	entries = dst_entries_get_fast(ops);
2689 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2690 	    entries <= rt_max_size)
2691 		goto out;
2692 
2693 	net->ipv6.ip6_rt_gc_expire++;
2694 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2695 	entries = dst_entries_get_slow(ops);
2696 	if (entries < ops->gc_thresh)
2697 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2698 out:
2699 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2700 	return entries > rt_max_size;
2701 }
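
/* Garbage collection is skipped while the table holds fewer than
 * ip6_rt_max_size entries and the minimum interval since the last run
 * has not yet elapsed. ip6_rt_gc_expire is the age threshold handed
 * to fib6_run_gc(): it is reset to half of ip6_rt_gc_timeout once
 * entries drop below gc_thresh, and it decays by expire >> elasticity
 * on every invocation. A nonzero return (still above rt_max_size)
 * makes dst_alloc() refuse new entries.
 */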
2702 
2703 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2704 					    struct fib6_config *cfg,
2705 					    const struct in6_addr *gw_addr,
2706 					    u32 tbid, int flags)
2707 {
2708 	struct flowi6 fl6 = {
2709 		.flowi6_oif = cfg->fc_ifindex,
2710 		.daddr = *gw_addr,
2711 		.saddr = cfg->fc_prefsrc,
2712 	};
2713 	struct fib6_table *table;
2714 	struct rt6_info *rt;
2715 
2716 	table = fib6_get_table(net, tbid);
2717 	if (!table)
2718 		return NULL;
2719 
2720 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2721 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2722 
2723 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2724 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2725 
2726 	/* if table lookup failed, fall back to full lookup */
2727 	if (rt == net->ipv6.ip6_null_entry) {
2728 		ip6_rt_put(rt);
2729 		rt = NULL;
2730 	}
2731 
2732 	return rt;
2733 }
2734 
2735 static int ip6_route_check_nh_onlink(struct net *net,
2736 				     struct fib6_config *cfg,
2737 				     const struct net_device *dev,
2738 				     struct netlink_ext_ack *extack)
2739 {
2740 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2741 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2742 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2743 	struct rt6_info *grt;
2744 	int err;
2745 
2746 	err = 0;
2747 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2748 	if (grt) {
2749 		if (!grt->dst.error &&
2750 		    /* ignore match if it is the default route */
2751 		    grt->from && !ipv6_addr_any(&grt->from->fib6_dst.addr) &&
2752 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2753 			NL_SET_ERR_MSG(extack,
2754 				       "Nexthop has invalid gateway or device mismatch");
2755 			err = -EINVAL;
2756 		}
2757 
2758 		ip6_rt_put(grt);
2759 	}
2760 
2761 	return err;
2762 }
2763 
2764 static int ip6_route_check_nh(struct net *net,
2765 			      struct fib6_config *cfg,
2766 			      struct net_device **_dev,
2767 			      struct inet6_dev **idev)
2768 {
2769 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2770 	struct net_device *dev = _dev ? *_dev : NULL;
2771 	struct rt6_info *grt = NULL;
2772 	int err = -EHOSTUNREACH;
2773 
2774 	if (cfg->fc_table) {
2775 		int flags = RT6_LOOKUP_F_IFACE;
2776 
2777 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2778 					  cfg->fc_table, flags);
2779 		if (grt) {
2780 			if (grt->rt6i_flags & RTF_GATEWAY ||
2781 			    (dev && dev != grt->dst.dev)) {
2782 				ip6_rt_put(grt);
2783 				grt = NULL;
2784 			}
2785 		}
2786 	}
2787 
2788 	if (!grt)
2789 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2790 
2791 	if (!grt)
2792 		goto out;
2793 
2794 	if (dev) {
2795 		if (dev != grt->dst.dev) {
2796 			ip6_rt_put(grt);
2797 			goto out;
2798 		}
2799 	} else {
2800 		*_dev = dev = grt->dst.dev;
2801 		*idev = grt->rt6i_idev;
2802 		dev_hold(dev);
2803 		in6_dev_hold(grt->rt6i_idev);
2804 	}
2805 
2806 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2807 		err = 0;
2808 
2809 	ip6_rt_put(grt);
2810 
2811 out:
2812 	return err;
2813 }
2814 
2815 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2816 			   struct net_device **_dev, struct inet6_dev **idev,
2817 			   struct netlink_ext_ack *extack)
2818 {
2819 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2820 	int gwa_type = ipv6_addr_type(gw_addr);
2821 	bool skip_dev = !(gwa_type & IPV6_ADDR_LINKLOCAL);
2822 	const struct net_device *dev = *_dev;
2823 	bool need_addr_check = !dev;
2824 	int err = -EINVAL;
2825 
2826 	/* If gw_addr is local we will fail to detect this if the
2827 	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2828 	 * will return the already-added prefix route via the interface
2829 	 * the prefix route was assigned to, which might be non-loopback.
2830 	 */
2831 	if (dev &&
2832 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2833 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2834 		goto out;
2835 	}
2836 
2837 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2838 		/* IPv6 strictly inhibits using non-link-local
2839 		 * addresses as nexthop addresses.
2840 		 * Otherwise, the router will not be able to send redirects.
2841 		 * It is very good, but in some (rare!) circumstances
2842 		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2843 		 * some exceptions. --ANK
2844 		 * We allow IPv4-mapped nexthops to support RFC 4798-type
2845 		 * addressing.
2846 		 */
2847 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2848 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2849 			goto out;
2850 		}
2851 
2852 		if (cfg->fc_flags & RTNH_F_ONLINK)
2853 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2854 		else
2855 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2856 
2857 		if (err)
2858 			goto out;
2859 	}
2860 
2861 	/* reload in case device was changed */
2862 	dev = *_dev;
2863 
2864 	err = -EINVAL;
2865 	if (!dev) {
2866 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2867 		goto out;
2868 	} else if (dev->flags & IFF_LOOPBACK) {
2869 		NL_SET_ERR_MSG(extack,
2870 			       "Egress device can not be loopback device for this route");
2871 		goto out;
2872 	}
2873 
2874 	/* if we did not check gw_addr above, do so now that the
2875 	 * egress device has been resolved.
2876 	 */
2877 	if (need_addr_check &&
2878 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2879 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2880 		goto out;
2881 	}
2882 
2883 	err = 0;
2884 out:
2885 	return err;
2886 }
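
/* Gateway validation above, in order: the gateway must not be a local
 * address; a gateway that is not link-local (IPv4-mapped excepted)
 * must be reachable through an existing route, or pass the onlink
 * check when RTNH_F_ONLINK is set; the resolved egress device must
 * exist and must not be loopback; and the local-address check is
 * repeated if the device was only determined during resolution.
 */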
2887 
2888 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2889 					      gfp_t gfp_flags,
2890 					      struct netlink_ext_ack *extack)
2891 {
2892 	struct net *net = cfg->fc_nlinfo.nl_net;
2893 	struct fib6_info *rt = NULL;
2894 	struct net_device *dev = NULL;
2895 	struct inet6_dev *idev = NULL;
2896 	struct fib6_table *table;
2897 	int addr_type;
2898 	int err = -EINVAL;
2899 
2900 	/* RTF_PCPU is an internal flag; can not be set by userspace */
2901 	if (cfg->fc_flags & RTF_PCPU) {
2902 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2903 		goto out;
2904 	}
2905 
2906 	/* RTF_CACHE is an internal flag; can not be set by userspace */
2907 	if (cfg->fc_flags & RTF_CACHE) {
2908 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2909 		goto out;
2910 	}
2911 
2912 	if (cfg->fc_type > RTN_MAX) {
2913 		NL_SET_ERR_MSG(extack, "Invalid route type");
2914 		goto out;
2915 	}
2916 
2917 	if (cfg->fc_dst_len > 128) {
2918 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
2919 		goto out;
2920 	}
2921 	if (cfg->fc_src_len > 128) {
2922 		NL_SET_ERR_MSG(extack, "Invalid source address length");
2923 		goto out;
2924 	}
2925 #ifndef CONFIG_IPV6_SUBTREES
2926 	if (cfg->fc_src_len) {
2927 		NL_SET_ERR_MSG(extack,
2928 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
2929 		goto out;
2930 	}
2931 #endif
2932 	if (cfg->fc_ifindex) {
2933 		err = -ENODEV;
2934 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2935 		if (!dev)
2936 			goto out;
2937 		idev = in6_dev_get(dev);
2938 		if (!idev)
2939 			goto out;
2940 	}
2941 
2942 	if (cfg->fc_metric == 0)
2943 		cfg->fc_metric = IP6_RT_PRIO_USER;
2944 
2945 	if (cfg->fc_flags & RTNH_F_ONLINK) {
2946 		if (!dev) {
2947 			NL_SET_ERR_MSG(extack,
2948 				       "Nexthop device required for onlink");
2949 			err = -ENODEV;
2950 			goto out;
2951 		}
2952 
2953 		if (!(dev->flags & IFF_UP)) {
2954 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2955 			err = -ENETDOWN;
2956 			goto out;
2957 		}
2958 	}
2959 
2960 	err = -ENOBUFS;
2961 	if (cfg->fc_nlinfo.nlh &&
2962 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2963 		table = fib6_get_table(net, cfg->fc_table);
2964 		if (!table) {
2965 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2966 			table = fib6_new_table(net, cfg->fc_table);
2967 		}
2968 	} else {
2969 		table = fib6_new_table(net, cfg->fc_table);
2970 	}
2971 
2972 	if (!table)
2973 		goto out;
2974 
2975 	err = -ENOMEM;
2976 	rt = fib6_info_alloc(gfp_flags);
2977 	if (!rt)
2978 		goto out;
2979 
2980 	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
2981 					       extack);
2982 	if (IS_ERR(rt->fib6_metrics)) {
2983 		err = PTR_ERR(rt->fib6_metrics);
2984 		/* Do not leave garbage there. */
2985 		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
2986 		goto out;
2987 	}
2988 
2989 	if (cfg->fc_flags & RTF_ADDRCONF)
2990 		rt->dst_nocount = true;
2991 
2992 	if (cfg->fc_flags & RTF_EXPIRES)
2993 		fib6_set_expires(rt, jiffies +
2994 				clock_t_to_jiffies(cfg->fc_expires));
2995 	else
2996 		fib6_clean_expires(rt);
2997 
2998 	if (cfg->fc_protocol == RTPROT_UNSPEC)
2999 		cfg->fc_protocol = RTPROT_BOOT;
3000 	rt->fib6_protocol = cfg->fc_protocol;
3001 
3002 	addr_type = ipv6_addr_type(&cfg->fc_dst);
3003 
3004 	if (cfg->fc_encap) {
3005 		struct lwtunnel_state *lwtstate;
3006 
3007 		err = lwtunnel_build_state(cfg->fc_encap_type,
3008 					   cfg->fc_encap, AF_INET6, cfg,
3009 					   &lwtstate, extack);
3010 		if (err)
3011 			goto out;
3012 		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3013 	}
3014 
3015 	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3016 	rt->fib6_dst.plen = cfg->fc_dst_len;
3017 	if (rt->fib6_dst.plen == 128)
3018 		rt->dst_host = true;
3019 
3020 #ifdef CONFIG_IPV6_SUBTREES
3021 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3022 	rt->fib6_src.plen = cfg->fc_src_len;
3023 #endif
3024 
3025 	rt->fib6_metric = cfg->fc_metric;
3026 	rt->fib6_nh.nh_weight = 1;
3027 
3028 	rt->fib6_type = cfg->fc_type;
3029 
3030 	/* We cannot add true routes via loopback here; they would
3031 	 * result in kernel looping. Promote them to reject routes.
3032 	 */
3033 	if ((cfg->fc_flags & RTF_REJECT) ||
3034 	    (dev && (dev->flags & IFF_LOOPBACK) &&
3035 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
3036 	     !(cfg->fc_flags & RTF_LOCAL))) {
3037 		/* hold loopback dev/idev if we haven't done so. */
3038 		if (dev != net->loopback_dev) {
3039 			if (dev) {
3040 				dev_put(dev);
3041 				in6_dev_put(idev);
3042 			}
3043 			dev = net->loopback_dev;
3044 			dev_hold(dev);
3045 			idev = in6_dev_get(dev);
3046 			if (!idev) {
3047 				err = -ENODEV;
3048 				goto out;
3049 			}
3050 		}
3051 		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3052 		goto install_route;
3053 	}
3054 
3055 	if (cfg->fc_flags & RTF_GATEWAY) {
3056 		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3057 		if (err)
3058 			goto out;
3059 
3060 		rt->fib6_nh.nh_gw = cfg->fc_gateway;
3061 	}
3062 
3063 	err = -ENODEV;
3064 	if (!dev)
3065 		goto out;
3066 
3067 	if (idev->cnf.disable_ipv6) {
3068 		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3069 		err = -EACCES;
3070 		goto out;
3071 	}
3072 
3073 	if (!(dev->flags & IFF_UP)) {
3074 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3075 		err = -ENETDOWN;
3076 		goto out;
3077 	}
3078 
3079 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3080 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3081 			NL_SET_ERR_MSG(extack, "Invalid source address");
3082 			err = -EINVAL;
3083 			goto out;
3084 		}
3085 		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3086 		rt->fib6_prefsrc.plen = 128;
3087 	} else
3088 		rt->fib6_prefsrc.plen = 0;
3089 
3090 	rt->fib6_flags = cfg->fc_flags;
3091 
3092 install_route:
3093 	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3094 	    !netif_carrier_ok(dev))
3095 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3096 	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3097 	rt->fib6_nh.nh_dev = dev;
3098 	rt->fib6_table = table;
3099 
3100 	if (idev)
3101 		in6_dev_put(idev);
3102 
3103 	return rt;
3104 out:
3105 	if (dev)
3106 		dev_put(dev);
3107 	if (idev)
3108 		in6_dev_put(idev);
3109 
3110 	fib6_info_release(rt);
3111 	return ERR_PTR(err);
3112 }
3113 
3114 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3115 		  struct netlink_ext_ack *extack)
3116 {
3117 	struct fib6_info *rt;
3118 	int err;
3119 
3120 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3121 	if (IS_ERR(rt))
3122 		return PTR_ERR(rt);
3123 
3124 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3125 	fib6_info_release(rt);
3126 
3127 	return err;
3128 }
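
/* Illustrative sketch (not from this file): adding a static route from
 * kernel code, with hypothetical "net", "dev" and "prefix" variables
 * in scope and rtnl_lock() held, as ipv6_route_ioctl() below does.
 * The fields mirror what rtmsg_to_fib6_config() fills in:
 *
 *	struct fib6_config cfg = {
 *		.fc_table	= RT6_TABLE_MAIN,
 *		.fc_ifindex	= dev->ifindex,
 *		.fc_dst_len	= 64,
 *		.fc_flags	= RTF_UP,
 *		.fc_nlinfo.nl_net = net,
 *	};
 *	int err;
 *
 *	cfg.fc_dst = prefix;	(a struct in6_addr holding the /64 prefix)
 *	err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
 *
 * A zero fc_metric is promoted to IP6_RT_PRIO_USER by
 * ip6_route_info_create().
 */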
3129 
3130 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3131 {
3132 	struct net *net = info->nl_net;
3133 	struct fib6_table *table;
3134 	int err;
3135 
3136 	if (rt == net->ipv6.fib6_null_entry) {
3137 		err = -ENOENT;
3138 		goto out;
3139 	}
3140 
3141 	table = rt->fib6_table;
3142 	spin_lock_bh(&table->tb6_lock);
3143 	err = fib6_del(rt, info);
3144 	spin_unlock_bh(&table->tb6_lock);
3145 
3146 out:
3147 	fib6_info_release(rt);
3148 	return err;
3149 }
3150 
3151 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3152 {
3153 	struct nl_info info = { .nl_net = net };
3154 
3155 	return __ip6_del_rt(rt, &info);
3156 }
3157 
3158 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3159 {
3160 	struct nl_info *info = &cfg->fc_nlinfo;
3161 	struct net *net = info->nl_net;
3162 	struct sk_buff *skb = NULL;
3163 	struct fib6_table *table;
3164 	int err = -ENOENT;
3165 
3166 	if (rt == net->ipv6.fib6_null_entry)
3167 		goto out_put;
3168 	table = rt->fib6_table;
3169 	spin_lock_bh(&table->tb6_lock);
3170 
3171 	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3172 		struct fib6_info *sibling, *next_sibling;
3173 
3174 		/* prefer to send a single notification with all hops */
3175 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3176 		if (skb) {
3177 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3178 
3179 			if (rt6_fill_node(net, skb, rt, NULL,
3180 					  NULL, NULL, 0, RTM_DELROUTE,
3181 					  info->portid, seq, 0) < 0) {
3182 				kfree_skb(skb);
3183 				skb = NULL;
3184 			} else
3185 				info->skip_notify = 1;
3186 		}
3187 
3188 		list_for_each_entry_safe(sibling, next_sibling,
3189 					 &rt->fib6_siblings,
3190 					 fib6_siblings) {
3191 			err = fib6_del(sibling, info);
3192 			if (err)
3193 				goto out_unlock;
3194 		}
3195 	}
3196 
3197 	err = fib6_del(rt, info);
3198 out_unlock:
3199 	spin_unlock_bh(&table->tb6_lock);
3200 out_put:
3201 	fib6_info_release(rt);
3202 
3203 	if (skb) {
3204 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3205 			    info->nlh, gfp_any());
3206 	}
3207 	return err;
3208 }
3209 
3210 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3211 {
3212 	int rc = -ESRCH;
3213 
3214 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3215 		goto out;
3216 
3217 	if (cfg->fc_flags & RTF_GATEWAY &&
3218 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3219 		goto out;
3220 
3221 	rc = rt6_remove_exception_rt(rt);
3222 out:
3223 	return rc;
3224 }
3225 
3226 static int ip6_route_del(struct fib6_config *cfg,
3227 			 struct netlink_ext_ack *extack)
3228 {
3229 	struct rt6_info *rt_cache;
3230 	struct fib6_table *table;
3231 	struct fib6_info *rt;
3232 	struct fib6_node *fn;
3233 	int err = -ESRCH;
3234 
3235 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3236 	if (!table) {
3237 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3238 		return err;
3239 	}
3240 
3241 	rcu_read_lock();
3242 
3243 	fn = fib6_locate(&table->tb6_root,
3244 			 &cfg->fc_dst, cfg->fc_dst_len,
3245 			 &cfg->fc_src, cfg->fc_src_len,
3246 			 !(cfg->fc_flags & RTF_CACHE));
3247 
3248 	if (fn) {
3249 		for_each_fib6_node_rt_rcu(fn) {
3250 			if (cfg->fc_flags & RTF_CACHE) {
3251 				int rc;
3252 
3253 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3254 							      &cfg->fc_src);
3255 				if (rt_cache) {
3256 					rc = ip6_del_cached_rt(rt_cache, cfg);
3257 					if (rc != -ESRCH) {
3258 						rcu_read_unlock();
3259 						return rc;
3260 					}
3261 				}
3262 				continue;
3263 			}
3264 			if (cfg->fc_ifindex &&
3265 			    (!rt->fib6_nh.nh_dev ||
3266 			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3267 				continue;
3268 			if (cfg->fc_flags & RTF_GATEWAY &&
3269 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3270 				continue;
3271 			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3272 				continue;
3273 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3274 				continue;
3275 			if (!fib6_info_hold_safe(rt))
3276 				continue;
3277 			rcu_read_unlock();
3278 
3279 			/* if gateway was specified only delete the one hop */
3280 			if (cfg->fc_flags & RTF_GATEWAY)
3281 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3282 
3283 			return __ip6_del_rt_siblings(rt, cfg);
3284 		}
3285 	}
3286 	rcu_read_unlock();
3287 
3288 	return err;
3289 }
3290 
3291 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3292 {
3293 	struct netevent_redirect netevent;
3294 	struct rt6_info *rt, *nrt = NULL;
3295 	struct ndisc_options ndopts;
3296 	struct inet6_dev *in6_dev;
3297 	struct neighbour *neigh;
3298 	struct fib6_info *from;
3299 	struct rd_msg *msg;
3300 	int optlen, on_link;
3301 	u8 *lladdr;
3302 
3303 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3304 	optlen -= sizeof(*msg);
3305 
3306 	if (optlen < 0) {
3307 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3308 		return;
3309 	}
3310 
3311 	msg = (struct rd_msg *)icmp6_hdr(skb);
3312 
3313 	if (ipv6_addr_is_multicast(&msg->dest)) {
3314 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3315 		return;
3316 	}
3317 
3318 	on_link = 0;
3319 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3320 		on_link = 1;
3321 	} else if (ipv6_addr_type(&msg->target) !=
3322 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3323 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3324 		return;
3325 	}
3326 
3327 	in6_dev = __in6_dev_get(skb->dev);
3328 	if (!in6_dev)
3329 		return;
3330 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3331 		return;
3332 
3333 	/* RFC2461 8.1:
3334 	 *	The IP source address of the Redirect MUST be the same as the current
3335 	 *	first-hop router for the specified ICMP Destination Address.
3336 	 */
3337 
3338 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3339 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3340 		return;
3341 	}
3342 
3343 	lladdr = NULL;
3344 	if (ndopts.nd_opts_tgt_lladdr) {
3345 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3346 					     skb->dev);
3347 		if (!lladdr) {
3348 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3349 			return;
3350 		}
3351 	}
3352 
3353 	rt = (struct rt6_info *) dst;
3354 	if (rt->rt6i_flags & RTF_REJECT) {
3355 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3356 		return;
3357 	}
3358 
3359 	/* Redirect received -> path was valid.
3360 	 * Look, redirects are sent only in response to data packets,
3361 	 * so that this nexthop apparently is reachable. --ANK
3362 	 */
3363 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3364 
3365 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3366 	if (!neigh)
3367 		return;
3368 
3369 	/*
3370 	 *	We have finally decided to accept it.
3371 	 */
3372 
3373 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3374 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3375 		     NEIGH_UPDATE_F_OVERRIDE|
3376 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3377 				     NEIGH_UPDATE_F_ISROUTER)),
3378 		     NDISC_REDIRECT, &ndopts);
3379 
3380 	rcu_read_lock();
3381 	from = rcu_dereference(rt->from);
3382 	/* This fib6_info_hold() is safe here because we hold a reference
3383 	 * to rt and rt already holds a reference to the fib6_info.
3384 	 */
3385 	fib6_info_hold(from);
3386 	rcu_read_unlock();
3387 
3388 	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3389 	if (!nrt)
3390 		goto out;
3391 
3392 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3393 	if (on_link)
3394 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3395 
3396 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3397 
3398 	/* No need to remove rt from the exception table if rt is
3399 	 * a cached route because rt6_insert_exception() will
3400 	 * take care of it.
3401 	 */
3402 	if (rt6_insert_exception(nrt, from)) {
3403 		dst_release_immediate(&nrt->dst);
3404 		goto out;
3405 	}
3406 
3407 	netevent.old = &rt->dst;
3408 	netevent.new = &nrt->dst;
3409 	netevent.daddr = &msg->dest;
3410 	netevent.neigh = neigh;
3411 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3412 
3413 out:
3414 	fib6_info_release(from);
3415 	neigh_release(neigh);
3416 }
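
/* Summary of rt6_do_redirect(): the redirect is validated (minimum
 * length, non-multicast destination, link-local unicast target per
 * RFC 4861), the neighbour cache is updated from the target lladdr
 * option, and an RTF_CACHE clone pointing at the new gateway is
 * inserted into the exception table; NETEVENT_REDIRECT notifies any
 * interested listeners.
 */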
3417 
3418 #ifdef CONFIG_IPV6_ROUTE_INFO
3419 static struct fib6_info *rt6_get_route_info(struct net *net,
3420 					   const struct in6_addr *prefix, int prefixlen,
3421 					   const struct in6_addr *gwaddr,
3422 					   struct net_device *dev)
3423 {
3424 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3425 	int ifindex = dev->ifindex;
3426 	struct fib6_node *fn;
3427 	struct fib6_info *rt = NULL;
3428 	struct fib6_table *table;
3429 
3430 	table = fib6_get_table(net, tb_id);
3431 	if (!table)
3432 		return NULL;
3433 
3434 	rcu_read_lock();
3435 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3436 	if (!fn)
3437 		goto out;
3438 
3439 	for_each_fib6_node_rt_rcu(fn) {
3440 		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3441 			continue;
3442 		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3443 			continue;
3444 		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3445 			continue;
3446 		if (!fib6_info_hold_safe(rt))
3447 			continue;
3448 		break;
3449 	}
3450 out:
3451 	rcu_read_unlock();
3452 	return rt;
3453 }
3454 
3455 static struct fib6_info *rt6_add_route_info(struct net *net,
3456 					   const struct in6_addr *prefix, int prefixlen,
3457 					   const struct in6_addr *gwaddr,
3458 					   struct net_device *dev,
3459 					   unsigned int pref)
3460 {
3461 	struct fib6_config cfg = {
3462 		.fc_metric	= IP6_RT_PRIO_USER,
3463 		.fc_ifindex	= dev->ifindex,
3464 		.fc_dst_len	= prefixlen,
3465 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3466 				  RTF_UP | RTF_PREF(pref),
3467 		.fc_protocol = RTPROT_RA,
3468 		.fc_type = RTN_UNICAST,
3469 		.fc_nlinfo.portid = 0,
3470 		.fc_nlinfo.nlh = NULL,
3471 		.fc_nlinfo.nl_net = net,
3472 	};
3473 
3474 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3475 	cfg.fc_dst = *prefix;
3476 	cfg.fc_gateway = *gwaddr;
3477 
3478 	/* We should treat it as a default route if prefix length is 0. */
3479 	if (!prefixlen)
3480 		cfg.fc_flags |= RTF_DEFAULT;
3481 
3482 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3483 
3484 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3485 }
3486 #endif
3487 
3488 struct fib6_info *rt6_get_dflt_router(struct net *net,
3489 				     const struct in6_addr *addr,
3490 				     struct net_device *dev)
3491 {
3492 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3493 	struct fib6_info *rt;
3494 	struct fib6_table *table;
3495 
3496 	table = fib6_get_table(net, tb_id);
3497 	if (!table)
3498 		return NULL;
3499 
3500 	rcu_read_lock();
3501 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3502 		if (dev == rt->fib6_nh.nh_dev &&
3503 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3504 		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3505 			break;
3506 	}
3507 	if (rt && !fib6_info_hold_safe(rt))
3508 		rt = NULL;
3509 	rcu_read_unlock();
3510 	return rt;
3511 }
3512 
3513 struct fib6_info *rt6_add_dflt_router(struct net *net,
3514 				     const struct in6_addr *gwaddr,
3515 				     struct net_device *dev,
3516 				     unsigned int pref)
3517 {
3518 	struct fib6_config cfg = {
3519 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3520 		.fc_metric	= IP6_RT_PRIO_USER,
3521 		.fc_ifindex	= dev->ifindex,
3522 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3523 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3524 		.fc_protocol = RTPROT_RA,
3525 		.fc_type = RTN_UNICAST,
3526 		.fc_nlinfo.portid = 0,
3527 		.fc_nlinfo.nlh = NULL,
3528 		.fc_nlinfo.nl_net = net,
3529 	};
3530 
3531 	cfg.fc_gateway = *gwaddr;
3532 
3533 	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3534 		struct fib6_table *table;
3535 
3536 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3537 		if (table)
3538 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3539 	}
3540 
3541 	return rt6_get_dflt_router(net, gwaddr, dev);
3542 }
3543 
3544 static void __rt6_purge_dflt_routers(struct net *net,
3545 				     struct fib6_table *table)
3546 {
3547 	struct fib6_info *rt;
3548 
3549 restart:
3550 	rcu_read_lock();
3551 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3552 		struct net_device *dev = fib6_info_nh_dev(rt);
3553 		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3554 
3555 		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3556 		    (!idev || idev->cnf.accept_ra != 2) &&
3557 		    fib6_info_hold_safe(rt)) {
3558 			rcu_read_unlock();
3559 			ip6_del_rt(net, rt);
3560 			goto restart;
3561 		}
3562 	}
3563 	rcu_read_unlock();
3564 
3565 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3566 }
3567 
3568 void rt6_purge_dflt_routers(struct net *net)
3569 {
3570 	struct fib6_table *table;
3571 	struct hlist_head *head;
3572 	unsigned int h;
3573 
3574 	rcu_read_lock();
3575 
3576 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3577 		head = &net->ipv6.fib_table_hash[h];
3578 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3579 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3580 				__rt6_purge_dflt_routers(net, table);
3581 		}
3582 	}
3583 
3584 	rcu_read_unlock();
3585 }
3586 
3587 static void rtmsg_to_fib6_config(struct net *net,
3588 				 struct in6_rtmsg *rtmsg,
3589 				 struct fib6_config *cfg)
3590 {
3591 	*cfg = (struct fib6_config){
3592 		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3593 			 : RT6_TABLE_MAIN,
3594 		.fc_ifindex = rtmsg->rtmsg_ifindex,
3595 		.fc_metric = rtmsg->rtmsg_metric,
3596 		.fc_expires = rtmsg->rtmsg_info,
3597 		.fc_dst_len = rtmsg->rtmsg_dst_len,
3598 		.fc_src_len = rtmsg->rtmsg_src_len,
3599 		.fc_flags = rtmsg->rtmsg_flags,
3600 		.fc_type = rtmsg->rtmsg_type,
3601 
3602 		.fc_nlinfo.nl_net = net,
3603 
3604 		.fc_dst = rtmsg->rtmsg_dst,
3605 		.fc_src = rtmsg->rtmsg_src,
3606 		.fc_gateway = rtmsg->rtmsg_gateway,
3607 	};
3608 }
3609 
3610 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3611 {
3612 	struct fib6_config cfg;
3613 	struct in6_rtmsg rtmsg;
3614 	int err;
3615 
3616 	switch (cmd) {
3617 	case SIOCADDRT:		/* Add a route */
3618 	case SIOCDELRT:		/* Delete a route */
3619 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3620 			return -EPERM;
3621 		err = copy_from_user(&rtmsg, arg,
3622 				     sizeof(struct in6_rtmsg));
3623 		if (err)
3624 			return -EFAULT;
3625 
3626 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3627 
3628 		rtnl_lock();
3629 		switch (cmd) {
3630 		case SIOCADDRT:
3631 			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3632 			break;
3633 		case SIOCDELRT:
3634 			err = ip6_route_del(&cfg, NULL);
3635 			break;
3636 		default:
3637 			err = -EINVAL;
3638 		}
3639 		rtnl_unlock();
3640 
3641 		return err;
3642 	}
3643 
3644 	return -EINVAL;
3645 }
3646 
3647 /*
3648  *	Drop the packet on the floor
3649  */
3650 
3651 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3652 {
3653 	int type;
3654 	struct dst_entry *dst = skb_dst(skb);
3655 	switch (ipstats_mib_noroutes) {
3656 	case IPSTATS_MIB_INNOROUTES:
3657 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3658 		if (type == IPV6_ADDR_ANY) {
3659 			IP6_INC_STATS(dev_net(dst->dev),
3660 				      __in6_dev_get_safely(skb->dev),
3661 				      IPSTATS_MIB_INADDRERRORS);
3662 			break;
3663 		}
3664 		/* FALLTHROUGH */
3665 	case IPSTATS_MIB_OUTNOROUTES:
3666 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3667 			      ipstats_mib_noroutes);
3668 		break;
3669 	}
3670 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3671 	kfree_skb(skb);
3672 	return 0;
3673 }
3674 
3675 static int ip6_pkt_discard(struct sk_buff *skb)
3676 {
3677 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3678 }
3679 
3680 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3681 {
3682 	skb->dev = skb_dst(skb)->dev;
3683 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3684 }
3685 
3686 static int ip6_pkt_prohibit(struct sk_buff *skb)
3687 {
3688 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3689 }
3690 
3691 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3692 {
3693 	skb->dev = skb_dst(skb)->dev;
3694 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3695 }
3696 
3697 /*
3698  *	Allocate a dst for a local (unicast / anycast) address.
3699  */
3700 
3701 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3702 				     struct inet6_dev *idev,
3703 				     const struct in6_addr *addr,
3704 				     bool anycast, gfp_t gfp_flags)
3705 {
3706 	u32 tb_id;
3707 	struct net_device *dev = idev->dev;
3708 	struct fib6_info *f6i;
3709 
3710 	f6i = fib6_info_alloc(gfp_flags);
3711 	if (!f6i)
3712 		return ERR_PTR(-ENOMEM);
3713 
3714 	f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
3715 	f6i->dst_nocount = true;
3716 	f6i->dst_host = true;
3717 	f6i->fib6_protocol = RTPROT_KERNEL;
3718 	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3719 	if (anycast) {
3720 		f6i->fib6_type = RTN_ANYCAST;
3721 		f6i->fib6_flags |= RTF_ANYCAST;
3722 	} else {
3723 		f6i->fib6_type = RTN_LOCAL;
3724 		f6i->fib6_flags |= RTF_LOCAL;
3725 	}
3726 
3727 	f6i->fib6_nh.nh_gw = *addr;
3728 	dev_hold(dev);
3729 	f6i->fib6_nh.nh_dev = dev;
3730 	f6i->fib6_dst.addr = *addr;
3731 	f6i->fib6_dst.plen = 128;
3732 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3733 	f6i->fib6_table = fib6_get_table(net, tb_id);
3734 
3735 	return f6i;
3736 }
3737 
3738 /* remove a deleted IP address from prefsrc entries */
3739 struct arg_dev_net_ip {
3740 	struct net_device *dev;
3741 	struct net *net;
3742 	struct in6_addr *addr;
3743 };
3744 
3745 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3746 {
3747 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3748 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3749 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3750 
3751 	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3752 	    rt != net->ipv6.fib6_null_entry &&
3753 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3754 		spin_lock_bh(&rt6_exception_lock);
3755 		/* remove prefsrc entry */
3756 		rt->fib6_prefsrc.plen = 0;
3757 		spin_unlock_bh(&rt6_exception_lock);
3758 	}
3759 	return 0;
3760 }
3761 
3762 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3763 {
3764 	struct net *net = dev_net(ifp->idev->dev);
3765 	struct arg_dev_net_ip adni = {
3766 		.dev = ifp->idev->dev,
3767 		.net = net,
3768 		.addr = &ifp->addr,
3769 	};
3770 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3771 }
3772 
3773 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3774 
3775 /* Remove routers and update dst entries when a gateway turns into a host. */
3776 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3777 {
3778 	struct in6_addr *gateway = (struct in6_addr *)arg;
3779 
3780 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3781 	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3782 		return -1;
3783 	}
3784 
3785 	/* Further clean up cached routes in exception table.
3786 	 * This is needed because a cached route may have a different
3787 	 * gateway than its 'parent' in the case of an ip redirect.
3788 	 */
3789 	rt6_exceptions_clean_tohost(rt, gateway);
3790 
3791 	return 0;
3792 }
3793 
3794 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3795 {
3796 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3797 }
3798 
3799 struct arg_netdev_event {
3800 	const struct net_device *dev;
3801 	union {
3802 		unsigned int nh_flags;
3803 		unsigned long event;
3804 	};
3805 };
3806 
3807 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3808 {
3809 	struct fib6_info *iter;
3810 	struct fib6_node *fn;
3811 
3812 	fn = rcu_dereference_protected(rt->fib6_node,
3813 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3814 	iter = rcu_dereference_protected(fn->leaf,
3815 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3816 	while (iter) {
3817 		if (iter->fib6_metric == rt->fib6_metric &&
3818 		    rt6_qualify_for_ecmp(iter))
3819 			return iter;
3820 		iter = rcu_dereference_protected(iter->fib6_next,
3821 				lockdep_is_held(&rt->fib6_table->tb6_lock));
3822 	}
3823 
3824 	return NULL;
3825 }
3826 
3827 static bool rt6_is_dead(const struct fib6_info *rt)
3828 {
3829 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3830 	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3831 	     fib6_ignore_linkdown(rt)))
3832 		return true;
3833 
3834 	return false;
3835 }
3836 
3837 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3838 {
3839 	struct fib6_info *iter;
3840 	int total = 0;
3841 
3842 	if (!rt6_is_dead(rt))
3843 		total += rt->fib6_nh.nh_weight;
3844 
3845 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3846 		if (!rt6_is_dead(iter))
3847 			total += iter->fib6_nh.nh_weight;
3848 	}
3849 
3850 	return total;
3851 }
3852 
3853 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3854 {
3855 	int upper_bound = -1;
3856 
3857 	if (!rt6_is_dead(rt)) {
3858 		*weight += rt->fib6_nh.nh_weight;
3859 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3860 						    total) - 1;
3861 	}
3862 	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3863 }
3864 
3865 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3866 {
3867 	struct fib6_info *iter;
3868 	int weight = 0;
3869 
3870 	rt6_upper_bound_set(rt, &weight, total);
3871 
3872 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3873 		rt6_upper_bound_set(iter, &weight, total);
3874 }
3875 
3876 void rt6_multipath_rebalance(struct fib6_info *rt)
3877 {
3878 	struct fib6_info *first;
3879 	int total;
3880 
3881 	/* If the entire multipath route was marked for flushing, there
3882 	 * is no need to rebalance upon the removal of each sibling
3883 	 * route.
3884 	 */
3885 	if (!rt->fib6_nsiblings || rt->should_flush)
3886 		return;
3887 
3888 	/* During lookup routes are evaluated in order, so we need to
3889 	 * make sure upper bounds are assigned from the first sibling
3890 	 * onwards.
3891 	 */
3892 	first = rt6_multipath_first_sibling(rt);
3893 	if (WARN_ON_ONCE(!first))
3894 		return;
3895 
3896 	total = rt6_multipath_total_weight(first);
3897 	rt6_multipath_upper_bound_set(first, total);
3898 }
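/* A minimal userspace sketch (illustrative, not kernel code) of the
 * hash-threshold math used by rt6_upper_bound_set() above: each live
 * nexthop's upper bound carves out a share of the 31-bit hash space
 * proportional to its weight.  The weights {1, 2, 1} are hypothetical.
 *
 *	#include <stdio.h>
 *	#include <stdint.h>
 *
 *	int main(void)
 *	{
 *		int weights[] = { 1, 2, 1 };
 *		int total = 4, running = 0;
 *
 *		for (int i = 0; i < 3; i++) {
 *			running += weights[i];
 *			// same as DIV_ROUND_CLOSEST_ULL((u64)running << 31, total) - 1
 *			int64_t bound = ((((uint64_t)running << 31) + total / 2) / total) - 1;
 *			printf("nh%d upper_bound = %lld\n", i, (long long)bound);
 *		}
 *		return 0;
 *	}
 *
 * A flow hash h then selects the first nexthop whose upper bound is >= h,
 * so a dead nexthop (bound -1) is never chosen.
 */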
3899 
3900 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3901 {
3902 	const struct arg_netdev_event *arg = p_arg;
3903 	struct net *net = dev_net(arg->dev);
3904 
3905 	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3906 		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3907 		fib6_update_sernum_upto_root(net, rt);
3908 		rt6_multipath_rebalance(rt);
3909 	}
3910 
3911 	return 0;
3912 }
3913 
3914 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3915 {
3916 	struct arg_netdev_event arg = {
3917 		.dev = dev,
3918 		{
3919 			.nh_flags = nh_flags,
3920 		},
3921 	};
3922 
3923 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3924 		arg.nh_flags |= RTNH_F_LINKDOWN;
3925 
3926 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3927 }
3928 
3929 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3930 				   const struct net_device *dev)
3931 {
3932 	struct fib6_info *iter;
3933 
3934 	if (rt->fib6_nh.nh_dev == dev)
3935 		return true;
3936 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3937 		if (iter->fib6_nh.nh_dev == dev)
3938 			return true;
3939 
3940 	return false;
3941 }
3942 
3943 static void rt6_multipath_flush(struct fib6_info *rt)
3944 {
3945 	struct fib6_info *iter;
3946 
3947 	rt->should_flush = 1;
3948 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3949 		iter->should_flush = 1;
3950 }
3951 
3952 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3953 					     const struct net_device *down_dev)
3954 {
3955 	struct fib6_info *iter;
3956 	unsigned int dead = 0;
3957 
3958 	if (rt->fib6_nh.nh_dev == down_dev ||
3959 	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3960 		dead++;
3961 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3962 		if (iter->fib6_nh.nh_dev == down_dev ||
3963 		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3964 			dead++;
3965 
3966 	return dead;
3967 }
3968 
3969 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3970 				       const struct net_device *dev,
3971 				       unsigned int nh_flags)
3972 {
3973 	struct fib6_info *iter;
3974 
3975 	if (rt->fib6_nh.nh_dev == dev)
3976 		rt->fib6_nh.nh_flags |= nh_flags;
3977 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3978 		if (iter->fib6_nh.nh_dev == dev)
3979 			iter->fib6_nh.nh_flags |= nh_flags;
3980 }
3981 
3982 /* called with write lock held for table with rt */
3983 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3984 {
3985 	const struct arg_netdev_event *arg = p_arg;
3986 	const struct net_device *dev = arg->dev;
3987 	struct net *net = dev_net(dev);
3988 
3989 	if (rt == net->ipv6.fib6_null_entry)
3990 		return 0;
3991 
3992 	switch (arg->event) {
3993 	case NETDEV_UNREGISTER:
3994 		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3995 	case NETDEV_DOWN:
3996 		if (rt->should_flush)
3997 			return -1;
3998 		if (!rt->fib6_nsiblings)
3999 			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4000 		if (rt6_multipath_uses_dev(rt, dev)) {
4001 			unsigned int count;
4002 
4003 			count = rt6_multipath_dead_count(rt, dev);
4004 			if (rt->fib6_nsiblings + 1 == count) {
4005 				rt6_multipath_flush(rt);
4006 				return -1;
4007 			}
4008 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4009 						   RTNH_F_LINKDOWN);
4010 			fib6_update_sernum(net, rt);
4011 			rt6_multipath_rebalance(rt);
4012 		}
4013 		return -2;
4014 	case NETDEV_CHANGE:
4015 		if (rt->fib6_nh.nh_dev != dev ||
4016 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4017 			break;
4018 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4019 		rt6_multipath_rebalance(rt);
4020 		break;
4021 	}
4022 
4023 	return 0;
4024 }
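/* Best-effort note on the return values above (the contract lives in the
 * fib6 tree walker, not in this file): -1 asks the walker to delete the
 * entry, 0 keeps it, and -2 is the multipath case where the flags have
 * already been set on all siblings, telling the walker to skip past the
 * remaining siblings.
 */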
4025 
4026 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4027 {
4028 	struct arg_netdev_event arg = {
4029 		.dev = dev,
4030 		{
4031 			.event = event,
4032 		},
4033 	};
4034 	struct net *net = dev_net(dev);
4035 
4036 	if (net->ipv6.sysctl.skip_notify_on_dev_down)
4037 		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4038 	else
4039 		fib6_clean_all(net, fib6_ifdown, &arg);
4040 }
4041 
4042 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4043 {
4044 	rt6_sync_down_dev(dev, event);
4045 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
4046 	neigh_ifdown(&nd_tbl, dev);
4047 }
4048 
4049 struct rt6_mtu_change_arg {
4050 	struct net_device *dev;
4051 	unsigned int mtu;
4052 };
4053 
4054 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4055 {
4056 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4057 	struct inet6_dev *idev;
4058 
4059 	/* In IPv6, PMTU discovery is not optional, so an
4060 	   RTAX_MTU lock cannot disable it.  We still use
4061 	   this lock to block changes caused by
4062 	   addrconf/ndisc.
4063 	*/
4064 
4065 	idev = __in6_dev_get(arg->dev);
4066 	if (!idev)
4067 		return 0;
4068 
4069 	/* After an administrative MTU increase there is no way to
4070 	   discover a larger IPv6 PMTU, so the PMTU must be raised here.
4071 	   Since RFC 1981 doesn't cover administrative MTU increases,
4072 	   updating the PMTU on increase is a MUST (e.g. jumbo frames).
4073 	 */
4074 	if (rt->fib6_nh.nh_dev == arg->dev &&
4075 	    !fib6_metric_locked(rt, RTAX_MTU)) {
4076 		u32 mtu = rt->fib6_pmtu;
4077 
4078 		if (mtu >= arg->mtu ||
4079 		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4080 			fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4081 
4082 		spin_lock_bh(&rt6_exception_lock);
4083 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4084 		spin_unlock_bh(&rt6_exception_lock);
4085 	}
4086 	return 0;
4087 }
4088 
4089 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4090 {
4091 	struct rt6_mtu_change_arg arg = {
4092 		.dev = dev,
4093 		.mtu = mtu,
4094 	};
4095 
4096 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4097 }
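/* Illustrative trigger (iproute2; the device name is hypothetical):
 *
 *	ip link set dev eth0 mtu 1400
 *
 * The MTU change propagates through addrconf's netdev notifier and ends up
 * here, so every route using the device is revisited by
 * rt6_mtu_change_route().
 */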
4098 
4099 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4100 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4101 	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
4102 	[RTA_OIF]               = { .type = NLA_U32 },
4103 	[RTA_IIF]		= { .type = NLA_U32 },
4104 	[RTA_PRIORITY]          = { .type = NLA_U32 },
4105 	[RTA_METRICS]           = { .type = NLA_NESTED },
4106 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
4107 	[RTA_PREF]              = { .type = NLA_U8 },
4108 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
4109 	[RTA_ENCAP]		= { .type = NLA_NESTED },
4110 	[RTA_EXPIRES]		= { .type = NLA_U32 },
4111 	[RTA_UID]		= { .type = NLA_U32 },
4112 	[RTA_MARK]		= { .type = NLA_U32 },
4113 	[RTA_TABLE]		= { .type = NLA_U32 },
4114 	[RTA_IP_PROTO]		= { .type = NLA_U8 },
4115 	[RTA_SPORT]		= { .type = NLA_U16 },
4116 	[RTA_DPORT]		= { .type = NLA_U16 },
4117 };
4118 
4119 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4120 			      struct fib6_config *cfg,
4121 			      struct netlink_ext_ack *extack)
4122 {
4123 	struct rtmsg *rtm;
4124 	struct nlattr *tb[RTA_MAX+1];
4125 	unsigned int pref;
4126 	int err;
4127 
4128 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4129 			  extack);
4130 	if (err < 0)
4131 		goto errout;
4132 
4133 	err = -EINVAL;
4134 	rtm = nlmsg_data(nlh);
4135 
4136 	*cfg = (struct fib6_config){
4137 		.fc_table = rtm->rtm_table,
4138 		.fc_dst_len = rtm->rtm_dst_len,
4139 		.fc_src_len = rtm->rtm_src_len,
4140 		.fc_flags = RTF_UP,
4141 		.fc_protocol = rtm->rtm_protocol,
4142 		.fc_type = rtm->rtm_type,
4143 
4144 		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
4145 		.fc_nlinfo.nlh = nlh,
4146 		.fc_nlinfo.nl_net = sock_net(skb->sk),
4147 	};
4148 
4149 	if (rtm->rtm_type == RTN_UNREACHABLE ||
4150 	    rtm->rtm_type == RTN_BLACKHOLE ||
4151 	    rtm->rtm_type == RTN_PROHIBIT ||
4152 	    rtm->rtm_type == RTN_THROW)
4153 		cfg->fc_flags |= RTF_REJECT;
4154 
4155 	if (rtm->rtm_type == RTN_LOCAL)
4156 		cfg->fc_flags |= RTF_LOCAL;
4157 
4158 	if (rtm->rtm_flags & RTM_F_CLONED)
4159 		cfg->fc_flags |= RTF_CACHE;
4160 
4161 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4162 
4163 	if (tb[RTA_GATEWAY]) {
4164 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4165 		cfg->fc_flags |= RTF_GATEWAY;
4166 	}
4167 
4168 	if (tb[RTA_DST]) {
4169 		int plen = (rtm->rtm_dst_len + 7) >> 3;
4170 
4171 		if (nla_len(tb[RTA_DST]) < plen)
4172 			goto errout;
4173 
4174 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4175 	}
4176 
4177 	if (tb[RTA_SRC]) {
4178 		int plen = (rtm->rtm_src_len + 7) >> 3;
4179 
4180 		if (nla_len(tb[RTA_SRC]) < plen)
4181 			goto errout;
4182 
4183 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4184 	}
4185 
4186 	if (tb[RTA_PREFSRC])
4187 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4188 
4189 	if (tb[RTA_OIF])
4190 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4191 
4192 	if (tb[RTA_PRIORITY])
4193 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4194 
4195 	if (tb[RTA_METRICS]) {
4196 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4197 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4198 	}
4199 
4200 	if (tb[RTA_TABLE])
4201 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4202 
4203 	if (tb[RTA_MULTIPATH]) {
4204 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4205 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4206 
4207 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4208 						     cfg->fc_mp_len, extack);
4209 		if (err < 0)
4210 			goto errout;
4211 	}
4212 
4213 	if (tb[RTA_PREF]) {
4214 		pref = nla_get_u8(tb[RTA_PREF]);
4215 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
4216 		    pref != ICMPV6_ROUTER_PREF_HIGH)
4217 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
4218 		cfg->fc_flags |= RTF_PREF(pref);
4219 	}
4220 
4221 	if (tb[RTA_ENCAP])
4222 		cfg->fc_encap = tb[RTA_ENCAP];
4223 
4224 	if (tb[RTA_ENCAP_TYPE]) {
4225 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4226 
4227 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4228 		if (err < 0)
4229 			goto errout;
4230 	}
4231 
4232 	if (tb[RTA_EXPIRES]) {
4233 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4234 
4235 		if (addrconf_finite_timeout(timeout)) {
4236 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4237 			cfg->fc_flags |= RTF_EXPIRES;
4238 		}
4239 	}
4240 
4241 	err = 0;
4242 errout:
4243 	return err;
4244 }
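/* A hedged userspace sketch of the message layout this parser consumes: an
 * RTM_NEWROUTE request carrying RTA_DST, RTA_OIF and RTA_PRIORITY.  The
 * prefix, ifindex and metric are illustrative values, and the socket I/O
 * that would send the request is omitted.
 *
 *	#include <string.h>
 *	#include <arpa/inet.h>
 *	#include <linux/rtnetlink.h>
 *
 *	struct route_req {
 *		struct nlmsghdr nlh;
 *		struct rtmsg rtm;
 *		char attrs[256];
 *	};
 *
 *	static void put_attr(struct nlmsghdr *nlh, unsigned short type,
 *			     const void *data, unsigned short len)
 *	{
 *		struct rtattr *rta = (struct rtattr *)((char *)nlh +
 *					NLMSG_ALIGN(nlh->nlmsg_len));
 *
 *		rta->rta_type = type;
 *		rta->rta_len = RTA_LENGTH(len);
 *		memcpy(RTA_DATA(rta), data, len);
 *		nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) +
 *				 RTA_ALIGN(rta->rta_len);
 *	}
 *
 *	static void build_newroute(struct route_req *req)
 *	{
 *		struct in6_addr dst;
 *		unsigned int oif = 2, metric = 1024;
 *
 *		memset(req, 0, sizeof(*req));
 *		req->nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
 *		req->nlh.nlmsg_type = RTM_NEWROUTE;
 *		req->nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
 *		req->rtm.rtm_family = AF_INET6;
 *		req->rtm.rtm_dst_len = 64;
 *		req->rtm.rtm_table = RT_TABLE_MAIN;
 *		req->rtm.rtm_protocol = RTPROT_STATIC;
 *		req->rtm.rtm_type = RTN_UNICAST;
 *
 *		inet_pton(AF_INET6, "2001:db8::", &dst);
 *		put_attr(&req->nlh, RTA_DST, &dst, sizeof(dst));
 *		put_attr(&req->nlh, RTA_OIF, &oif, sizeof(oif));
 *		put_attr(&req->nlh, RTA_PRIORITY, &metric, sizeof(metric));
 *	}
 *
 * nlmsg_parse() above then splits the attributes into tb[] and validates
 * them against rtm_ipv6_policy.
 */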
4245 
4246 struct rt6_nh {
4247 	struct fib6_info *fib6_info;
4248 	struct fib6_config r_cfg;
4249 	struct list_head next;
4250 };
4251 
4252 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4253 {
4254 	struct rt6_nh *nh;
4255 
4256 	list_for_each_entry(nh, rt6_nh_list, next) {
4257 		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4258 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4259 		        nh->r_cfg.fc_ifindex);
4260 	}
4261 }
4262 
4263 static int ip6_route_info_append(struct net *net,
4264 				 struct list_head *rt6_nh_list,
4265 				 struct fib6_info *rt,
4266 				 struct fib6_config *r_cfg)
4267 {
4268 	struct rt6_nh *nh;
4269 	int err = -EEXIST;
4270 
4271 	list_for_each_entry(nh, rt6_nh_list, next) {
4272 		/* check if fib6_info already exists */
4273 		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4274 			return err;
4275 	}
4276 
4277 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4278 	if (!nh)
4279 		return -ENOMEM;
4280 	nh->fib6_info = rt;
4281 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4282 	list_add_tail(&nh->next, rt6_nh_list);
4283 
4284 	return 0;
4285 }
4286 
4287 static void ip6_route_mpath_notify(struct fib6_info *rt,
4288 				   struct fib6_info *rt_last,
4289 				   struct nl_info *info,
4290 				   __u16 nlflags)
4291 {
4292 	/* If this is an APPEND route, then rt points to the first route
4293 	 * inserted and rt_last points to the last route inserted. Userspace
4294 	 * wants a consistent dump of the route which starts at the first
4295 	 * nexthop. Since sibling routes are always added at the end of
4296 	 * the list, find the first sibling of the last route appended.
4297 	 */
4298 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4299 		rt = list_first_entry(&rt_last->fib6_siblings,
4300 				      struct fib6_info,
4301 				      fib6_siblings);
4302 	}
4303 
4304 	if (rt)
4305 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4306 }
4307 
4308 static int ip6_route_multipath_add(struct fib6_config *cfg,
4309 				   struct netlink_ext_ack *extack)
4310 {
4311 	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4312 	struct nl_info *info = &cfg->fc_nlinfo;
4313 	struct fib6_config r_cfg;
4314 	struct rtnexthop *rtnh;
4315 	struct fib6_info *rt;
4316 	struct rt6_nh *err_nh;
4317 	struct rt6_nh *nh, *nh_safe;
4318 	__u16 nlflags;
4319 	int remaining;
4320 	int attrlen;
4321 	int err = 1;
4322 	int nhn = 0;
4323 	int replace = (cfg->fc_nlinfo.nlh &&
4324 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4325 	LIST_HEAD(rt6_nh_list);
4326 
4327 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4328 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4329 		nlflags |= NLM_F_APPEND;
4330 
4331 	remaining = cfg->fc_mp_len;
4332 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4333 
4334 	/* Parse the multipath entries and build a list (rt6_nh_list)
4335 	 * with one fib6_info struct per nexthop.
4336 	 */
4337 	while (rtnh_ok(rtnh, remaining)) {
4338 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4339 		if (rtnh->rtnh_ifindex)
4340 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4341 
4342 		attrlen = rtnh_attrlen(rtnh);
4343 		if (attrlen > 0) {
4344 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4345 
4346 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4347 			if (nla) {
4348 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4349 				r_cfg.fc_flags |= RTF_GATEWAY;
4350 			}
4351 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4352 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4353 			if (nla)
4354 				r_cfg.fc_encap_type = nla_get_u16(nla);
4355 		}
4356 
4357 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4358 		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4359 		if (IS_ERR(rt)) {
4360 			err = PTR_ERR(rt);
4361 			rt = NULL;
4362 			goto cleanup;
4363 		}
4364 		if (!rt6_qualify_for_ecmp(rt)) {
4365 			err = -EINVAL;
4366 			NL_SET_ERR_MSG(extack,
4367 				       "Device-only routes cannot be added for IPv6 using the multipath API.");
4368 			fib6_info_release(rt);
4369 			goto cleanup;
4370 		}
4371 
4372 		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4373 
4374 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4375 					    rt, &r_cfg);
4376 		if (err) {
4377 			fib6_info_release(rt);
4378 			goto cleanup;
4379 		}
4380 
4381 		rtnh = rtnh_next(rtnh, &remaining);
4382 	}
4383 
4384 	/* For add and replace, send one notification with all nexthops:
4385 	 * skip the notification in fib6_add_rt2node() and send one with
4386 	 * the full route when done.
4387 	 */
4388 	info->skip_notify = 1;
4389 
4390 	err_nh = NULL;
4391 	list_for_each_entry(nh, &rt6_nh_list, next) {
4392 		err = __ip6_ins_rt(nh->fib6_info, info, extack);
4393 		fib6_info_release(nh->fib6_info);
4394 
4395 		if (!err) {
4396 			/* save reference to last route successfully inserted */
4397 			rt_last = nh->fib6_info;
4398 
4399 			/* save reference to first route for notification */
4400 			if (!rt_notif)
4401 				rt_notif = nh->fib6_info;
4402 		}
4403 
4404 		/* nh->fib6_info is used or freed at this point, reset to NULL */
4405 		nh->fib6_info = NULL;
4406 		if (err) {
4407 			if (replace && nhn)
4408 				ip6_print_replace_route_err(&rt6_nh_list);
4409 			err_nh = nh;
4410 			goto add_errout;
4411 		}
4412 
4413 		/* Because each nexthop is inserted as if it were a single
4414 		 * route, remove these flags after the first one: on a
4415 		 * collision we have already failed to add the first nexthop
4416 		 * (fib6_add_rt2node() rejected it), and when replacing, the
4417 		 * old nexthops have already been replaced by the first new
4418 		 * one, so the rest should simply be appended to it.
4419 		 */
4420 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4421 						     NLM_F_REPLACE);
4422 		nhn++;
4423 	}
4424 
4425 	/* success ... tell user about new route */
4426 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4427 	goto cleanup;
4428 
4429 add_errout:
4430 	/* send notification for routes that were added so that
4431 	 * the delete notifications sent by ip6_route_del are
4432 	 * coherent
4433 	 */
4434 	if (rt_notif)
4435 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4436 
4437 	/* Delete routes that were already added */
4438 	list_for_each_entry(nh, &rt6_nh_list, next) {
4439 		if (err_nh == nh)
4440 			break;
4441 		ip6_route_del(&nh->r_cfg, extack);
4442 	}
4443 
4444 cleanup:
4445 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4446 		if (nh->fib6_info)
4447 			fib6_info_release(nh->fib6_info);
4448 		list_del(&nh->next);
4449 		kfree(nh);
4450 	}
4451 
4452 	return err;
4453 }
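/* Illustrative iproute2 request (addresses and devices hypothetical) that
 * arrives here as an RTA_MULTIPATH attribute, one rtnexthop per "nexthop":
 *
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 weight 1 \
 *		nexthop via fe80::2 dev eth1 weight 2
 *
 * The weights land in rtnh_hops (sent on the wire as weight - 1, which is
 * why nh_weight above is set to rtnh_hops + 1).
 */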
4454 
4455 static int ip6_route_multipath_del(struct fib6_config *cfg,
4456 				   struct netlink_ext_ack *extack)
4457 {
4458 	struct fib6_config r_cfg;
4459 	struct rtnexthop *rtnh;
4460 	int remaining;
4461 	int attrlen;
4462 	int err = 1, last_err = 0;
4463 
4464 	remaining = cfg->fc_mp_len;
4465 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4466 
4467 	/* Parse the multipath entries */
4468 	while (rtnh_ok(rtnh, remaining)) {
4469 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4470 		if (rtnh->rtnh_ifindex)
4471 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4472 
4473 		attrlen = rtnh_attrlen(rtnh);
4474 		if (attrlen > 0) {
4475 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4476 
4477 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4478 			if (nla) {
4479 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4480 				r_cfg.fc_flags |= RTF_GATEWAY;
4481 			}
4482 		}
4483 		err = ip6_route_del(&r_cfg, extack);
4484 		if (err)
4485 			last_err = err;
4486 
4487 		rtnh = rtnh_next(rtnh, &remaining);
4488 	}
4489 
4490 	return last_err;
4491 }
4492 
4493 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4494 			      struct netlink_ext_ack *extack)
4495 {
4496 	struct fib6_config cfg;
4497 	int err;
4498 
4499 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4500 	if (err < 0)
4501 		return err;
4502 
4503 	if (cfg.fc_mp)
4504 		return ip6_route_multipath_del(&cfg, extack);
4505 	else {
4506 		cfg.fc_delete_all_nh = 1;
4507 		return ip6_route_del(&cfg, extack);
4508 	}
4509 }
4510 
4511 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4512 			      struct netlink_ext_ack *extack)
4513 {
4514 	struct fib6_config cfg;
4515 	int err;
4516 
4517 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4518 	if (err < 0)
4519 		return err;
4520 
4521 	if (cfg.fc_mp)
4522 		return ip6_route_multipath_add(&cfg, extack);
4523 	else
4524 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
4525 }
4526 
4527 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4528 {
4529 	int nexthop_len = 0;
4530 
4531 	if (rt->fib6_nsiblings) {
4532 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4533 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4534 			    + nla_total_size(16) /* RTA_GATEWAY */
4535 			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4536 
4537 		nexthop_len *= rt->fib6_nsiblings;
4538 	}
4539 
4540 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4541 	       + nla_total_size(16) /* RTA_SRC */
4542 	       + nla_total_size(16) /* RTA_DST */
4543 	       + nla_total_size(16) /* RTA_GATEWAY */
4544 	       + nla_total_size(16) /* RTA_PREFSRC */
4545 	       + nla_total_size(4) /* RTA_TABLE */
4546 	       + nla_total_size(4) /* RTA_IIF */
4547 	       + nla_total_size(4) /* RTA_OIF */
4548 	       + nla_total_size(4) /* RTA_PRIORITY */
4549 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4550 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4551 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4552 	       + nla_total_size(1) /* RTA_PREF */
4553 	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4554 	       + nexthop_len;
4555 }
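/* Note: this is a worst-case size estimate used to allocate the skb in
 * inet6_rt_notify(); overshooting only wastes a little memory, while
 * undershooting would surface as the -EMSGSIZE warning there.
 */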
4556 
4557 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4558 			    unsigned int *flags, bool skip_oif)
4559 {
4560 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4561 		*flags |= RTNH_F_DEAD;
4562 
4563 	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4564 		*flags |= RTNH_F_LINKDOWN;
4565 
4566 		rcu_read_lock();
4567 		if (fib6_ignore_linkdown(rt))
4568 			*flags |= RTNH_F_DEAD;
4569 		rcu_read_unlock();
4570 	}
4571 
4572 	if (rt->fib6_flags & RTF_GATEWAY) {
4573 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4574 			goto nla_put_failure;
4575 	}
4576 
4577 	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4578 	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4579 		*flags |= RTNH_F_OFFLOAD;
4580 
4581 	/* not needed for multipath encoding because it has an rtnexthop struct */
4582 	if (!skip_oif && rt->fib6_nh.nh_dev &&
4583 	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4584 		goto nla_put_failure;
4585 
4586 	if (rt->fib6_nh.nh_lwtstate &&
4587 	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4588 		goto nla_put_failure;
4589 
4590 	return 0;
4591 
4592 nla_put_failure:
4593 	return -EMSGSIZE;
4594 }
4595 
4596 /* add multipath next hop */
4597 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4598 {
4599 	const struct net_device *dev = rt->fib6_nh.nh_dev;
4600 	struct rtnexthop *rtnh;
4601 	unsigned int flags = 0;
4602 
4603 	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4604 	if (!rtnh)
4605 		goto nla_put_failure;
4606 
4607 	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4608 	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4609 
4610 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4611 		goto nla_put_failure;
4612 
4613 	rtnh->rtnh_flags = flags;
4614 
4615 	/* length of rtnetlink header + attributes */
4616 	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4617 
4618 	return 0;
4619 
4620 nla_put_failure:
4621 	return -EMSGSIZE;
4622 }
4623 
4624 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4625 			 struct fib6_info *rt, struct dst_entry *dst,
4626 			 struct in6_addr *dest, struct in6_addr *src,
4627 			 int iif, int type, u32 portid, u32 seq,
4628 			 unsigned int flags)
4629 {
4630 	struct rt6_info *rt6 = (struct rt6_info *)dst;
4631 	struct rt6key *rt6_dst, *rt6_src;
4632 	u32 *pmetrics, table, rt6_flags;
4633 	struct nlmsghdr *nlh;
4634 	struct rtmsg *rtm;
4635 	long expires = 0;
4636 
4637 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4638 	if (!nlh)
4639 		return -EMSGSIZE;
4640 
4641 	if (rt6) {
4642 		rt6_dst = &rt6->rt6i_dst;
4643 		rt6_src = &rt6->rt6i_src;
4644 		rt6_flags = rt6->rt6i_flags;
4645 	} else {
4646 		rt6_dst = &rt->fib6_dst;
4647 		rt6_src = &rt->fib6_src;
4648 		rt6_flags = rt->fib6_flags;
4649 	}
4650 
4651 	rtm = nlmsg_data(nlh);
4652 	rtm->rtm_family = AF_INET6;
4653 	rtm->rtm_dst_len = rt6_dst->plen;
4654 	rtm->rtm_src_len = rt6_src->plen;
4655 	rtm->rtm_tos = 0;
4656 	if (rt->fib6_table)
4657 		table = rt->fib6_table->tb6_id;
4658 	else
4659 		table = RT6_TABLE_UNSPEC;
4660 	rtm->rtm_table = table;
4661 	if (nla_put_u32(skb, RTA_TABLE, table))
4662 		goto nla_put_failure;
4663 
4664 	rtm->rtm_type = rt->fib6_type;
4665 	rtm->rtm_flags = 0;
4666 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4667 	rtm->rtm_protocol = rt->fib6_protocol;
4668 
4669 	if (rt6_flags & RTF_CACHE)
4670 		rtm->rtm_flags |= RTM_F_CLONED;
4671 
4672 	if (dest) {
4673 		if (nla_put_in6_addr(skb, RTA_DST, dest))
4674 			goto nla_put_failure;
4675 		rtm->rtm_dst_len = 128;
4676 	} else if (rtm->rtm_dst_len)
4677 		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4678 			goto nla_put_failure;
4679 #ifdef CONFIG_IPV6_SUBTREES
4680 	if (src) {
4681 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4682 			goto nla_put_failure;
4683 		rtm->rtm_src_len = 128;
4684 	} else if (rtm->rtm_src_len &&
4685 		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4686 		goto nla_put_failure;
4687 #endif
4688 	if (iif) {
4689 #ifdef CONFIG_IPV6_MROUTE
4690 		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4691 			int err = ip6mr_get_route(net, skb, rtm, portid);
4692 
4693 			if (err == 0)
4694 				return 0;
4695 			if (err < 0)
4696 				goto nla_put_failure;
4697 		} else
4698 #endif
4699 			if (nla_put_u32(skb, RTA_IIF, iif))
4700 				goto nla_put_failure;
4701 	} else if (dest) {
4702 		struct in6_addr saddr_buf;
4703 		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4704 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4705 			goto nla_put_failure;
4706 	}
4707 
4708 	if (rt->fib6_prefsrc.plen) {
4709 		struct in6_addr saddr_buf;
4710 		saddr_buf = rt->fib6_prefsrc.addr;
4711 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4712 			goto nla_put_failure;
4713 	}
4714 
4715 	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4716 	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4717 		goto nla_put_failure;
4718 
4719 	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4720 		goto nla_put_failure;
4721 
4722 	/* For multipath routes, walk the siblings list and add
4723 	 * each as a nexthop within RTA_MULTIPATH.
4724 	 */
4725 	if (rt6) {
4726 		if (rt6_flags & RTF_GATEWAY &&
4727 		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4728 			goto nla_put_failure;
4729 
4730 		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4731 			goto nla_put_failure;
4732 	} else if (rt->fib6_nsiblings) {
4733 		struct fib6_info *sibling, *next_sibling;
4734 		struct nlattr *mp;
4735 
4736 		mp = nla_nest_start(skb, RTA_MULTIPATH);
4737 		if (!mp)
4738 			goto nla_put_failure;
4739 
4740 		if (rt6_add_nexthop(skb, rt) < 0)
4741 			goto nla_put_failure;
4742 
4743 		list_for_each_entry_safe(sibling, next_sibling,
4744 					 &rt->fib6_siblings, fib6_siblings) {
4745 			if (rt6_add_nexthop(skb, sibling) < 0)
4746 				goto nla_put_failure;
4747 		}
4748 
4749 		nla_nest_end(skb, mp);
4750 	} else {
4751 		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4752 			goto nla_put_failure;
4753 	}
4754 
4755 	if (rt6_flags & RTF_EXPIRES) {
4756 		expires = dst ? dst->expires : rt->expires;
4757 		expires -= jiffies;
4758 	}
4759 
4760 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4761 		goto nla_put_failure;
4762 
4763 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4764 		goto nla_put_failure;
4765 
4767 	nlmsg_end(skb, nlh);
4768 	return 0;
4769 
4770 nla_put_failure:
4771 	nlmsg_cancel(skb, nlh);
4772 	return -EMSGSIZE;
4773 }
4774 
4775 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4776 			       const struct net_device *dev)
4777 {
4778 	if (f6i->fib6_nh.nh_dev == dev)
4779 		return true;
4780 
4781 	if (f6i->fib6_nsiblings) {
4782 		struct fib6_info *sibling, *next_sibling;
4783 
4784 		list_for_each_entry_safe(sibling, next_sibling,
4785 					 &f6i->fib6_siblings, fib6_siblings) {
4786 			if (sibling->fib6_nh.nh_dev == dev)
4787 				return true;
4788 		}
4789 	}
4790 
4791 	return false;
4792 }
4793 
4794 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4795 {
4796 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4797 	struct fib_dump_filter *filter = &arg->filter;
4798 	unsigned int flags = NLM_F_MULTI;
4799 	struct net *net = arg->net;
4800 
4801 	if (rt == net->ipv6.fib6_null_entry)
4802 		return 0;
4803 
4804 	if ((filter->flags & RTM_F_PREFIX) &&
4805 	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
4806 		/* success since this is not a prefix route */
4807 		return 1;
4808 	}
4809 	if (filter->filter_set) {
4810 		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4811 		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4812 		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4813 			return 1;
4814 		}
4815 		flags |= NLM_F_DUMP_FILTERED;
4816 	}
4817 
4818 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4819 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4820 			     arg->cb->nlh->nlmsg_seq, flags);
4821 }
4822 
4823 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4824 			      struct netlink_ext_ack *extack)
4825 {
4826 	struct net *net = sock_net(in_skb->sk);
4827 	struct nlattr *tb[RTA_MAX+1];
4828 	int err, iif = 0, oif = 0;
4829 	struct fib6_info *from;
4830 	struct dst_entry *dst;
4831 	struct rt6_info *rt;
4832 	struct sk_buff *skb;
4833 	struct rtmsg *rtm;
4834 	struct flowi6 fl6 = {};
4835 	bool fibmatch;
4836 
4837 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4838 			  extack);
4839 	if (err < 0)
4840 		goto errout;
4841 
4842 	err = -EINVAL;
4843 	rtm = nlmsg_data(nlh);
4844 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4845 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4846 
4847 	if (tb[RTA_SRC]) {
4848 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4849 			goto errout;
4850 
4851 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4852 	}
4853 
4854 	if (tb[RTA_DST]) {
4855 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4856 			goto errout;
4857 
4858 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4859 	}
4860 
4861 	if (tb[RTA_IIF])
4862 		iif = nla_get_u32(tb[RTA_IIF]);
4863 
4864 	if (tb[RTA_OIF])
4865 		oif = nla_get_u32(tb[RTA_OIF]);
4866 
4867 	if (tb[RTA_MARK])
4868 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4869 
4870 	if (tb[RTA_UID])
4871 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4872 					   nla_get_u32(tb[RTA_UID]));
4873 	else
4874 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4875 
4876 	if (tb[RTA_SPORT])
4877 		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4878 
4879 	if (tb[RTA_DPORT])
4880 		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4881 
4882 	if (tb[RTA_IP_PROTO]) {
4883 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4884 						  &fl6.flowi6_proto, extack);
4885 		if (err)
4886 			goto errout;
4887 	}
4888 
4889 	if (iif) {
4890 		struct net_device *dev;
4891 		int flags = 0;
4892 
4893 		rcu_read_lock();
4894 
4895 		dev = dev_get_by_index_rcu(net, iif);
4896 		if (!dev) {
4897 			rcu_read_unlock();
4898 			err = -ENODEV;
4899 			goto errout;
4900 		}
4901 
4902 		fl6.flowi6_iif = iif;
4903 
4904 		if (!ipv6_addr_any(&fl6.saddr))
4905 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4906 
4907 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4908 
4909 		rcu_read_unlock();
4910 	} else {
4911 		fl6.flowi6_oif = oif;
4912 
4913 		dst = ip6_route_output(net, NULL, &fl6);
4914 	}
4915 
4917 	rt = container_of(dst, struct rt6_info, dst);
4918 	if (rt->dst.error) {
4919 		err = rt->dst.error;
4920 		ip6_rt_put(rt);
4921 		goto errout;
4922 	}
4923 
4924 	if (rt == net->ipv6.ip6_null_entry) {
4925 		err = rt->dst.error;
4926 		ip6_rt_put(rt);
4927 		goto errout;
4928 	}
4929 
4930 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4931 	if (!skb) {
4932 		ip6_rt_put(rt);
4933 		err = -ENOBUFS;
4934 		goto errout;
4935 	}
4936 
4937 	skb_dst_set(skb, &rt->dst);
4938 
4939 	rcu_read_lock();
4940 	from = rcu_dereference(rt->from);
4941 
4942 	if (fibmatch)
4943 		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4944 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4945 				    nlh->nlmsg_seq, 0);
4946 	else
4947 		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4948 				    &fl6.saddr, iif, RTM_NEWROUTE,
4949 				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4950 				    0);
4951 	rcu_read_unlock();
4952 
4953 	if (err < 0) {
4954 		kfree_skb(skb);
4955 		goto errout;
4956 	}
4957 
4958 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4959 errout:
4960 	return err;
4961 }
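/* Illustrative queries (iproute2, hypothetical address) handled here:
 *
 *	ip -6 route get 2001:db8::1
 *	ip -6 route get fibmatch 2001:db8::1
 *
 * The second form sets RTM_F_FIB_MATCH, so the matched FIB entry is
 * returned instead of the resolved dst (the fibmatch branch above).
 */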
4962 
4963 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4964 		     unsigned int nlm_flags)
4965 {
4966 	struct sk_buff *skb;
4967 	struct net *net = info->nl_net;
4968 	u32 seq;
4969 	int err;
4970 
4971 	err = -ENOBUFS;
4972 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4973 
4974 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4975 	if (!skb)
4976 		goto errout;
4977 
4978 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4979 			    event, info->portid, seq, nlm_flags);
4980 	if (err < 0) {
4981 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4982 		WARN_ON(err == -EMSGSIZE);
4983 		kfree_skb(skb);
4984 		goto errout;
4985 	}
4986 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4987 		    info->nlh, gfp_any());
4988 	return;
4989 errout:
4990 	if (err < 0)
4991 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4992 }
4993 
4994 static int ip6_route_dev_notify(struct notifier_block *this,
4995 				unsigned long event, void *ptr)
4996 {
4997 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4998 	struct net *net = dev_net(dev);
4999 
5000 	if (!(dev->flags & IFF_LOOPBACK))
5001 		return NOTIFY_OK;
5002 
5003 	if (event == NETDEV_REGISTER) {
5004 		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5005 		net->ipv6.ip6_null_entry->dst.dev = dev;
5006 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5007 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5008 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5009 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5010 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5011 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5012 #endif
5013 	} else if (event == NETDEV_UNREGISTER &&
5014 		    dev->reg_state != NETREG_UNREGISTERED) {
5015 		/* NETDEV_UNREGISTER can be fired multiple times by
5016 		 * netdev_wait_allrefs(), so make sure we only do this once.
5017 		 */
5018 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5019 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5020 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5021 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5022 #endif
5023 	}
5024 
5025 	return NOTIFY_OK;
5026 }
5027 
5028 /*
5029  *	/proc
5030  */
5031 
5032 #ifdef CONFIG_PROC_FS
5033 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5034 {
5035 	struct net *net = (struct net *)seq->private;
5036 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5037 		   net->ipv6.rt6_stats->fib_nodes,
5038 		   net->ipv6.rt6_stats->fib_route_nodes,
5039 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5040 		   net->ipv6.rt6_stats->fib_rt_entries,
5041 		   net->ipv6.rt6_stats->fib_rt_cache,
5042 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5043 		   net->ipv6.rt6_stats->fib_discarded_routes);
5044 
5045 	return 0;
5046 }
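/* Illustrative output (values hypothetical); the seven %04x fields are, in
 * order: fib nodes, route nodes, route allocs, route entries, cached
 * routes, dst entries, discarded routes:
 *
 *	$ cat /proc/net/rt6_stats
 *	003c 0286 0a1b 0123 0002 0011 0009
 */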
5047 #endif	/* CONFIG_PROC_FS */
5048 
5049 #ifdef CONFIG_SYSCTL
5050 
5051 static
5052 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5053 			      void __user *buffer, size_t *lenp, loff_t *ppos)
5054 {
5055 	struct net *net;
5056 	int delay;
5057 	if (!write)
5058 		return -EINVAL;
5059 
5060 	net = (struct net *)ctl->extra1;
5061 	delay = net->ipv6.sysctl.flush_delay;
5062 	proc_dointvec(ctl, write, buffer, lenp, ppos);
5063 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5064 	return 0;
5065 }
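/* Usage sketch: the "flush" sysctl is write-only (mode 0200 below), and
 * writing any integer triggers fib6_run_gc() with the previously stored
 * delay, e.g.:
 *
 *	echo 1 > /proc/sys/net/ipv6/route/flush
 */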
5066 
5067 static int zero;
5068 static int one = 1;
5069 
5070 static struct ctl_table ipv6_route_table_template[] = {
5071 	{
5072 		.procname	=	"flush",
5073 		.data		=	&init_net.ipv6.sysctl.flush_delay,
5074 		.maxlen		=	sizeof(int),
5075 		.mode		=	0200,
5076 		.proc_handler	=	ipv6_sysctl_rtcache_flush
5077 	},
5078 	{
5079 		.procname	=	"gc_thresh",
5080 		.data		=	&ip6_dst_ops_template.gc_thresh,
5081 		.maxlen		=	sizeof(int),
5082 		.mode		=	0644,
5083 		.proc_handler	=	proc_dointvec,
5084 	},
5085 	{
5086 		.procname	=	"max_size",
5087 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
5088 		.maxlen		=	sizeof(int),
5089 		.mode		=	0644,
5090 		.proc_handler	=	proc_dointvec,
5091 	},
5092 	{
5093 		.procname	=	"gc_min_interval",
5094 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5095 		.maxlen		=	sizeof(int),
5096 		.mode		=	0644,
5097 		.proc_handler	=	proc_dointvec_jiffies,
5098 	},
5099 	{
5100 		.procname	=	"gc_timeout",
5101 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5102 		.maxlen		=	sizeof(int),
5103 		.mode		=	0644,
5104 		.proc_handler	=	proc_dointvec_jiffies,
5105 	},
5106 	{
5107 		.procname	=	"gc_interval",
5108 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
5109 		.maxlen		=	sizeof(int),
5110 		.mode		=	0644,
5111 		.proc_handler	=	proc_dointvec_jiffies,
5112 	},
5113 	{
5114 		.procname	=	"gc_elasticity",
5115 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5116 		.maxlen		=	sizeof(int),
5117 		.mode		=	0644,
5118 		.proc_handler	=	proc_dointvec,
5119 	},
5120 	{
5121 		.procname	=	"mtu_expires",
5122 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5123 		.maxlen		=	sizeof(int),
5124 		.mode		=	0644,
5125 		.proc_handler	=	proc_dointvec_jiffies,
5126 	},
5127 	{
5128 		.procname	=	"min_adv_mss",
5129 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
5130 		.maxlen		=	sizeof(int),
5131 		.mode		=	0644,
5132 		.proc_handler	=	proc_dointvec,
5133 	},
5134 	{
5135 		.procname	=	"gc_min_interval_ms",
5136 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5137 		.maxlen		=	sizeof(int),
5138 		.mode		=	0644,
5139 		.proc_handler	=	proc_dointvec_ms_jiffies,
5140 	},
5141 	{
5142 		.procname	=	"skip_notify_on_dev_down",
5143 		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
5144 		.maxlen		=	sizeof(int),
5145 		.mode		=	0644,
5146 		.proc_handler	=	proc_dointvec,
5147 		.extra1		=	&zero,
5148 		.extra2		=	&one,
5149 	},
5150 	{ }
5151 };
5152 
5153 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5154 {
5155 	struct ctl_table *table;
5156 
5157 	table = kmemdup(ipv6_route_table_template,
5158 			sizeof(ipv6_route_table_template),
5159 			GFP_KERNEL);
5160 
5161 	if (table) {
5162 		table[0].data = &net->ipv6.sysctl.flush_delay;
5163 		table[0].extra1 = net;
5164 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5165 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5166 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5167 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5168 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5169 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5170 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5171 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5172 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5173 		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5174 
5175 		/* Don't export sysctls to unprivileged users */
5176 		if (net->user_ns != &init_user_ns)
5177 			table[0].procname = NULL;
5178 	}
5179 
5180 	return table;
5181 }
5182 #endif
5183 
5184 static int __net_init ip6_route_net_init(struct net *net)
5185 {
5186 	int ret = -ENOMEM;
5187 
5188 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5189 	       sizeof(net->ipv6.ip6_dst_ops));
5190 
5191 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5192 		goto out_ip6_dst_ops;
5193 
5194 	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5195 					    sizeof(*net->ipv6.fib6_null_entry),
5196 					    GFP_KERNEL);
5197 	if (!net->ipv6.fib6_null_entry)
5198 		goto out_ip6_dst_entries;
5199 
5200 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5201 					   sizeof(*net->ipv6.ip6_null_entry),
5202 					   GFP_KERNEL);
5203 	if (!net->ipv6.ip6_null_entry)
5204 		goto out_fib6_null_entry;
5205 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5206 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5207 			 ip6_template_metrics, true);
5208 
5209 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5210 	net->ipv6.fib6_has_custom_rules = false;
5211 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5212 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5213 					       GFP_KERNEL);
5214 	if (!net->ipv6.ip6_prohibit_entry)
5215 		goto out_ip6_null_entry;
5216 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5217 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5218 			 ip6_template_metrics, true);
5219 
5220 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5221 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5222 					       GFP_KERNEL);
5223 	if (!net->ipv6.ip6_blk_hole_entry)
5224 		goto out_ip6_prohibit_entry;
5225 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5226 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5227 			 ip6_template_metrics, true);
5228 #endif
5229 
5230 	net->ipv6.sysctl.flush_delay = 0;
5231 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5232 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5233 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5234 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5235 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5236 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5237 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5238 	net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5239 
5240 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5241 
5242 	ret = 0;
5243 out:
5244 	return ret;
5245 
5246 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5247 out_ip6_prohibit_entry:
5248 	kfree(net->ipv6.ip6_prohibit_entry);
5249 out_ip6_null_entry:
5250 	kfree(net->ipv6.ip6_null_entry);
5251 #endif
5252 out_fib6_null_entry:
5253 	kfree(net->ipv6.fib6_null_entry);
5254 out_ip6_dst_entries:
5255 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5256 out_ip6_dst_ops:
5257 	goto out;
5258 }
5259 
5260 static void __net_exit ip6_route_net_exit(struct net *net)
5261 {
5262 	kfree(net->ipv6.fib6_null_entry);
5263 	kfree(net->ipv6.ip6_null_entry);
5264 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5265 	kfree(net->ipv6.ip6_prohibit_entry);
5266 	kfree(net->ipv6.ip6_blk_hole_entry);
5267 #endif
5268 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5269 }
5270 
5271 static int __net_init ip6_route_net_init_late(struct net *net)
5272 {
5273 #ifdef CONFIG_PROC_FS
5274 	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5275 			sizeof(struct ipv6_route_iter));
5276 	proc_create_net_single("rt6_stats", 0444, net->proc_net,
5277 			rt6_stats_seq_show, NULL);
5278 #endif
5279 	return 0;
5280 }
5281 
5282 static void __net_exit ip6_route_net_exit_late(struct net *net)
5283 {
5284 #ifdef CONFIG_PROC_FS
5285 	remove_proc_entry("ipv6_route", net->proc_net);
5286 	remove_proc_entry("rt6_stats", net->proc_net);
5287 #endif
5288 }
5289 
5290 static struct pernet_operations ip6_route_net_ops = {
5291 	.init = ip6_route_net_init,
5292 	.exit = ip6_route_net_exit,
5293 };
5294 
5295 static int __net_init ipv6_inetpeer_init(struct net *net)
5296 {
5297 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5298 
5299 	if (!bp)
5300 		return -ENOMEM;
5301 	inet_peer_base_init(bp);
5302 	net->ipv6.peers = bp;
5303 	return 0;
5304 }
5305 
5306 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5307 {
5308 	struct inet_peer_base *bp = net->ipv6.peers;
5309 
5310 	net->ipv6.peers = NULL;
5311 	inetpeer_invalidate_tree(bp);
5312 	kfree(bp);
5313 }
5314 
5315 static struct pernet_operations ipv6_inetpeer_ops = {
5316 	.init	=	ipv6_inetpeer_init,
5317 	.exit	=	ipv6_inetpeer_exit,
5318 };
5319 
5320 static struct pernet_operations ip6_route_net_late_ops = {
5321 	.init = ip6_route_net_init_late,
5322 	.exit = ip6_route_net_exit_late,
5323 };
5324 
5325 static struct notifier_block ip6_route_dev_notifier = {
5326 	.notifier_call = ip6_route_dev_notify,
5327 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5328 };
5329 
5330 void __init ip6_route_init_special_entries(void)
5331 {
5332 	/* The loopback device is registered before this code runs, so the
5333 	 * loopback reference in rt6_info has not been taken; take it
5334 	 * manually for init_net. */
5335 	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5336 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5337 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5338 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5339 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5340 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5341 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5342 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5343 #endif
5344 }
5345 
5346 int __init ip6_route_init(void)
5347 {
5348 	int ret;
5349 	int cpu;
5350 
5351 	ret = -ENOMEM;
5352 	ip6_dst_ops_template.kmem_cachep =
5353 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5354 				  SLAB_HWCACHE_ALIGN, NULL);
5355 	if (!ip6_dst_ops_template.kmem_cachep)
5356 		goto out;
5357 
5358 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5359 	if (ret)
5360 		goto out_kmem_cache;
5361 
5362 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5363 	if (ret)
5364 		goto out_dst_entries;
5365 
5366 	ret = register_pernet_subsys(&ip6_route_net_ops);
5367 	if (ret)
5368 		goto out_register_inetpeer;
5369 
5370 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5371 
5372 	ret = fib6_init();
5373 	if (ret)
5374 		goto out_register_subsys;
5375 
5376 	ret = xfrm6_init();
5377 	if (ret)
5378 		goto out_fib6_init;
5379 
5380 	ret = fib6_rules_init();
5381 	if (ret)
5382 		goto xfrm6_init;
5383 
5384 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5385 	if (ret)
5386 		goto fib6_rules_init;
5387 
5388 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5389 				   inet6_rtm_newroute, NULL, 0);
5390 	if (ret < 0)
5391 		goto out_register_late_subsys;
5392 
5393 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5394 				   inet6_rtm_delroute, NULL, 0);
5395 	if (ret < 0)
5396 		goto out_register_late_subsys;
5397 
5398 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5399 				   inet6_rtm_getroute, NULL,
5400 				   RTNL_FLAG_DOIT_UNLOCKED);
5401 	if (ret < 0)
5402 		goto out_register_late_subsys;
5403 
5404 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5405 	if (ret)
5406 		goto out_register_late_subsys;
5407 
5408 	for_each_possible_cpu(cpu) {
5409 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5410 
5411 		INIT_LIST_HEAD(&ul->head);
5412 		spin_lock_init(&ul->lock);
5413 	}
5414 
5415 out:
5416 	return ret;
5417 
5418 out_register_late_subsys:
5419 	rtnl_unregister_all(PF_INET6);
5420 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5421 fib6_rules_init:
5422 	fib6_rules_cleanup();
5423 xfrm6_init:
5424 	xfrm6_fini();
5425 out_fib6_init:
5426 	fib6_gc_cleanup();
5427 out_register_subsys:
5428 	unregister_pernet_subsys(&ip6_route_net_ops);
5429 out_register_inetpeer:
5430 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5431 out_dst_entries:
5432 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5433 out_kmem_cache:
5434 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5435 	goto out;
5436 }
5437 
5438 void ip6_route_cleanup(void)
5439 {
5440 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5441 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5442 	fib6_rules_cleanup();
5443 	xfrm6_fini();
5444 	fib6_gc_cleanup();
5445 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5446 	unregister_pernet_subsys(&ip6_route_net_ops);
5447 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5448 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5449 }
5450