xref: /linux/net/ipv6/route.c (revision cd11d11286cba88aab5b1da1c83ee36e5b5cefb7)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68 
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72 
73 static int ip6_rt_type_to_error(u8 fib6_type);
74 
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79 
80 enum rt6_nud_state {
81 	RT6_NUD_FAIL_HARD = -3,
82 	RT6_NUD_FAIL_PROBE = -2,
83 	RT6_NUD_FAIL_DO_RR = -1,
84 	RT6_NUD_SUCCEED = 1
85 };
86 
87 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void		ip6_dst_destroy(struct dst_entry *);
92 static void		ip6_dst_ifdown(struct dst_entry *,
93 				       struct net_device *dev, int how);
94 static int		 ip6_dst_gc(struct dst_ops *ops);
95 
96 static int		ip6_pkt_discard(struct sk_buff *skb);
97 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int		ip6_pkt_prohibit(struct sk_buff *skb);
99 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void		ip6_link_failure(struct sk_buff *skb);
101 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102 					   struct sk_buff *skb, u32 mtu);
103 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104 					struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108 			 struct fib6_info *rt, struct dst_entry *dst,
109 			 struct in6_addr *dest, struct in6_addr *src,
110 			 int iif, int type, u32 portid, u32 seq,
111 			 unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113 					   struct in6_addr *daddr,
114 					   struct in6_addr *saddr);
115 
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118 					   const struct in6_addr *prefix, int prefixlen,
119 					   const struct in6_addr *gwaddr,
120 					   struct net_device *dev,
121 					   unsigned int pref);
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123 					   const struct in6_addr *prefix, int prefixlen,
124 					   const struct in6_addr *gwaddr,
125 					   struct net_device *dev);
126 #endif
127 
128 struct uncached_list {
129 	spinlock_t		lock;
130 	struct list_head	head;
131 };
132 
133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134 
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138 
139 	rt->rt6i_uncached_list = ul;
140 
141 	spin_lock_bh(&ul->lock);
142 	list_add_tail(&rt->rt6i_uncached, &ul->head);
143 	spin_unlock_bh(&ul->lock);
144 }
145 
146 void rt6_uncached_list_del(struct rt6_info *rt)
147 {
148 	if (!list_empty(&rt->rt6i_uncached)) {
149 		struct uncached_list *ul = rt->rt6i_uncached_list;
150 		struct net *net = dev_net(rt->dst.dev);
151 
152 		spin_lock_bh(&ul->lock);
153 		list_del(&rt->rt6i_uncached);
154 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155 		spin_unlock_bh(&ul->lock);
156 	}
157 }
158 
159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
160 {
161 	struct net_device *loopback_dev = net->loopback_dev;
162 	int cpu;
163 
164 	if (dev == loopback_dev)
165 		return;
166 
167 	for_each_possible_cpu(cpu) {
168 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
169 		struct rt6_info *rt;
170 
171 		spin_lock_bh(&ul->lock);
172 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
173 			struct inet6_dev *rt_idev = rt->rt6i_idev;
174 			struct net_device *rt_dev = rt->dst.dev;
175 
176 			if (rt_idev->dev == dev) {
177 				rt->rt6i_idev = in6_dev_get(loopback_dev);
178 				in6_dev_put(rt_idev);
179 			}
180 
181 			if (rt_dev == dev) {
182 				rt->dst.dev = loopback_dev;
183 				dev_hold(rt->dst.dev);
184 				dev_put(rt_dev);
185 			}
186 		}
187 		spin_unlock_bh(&ul->lock);
188 	}
189 }
190 
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192 					     struct sk_buff *skb,
193 					     const void *daddr)
194 {
195 	if (!ipv6_addr_any(p))
196 		return (const void *) p;
197 	else if (skb)
198 		return &ipv6_hdr(skb)->daddr;
199 	return daddr;
200 }
201 
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203 				   struct net_device *dev,
204 				   struct sk_buff *skb,
205 				   const void *daddr)
206 {
207 	struct neighbour *n;
208 
209 	daddr = choose_neigh_daddr(gw, skb, daddr);
210 	n = __ipv6_neigh_lookup(dev, daddr);
211 	if (n)
212 		return n;
213 	return neigh_create(&nd_tbl, daddr, dev);
214 }
215 
216 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
217 					      struct sk_buff *skb,
218 					      const void *daddr)
219 {
220 	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
221 
222 	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
223 }
224 
225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
226 {
227 	struct net_device *dev = dst->dev;
228 	struct rt6_info *rt = (struct rt6_info *)dst;
229 
230 	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
231 	if (!daddr)
232 		return;
233 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
234 		return;
235 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
236 		return;
237 	__ipv6_confirm_neigh(dev, daddr);
238 }
239 
/* dst_ops template for regular IPv6 routes; each netns gets its own copy. */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
258 
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
260 {
261 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
262 
263 	return mtu ? : dst->dev->mtu;
264 }
265 
/* Intentional no-op: blackhole routes ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
270 
/* Intentional no-op: blackhole routes ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
275 
/* dst_ops for blackhole copies made by ip6_blackhole_route(): no gc,
 * and PMTU/redirect events are silently discarded.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
287 
/* Metrics for the template routes below; hop limit 0 = use the default. */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
291 
/* Template for the per-netns fib6_null_entry: the "no route" sentinel
 * returned by FIB lookups that find nothing.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
300 
/* Template for the per-netns ip6_null_entry dst: drops packets with
 * -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
312 
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314 
/* Template dst for "prohibit" policy-routing actions: -EACCES. */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
326 
/* Template dst for "blackhole" policy-routing actions: silent discard. */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
338 
339 #endif
340 
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* zero everything after the embedded dst_entry; dst_alloc() has
	 * already initialized the dst itself
	 */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
348 
349 /* allocate dst with ip6_dst_ops */
350 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
351 			       int flags)
352 {
353 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
354 					1, DST_OBSOLETE_FORCE_CHK, flags);
355 
356 	if (rt) {
357 		rt6_info_init(rt);
358 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
359 	}
360 
361 	return rt;
362 }
363 EXPORT_SYMBOL(ip6_dst_alloc);
364 
/* dst_ops->destroy: release everything a cached rt6_info holds. */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	/* drop our share of refcounted metrics (never the static default) */
	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
		kfree(p);

	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* detach from the originating fib6_info under rcu and release
	 * the reference taken in rt6_set_from()
	 */
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
389 
/* dst_ops->ifdown: @dev is going away, so re-point the idev reference
 * at the netns loopback device.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
406 
407 static bool __rt6_check_expired(const struct rt6_info *rt)
408 {
409 	if (rt->rt6i_flags & RTF_EXPIRES)
410 		return time_after(jiffies, rt->dst.expires);
411 	else
412 		return false;
413 }
414 
/* Has this cached route, or the fib6 entry it was cloned from, expired?
 * rt->from is rcu-protected, so the caller holds rcu_read_lock.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		/* no per-route expiry: stale if the dst was obsoleted or
		 * the parent fib entry expired
		 */
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
430 
/* Hash-threshold multipath selection: pick the sibling whose upper
 * bound covers fl6->mp_hash, falling back to @match when the chosen
 * sibling scores negatively.
 */
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* unusable nexthop: keep the original @match */
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
463 
464 /*
465  *	Route lookup. rcu_read_lock() should be held.
466  */
467 
/* Among the routes of one fib6 node, find the one matching the output
 * interface (@oif) or the source address (@saddr).  Falls back to @rt
 * itself, or to the null entry under strict interface matching.
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct fib6_info *sprt;

	/* no device or source constraint: first route wins if usable */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	/* strict interface match requested but nothing matched */
	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
501 
502 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work context for a router reachability probe. */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;	/* gateway address to solicit */
	struct net_device *dev;	/* held; released by rt6_probe_deferred() */
};
508 
/* Process-context half of rt6_probe(): send the neighbour solicitation. */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);	/* ref taken by rt6_probe() */
	kfree(work);
}
520 
/* Kick off a rate-limited reachability probe of @rt's gateway when its
 * neighbour entry is missing or not in a VALID state.
 */
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* NOTE(review): idev is dereferenced below without a NULL
		 * check — presumably a device carrying an IPv6 gateway route
		 * always has an inet6_dev; confirm.
		 */
		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		/* rate limit: re-check under the lock and require the probe
		 * interval to have elapsed since the last neighbour update
		 */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);	/* released by rt6_probe_deferred() */
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
575 #else
/* No-op when router-preference support (and thus probing) is disabled. */
static inline void rt6_probe(struct fib6_info *rt)
{
}
579 #endif
580 
581 /*
582  * Default Router Selection (RFC 2461 6.3.6)
583  */
584 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
585 {
586 	const struct net_device *dev = rt->fib6_nh.nh_dev;
587 
588 	if (!oif || dev->ifindex == oif)
589 		return 2;
590 	return 0;
591 }
592 
/* Classify nexthop reachability from the neighbour cache.  Returns
 * RT6_NUD_SUCCEED when no check is needed or the neighbour looks
 * usable, otherwise one of the RT6_NUD_FAIL_* codes.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* no neighbour entry: with router preference we accept and
		 * probe later; without it, request round-robin
		 */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
624 
/* Score a route for default-router selection; higher is better.
 * Negative return values are RT6_NUD_FAIL_* codes.
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* fold the RFC 4191 router preference into the score */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
642 
/* called with rcu_read_lock held */
644 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
645 {
646 	const struct net_device *dev = fib6_info_nh_dev(f6i);
647 	bool rc = false;
648 
649 	if (dev) {
650 		const struct inet6_dev *idev = __in6_dev_get(dev);
651 
652 		rc = !!idev->cnf.ignore_routes_with_linkdown;
653 	}
654 
655 	return rc;
656 }
657 
/* Compare @rt against the current best @match; returns the better of
 * the two and updates *mpri (best score) and *do_rr (round-robin
 * request) accordingly.
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	/* skip link-down nexthops unless the caller opted out */
	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
696 
/* Scan the routes of a fib6 node that share @metric, starting at
 * @rr_head and wrapping around via @leaf, picking the best match.
 * Routes with a different metric (remembered in @cont) are only
 * considered if no same-metric route matched.
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* first segment: from the round-robin head to the end */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* second segment: from the leaf up to (not including) rr_head */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* fall back to the remaining, different-metric routes */
	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
735 
/* Select the best route from fib6 node @fn, applying round-robin among
 * equally good default routers when requested by find_match().
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
785 
786 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
787 {
788 	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
789 }
790 
791 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option received in a Router
 * Advertisement (RFC 4191): add, refresh, or delete the corresponding
 * route depending on the advertised lifetime.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* prefix length 0 advertises a default router */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
865 #endif
866 
867 /*
868  *	Misc support functions
869  */
870 
871 /* called with rcu_lock held */
/* Resolve the device that a cached copy of @rt should use as dst->dev.
 * Called with rcu_read_lock held.
 */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
893 
/* Map from RTN_* route type to the dst error reported to users;
 * 0 means the type delivers packets normally.
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
908 
/* Translate a fib6 route type into the dst error code via fib6_prop[]. */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
913 
914 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
915 {
916 	unsigned short flags = 0;
917 
918 	if (rt->dst_nocount)
919 		flags |= DST_NOCOUNT;
920 	if (rt->dst_nopolicy)
921 		flags |= DST_NOPOLICY;
922 	if (rt->dst_host)
923 		flags |= DST_HOST;
924 
925 	return flags;
926 }
927 
/* Wire up input/output handlers and dst error for a reject route,
 * based on its fib6 type.
 */
static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}
949 
/* Initialize the dst handlers of a cached route from its fib6 origin:
 * reject routes get error handlers, everything else gets the normal
 * local/multicast/forward input paths plus any lwtunnel state.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
975 
/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	/* share @from's refcounted metrics unless it uses the static
	 * default block
	 */
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}
987 
/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	/* copy addressing/flags from the fib entry into the cached dst */
	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}
1004 
/* Walk back up the fib trie from @fn looking for the next node that
 * carries route info, descending into source-address subtrees on the
 * way.  Returns NULL at the trie root.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1022 
1023 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1024 			  bool null_fallback)
1025 {
1026 	struct rt6_info *rt = *prt;
1027 
1028 	if (dst_hold_safe(&rt->dst))
1029 		return true;
1030 	if (null_fallback) {
1031 		rt = net->ipv6.ip6_null_entry;
1032 		dst_hold(&rt->dst);
1033 	} else {
1034 		rt = NULL;
1035 	}
1036 	*prt = rt;
1037 	return false;
1038 }
1039 
/* Create a dst from fib entry @rt; called with rcu_read_lock held.
 * Returns NULL if @rt is going away or allocation fails.
 */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);
	else
		fib6_info_release(rt);	/* drop the hold taken above */

	return nrt;
}
1058 
/* Core policy-route lookup: find the best fib6 entry in @table for
 * @fl6, then return a held dst — a cached exception if one exists,
 * otherwise a freshly created copy, falling back to the null entry.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	/* nothing usable at this node: backtrack up the trie */
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}
1114 
/* Public lookup entry point: applies policy rules on top of
 * ip6_pol_route_lookup().
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1121 
1122 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1123 			    const struct in6_addr *saddr, int oif,
1124 			    const struct sk_buff *skb, int strict)
1125 {
1126 	struct flowi6 fl6 = {
1127 		.flowi6_oif = oif,
1128 		.daddr = *daddr,
1129 	};
1130 	struct dst_entry *dst;
1131 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1132 
1133 	if (saddr) {
1134 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1135 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1136 	}
1137 
1138 	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1139 	if (dst->error == 0)
1140 		return (struct rt6_info *) dst;
1141 
1142 	dst_release(dst);
1143 
1144 	return NULL;
1145 }
1146 EXPORT_SYMBOL(rt6_lookup);
1147 
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * The caller must hold a reference on the route before calling this.
 */
1153 
1154 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1155 			struct netlink_ext_ack *extack)
1156 {
1157 	int err;
1158 	struct fib6_table *table;
1159 
1160 	table = rt->fib6_table;
1161 	spin_lock_bh(&table->tb6_lock);
1162 	err = fib6_add(&table->tb6_root, rt, info, extack);
1163 	spin_unlock_bh(&table->tb6_lock);
1164 
1165 	return err;
1166 }
1167 
1168 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1169 {
1170 	struct nl_info info = {	.nl_net = net, };
1171 
1172 	return __ip6_ins_rt(rt, &info, NULL);
1173 }
1174 
/* Clone @ort into a host-specific (/128) RTF_CACHE dst for
 * (@daddr, @saddr).  Takes a reference on @ort, dropped again if the
 * dst allocation fails; the returned dst carries its own refcount.
 * Runs under rcu_read_lock() — see ip6_rt_get_dev_rcu() and the
 * callers in this file (e.g. ip6_pol_route()).
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* Non-/128 origin route matching the destination address
		 * itself marks the clone as anycast.
		 */
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1216 
/* Allocate a per-cpu dst copy of @rt.  Takes a reference on @rt
 * (dropped if the dst allocation fails); returns NULL on failure.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	/* RCU protects the device returned by ip6_rt_get_dev_rcu(). */
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1238 
1239 /* It should be called with rcu_read_lock() acquired */
1240 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1241 {
1242 	struct rt6_info *pcpu_rt, **p;
1243 
1244 	p = this_cpu_ptr(rt->rt6i_pcpu);
1245 	pcpu_rt = *p;
1246 
1247 	if (pcpu_rt)
1248 		ip6_hold_safe(NULL, &pcpu_rt, false);
1249 
1250 	return pcpu_rt;
1251 }
1252 
/* Populate this cpu's empty pcpu slot for @rt and return the new dst
 * with an extra reference for the caller; falls back to the refcounted
 * null entry if allocation fails.  The caller runs with BHs disabled
 * (see ip6_pol_route()) after rt6_get_pcpu_route() found the slot
 * empty, so the cmpxchg() must not observe a previous value.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	/* One reference for the caller; the slot keeps the alloc ref. */
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
1271 
/* Exception (RTF_CACHE) hash table implementation.
 * A single global lock serializes all writers to every per-route
 * exception bucket; readers walk the chains under RCU.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1275 
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	/* Unlink under RCU; the entry itself is freed only after a grace
	 * period so concurrent lockless readers can finish the chain walk.
	 */
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}
1295 
1296 /* Remove oldest rt6_ex in bucket and free the memory
1297  * Caller must hold rt6_exception_lock
1298  */
1299 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1300 {
1301 	struct rt6_exception *rt6_ex, *oldest = NULL;
1302 
1303 	if (!bucket)
1304 		return;
1305 
1306 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1307 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1308 			oldest = rt6_ex;
1309 	}
1310 	rt6_remove_exception(bucket, oldest);
1311 }
1312 
1313 static u32 rt6_exception_hash(const struct in6_addr *dst,
1314 			      const struct in6_addr *src)
1315 {
1316 	static u32 seed __read_mostly;
1317 	u32 val;
1318 
1319 	net_get_random_once(&seed, sizeof(seed));
1320 	val = jhash(dst, sizeof(*dst), seed);
1321 
1322 #ifdef CONFIG_IPV6_SUBTREES
1323 	if (src)
1324 		val = jhash(src, sizeof(*src), val);
1325 #endif
1326 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1327 }
1328 
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 *
 * Note: *bucket is advanced in place by the hash value, so on return
 * the caller's pointer addresses the slot for (daddr, saddr) even
 * when no entry was found.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1361 
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 *
 * Lockless twin of __rt6_find_exception_spinlock(): same contract
 * (including the in-place advance of *bucket), but walks the chain
 * with the RCU-safe iterator.
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1396 
1397 static unsigned int fib6_mtu(const struct fib6_info *rt)
1398 {
1399 	unsigned int mtu;
1400 
1401 	if (rt->fib6_pmtu) {
1402 		mtu = rt->fib6_pmtu;
1403 	} else {
1404 		struct net_device *dev = fib6_info_nh_dev(rt);
1405 		struct inet6_dev *idev;
1406 
1407 		rcu_read_lock();
1408 		idev = __in6_dev_get(dev);
1409 		mtu = idev->cnf.mtu6;
1410 		rcu_read_unlock();
1411 	}
1412 
1413 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1414 
1415 	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1416 }
1417 
/* Insert the cached clone @nrt into @ort's exception table, creating
 * the bucket array on first use.  An existing entry for the same
 * (daddr[, saddr]) key is replaced; over-deep buckets are trimmed by
 * evicting the oldest entry.  Returns 0 on success or a negative error
 * (-EINVAL if the table was already flushed or @nrt's MTU would not
 * shrink the path, -ENOMEM on allocation failure).
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* rt6_flush_exceptions() set this flag; do not resurrect the
	 * bucket list for a route that is on its way out.
	 */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1497 
/* Drop every cached exception hanging off @rt and mark the route so
 * that rt6_insert_exception() refuses to recreate the bucket list.
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		/* Each bucket's depth counter must reach zero once empty. */
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1524 
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 *
 * Returns the matching, non-expired cached clone or NULL.  No
 * reference is taken; the caller must do so before leaving the RCU
 * read section (see ip6_pol_route()).
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1556 
/* Remove the passed in cached rt from the hash table that contains it.
 * Returns 0 on success, -EINVAL if @rt is not a cached route or has no
 * parent fib entry, -ENOENT if no matching exception exists.
 * rt->from is rcu_dereference()d, so this runs inside the caller's RCU
 * read section (see e.g. ip6_negative_advice()).
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1600 
1601 /* Find rt6_ex which contains the passed in rt cache and
1602  * refresh its stamp
1603  */
1604 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1605 {
1606 	struct rt6_exception_bucket *bucket;
1607 	struct fib6_info *from = rt->from;
1608 	struct in6_addr *src_key = NULL;
1609 	struct rt6_exception *rt6_ex;
1610 
1611 	if (!from ||
1612 	    !(rt->rt6i_flags & RTF_CACHE))
1613 		return;
1614 
1615 	rcu_read_lock();
1616 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1617 
1618 #ifdef CONFIG_IPV6_SUBTREES
1619 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1620 	 * and exception table is indexed by a hash of
1621 	 * both rt6i_dst and rt6i_src.
1622 	 * Otherwise, the exception table is indexed by
1623 	 * a hash of only rt6i_dst.
1624 	 */
1625 	if (from->fib6_src.plen)
1626 		src_key = &rt->rt6i_src.addr;
1627 #endif
1628 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1629 					  &rt->rt6i_dst.addr,
1630 					  src_key);
1631 	if (rt6_ex)
1632 		rt6_ex->stamp = jiffies;
1633 
1634 	rcu_read_unlock();
1635 }
1636 
1637 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1638 					 struct rt6_info *rt, int mtu)
1639 {
1640 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1641 	 * lowest MTU in the path: always allow updating the route PMTU to
1642 	 * reflect PMTU decreases.
1643 	 *
1644 	 * If the new MTU is higher, and the route PMTU is equal to the local
1645 	 * MTU, this means the old MTU is the lowest in the path, so allow
1646 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1647 	 * handle this.
1648 	 */
1649 
1650 	if (dst_mtu(&rt->dst) >= mtu)
1651 		return true;
1652 
1653 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1654 		return true;
1655 
1656 	return false;
1657 }
1658 
/* Propagate a device MTU change (@mtu) to @rt's cached exceptions,
 * subject to the policy in rt6_mtu_change_route_allowed().
 * Caller must hold rt6_exception_lock (rcu_dereference_protected).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1687 
/* An exception is "via a gateway" only when both bits are set. */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Drop every cached exception under @rt that goes via @gateway, e.g.
 * after the gateway stopped being usable as a router.
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1724 
/* GC callback for one exception entry: prune it if it aged out, its
 * expiry passed, or its gateway neighbour is no longer a router;
 * otherwise count it so the GC timer keeps running.
 * Caller holds rt6_exception_lock and the RCU-BH read lock (for the
 * lockless neighbour lookup) — see rt6_age_exceptions().
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	/* Entry survives this pass: tell the GC there is more to do. */
	gc_args->more++;
}
1768 
/* Run the GC pass (rt6_age_examine_exception()) over every exception
 * bucket of @rt.  Takes the RCU-BH read lock for the neighbour lookups
 * and the exception lock for the removals.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1799 
/* must be called with rcu lock held
 *
 * Select the best fib6_info in @table for the flow: rt6_select() at the
 * most specific node, backtracking toward less specific nodes, and —
 * if nothing reachable was found — one more pass from the original
 * node without the reachability requirement.  Returns the null entry
 * (never NULL) when no route matches.
 */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
1831 
/* Full policy route resolution for @fl6 in @table.  Resolves the fib
 * entry via fib6_table_lookup(), then returns (always with a reference
 * held) one of: a cached exception dst, an uncached RTF_CACHE clone
 * for the FLOWI_FLAG_KNOWN_NH case, a per-cpu dst, or the null entry.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* A non-forwarding host prefers reachable routers (RFC-style
	 * default router selection, see file header).
	 */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* BHs disabled so the pcpu slot cannot be raced from
		 * softirq context on this cpu (see rt6_make_pcpu_route()).
		 */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1908 EXPORT_SYMBOL_GPL(ip6_pol_route);
1909 
1910 static struct rt6_info *ip6_pol_route_input(struct net *net,
1911 					    struct fib6_table *table,
1912 					    struct flowi6 *fl6,
1913 					    const struct sk_buff *skb,
1914 					    int flags)
1915 {
1916 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1917 }
1918 
1919 struct dst_entry *ip6_route_input_lookup(struct net *net,
1920 					 struct net_device *dev,
1921 					 struct flowi6 *fl6,
1922 					 const struct sk_buff *skb,
1923 					 int flags)
1924 {
1925 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1926 		flags |= RT6_LOOKUP_F_IFACE;
1927 
1928 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1929 }
1930 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1931 
/* Fill @keys with the L3 fields used for multipath hashing.  For ICMPv6
 * error messages the embedded (inner) header is hashed instead of the
 * outer one, so errors follow the same path as the flow that triggered
 * them; pre-dissected @flkeys are only usable for non-error packets.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* Only ICMPv6 errors carry the offending packet's header. */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
1979 
1980 /* if skb is set it will be used and fl6 can be NULL */
1981 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1982 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1983 {
1984 	struct flow_keys hash_keys;
1985 	u32 mhash;
1986 
1987 	switch (ip6_multipath_hash_policy(net)) {
1988 	case 0:
1989 		memset(&hash_keys, 0, sizeof(hash_keys));
1990 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1991 		if (skb) {
1992 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1993 		} else {
1994 			hash_keys.addrs.v6addrs.src = fl6->saddr;
1995 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
1996 			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
1997 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
1998 		}
1999 		break;
2000 	case 1:
2001 		if (skb) {
2002 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2003 			struct flow_keys keys;
2004 
2005 			/* short-circuit if we already have L4 hash present */
2006 			if (skb->l4_hash)
2007 				return skb_get_hash_raw(skb) >> 1;
2008 
2009 			memset(&hash_keys, 0, sizeof(hash_keys));
2010 
2011                         if (!flkeys) {
2012 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2013 				flkeys = &keys;
2014 			}
2015 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2016 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2017 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2018 			hash_keys.ports.src = flkeys->ports.src;
2019 			hash_keys.ports.dst = flkeys->ports.dst;
2020 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2021 		} else {
2022 			memset(&hash_keys, 0, sizeof(hash_keys));
2023 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2024 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2025 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2026 			hash_keys.ports.src = fl6->fl6_sport;
2027 			hash_keys.ports.dst = fl6->fl6_dport;
2028 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2029 		}
2030 		break;
2031 	}
2032 	mhash = flow_hash_from_keys(&hash_keys);
2033 
2034 	return mhash >> 1;
2035 }
2036 
/* Receive-path entry point: build a flow from the packet headers and
 * attach the looked-up dst to @skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* Carry the tunnel id into the lookup for collect_md devices. */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* ICMPv6 errors hash on the embedded flow so they follow the
	 * same multipath route as the original traffic.
	 */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2066 
2067 static struct rt6_info *ip6_pol_route_output(struct net *net,
2068 					     struct fib6_table *table,
2069 					     struct flowi6 *fl6,
2070 					     const struct sk_buff *skb,
2071 					     int flags)
2072 {
2073 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2074 }
2075 
/* Output-path route lookup for locally generated traffic.  Multicast
 * and link-local destinations may be resolved by an L3 master device
 * first; otherwise the flow goes through the policy-rule engine with
 * flags derived from the socket and source address state.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	/* Bound sockets, scoped destinations, and oif-with-unspecified
	 * source all require the device to match.
	 */
	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2105 
/* Replace @dst_orig with a blackhole dst that keeps its addressing and
 * metrics but discards all traffic (input/output are dst_discard).
 * Releases the reference on @dst_orig; returns the new dst or
 * ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* Not a per-cpu copy even if the original was one. */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2138 
2139 /*
2140  *	Destination cache support functions
2141  */
2142 
2143 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2144 {
2145 	u32 rt_cookie = 0;
2146 
2147 	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2148 		return false;
2149 
2150 	if (fib6_check_expired(f6i))
2151 		return false;
2152 
2153 	return true;
2154 }
2155 
2156 static struct dst_entry *rt6_check(struct rt6_info *rt,
2157 				   struct fib6_info *from,
2158 				   u32 cookie)
2159 {
2160 	u32 rt_cookie = 0;
2161 
2162 	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2163 	    rt_cookie != cookie)
2164 		return NULL;
2165 
2166 	if (rt6_check_expired(rt))
2167 		return NULL;
2168 
2169 	return &rt->dst;
2170 }
2171 
2172 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2173 					    struct fib6_info *from,
2174 					    u32 cookie)
2175 {
2176 	if (!__rt6_check_expired(rt) &&
2177 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2178 	    fib6_check(from, cookie))
2179 		return &rt->dst;
2180 	else
2181 		return NULL;
2182 }
2183 
/* dst_ops->check: return @dst if it is still valid for @cookie,
 * NULL when the caller must redo the route lookup.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	/* Per-cpu copies and uncached routes are validated through
	 * their fib entry; everything else is checked directly against
	 * the cookie and its own expiry.
	 */
	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2211 
/* dst_ops->negative_advice: a socket suspects its cached route is
 * stale.  Expired RTF_CACHE exception entries are removed from the
 * exception table; non-cached entries are released outright so the
 * caller performs a fresh lookup.  Returns the dst to keep, or NULL.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2231 
/* dst_ops->link_failure: neighbour resolution failed on this route.
 * Send an address-unreachable ICMPv6 error back to the sender and
 * invalidate the route: cached exceptions are removed, while default
 * routes poison their fib node's serial number so cached dsts are
 * revalidated and another router can be chosen.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			/* take a reference so the entry survives the
			 * removal from the exception table
			 */
			if (dst_hold_safe(&rt->dst))
				rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				/* -1 can never match a real sernum, so
				 * every cached dst on this node becomes
				 * stale
				 */
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2258 
/* Arm the expiry timer on cached route @rt0 for @timeout jiffies.
 * If the route did not already carry RTF_EXPIRES, first seed
 * ->dst.expires from the parent fib entry so dst_set_expires() has
 * the inherited deadline as its baseline.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2274 
2275 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2276 {
2277 	struct net *net = dev_net(rt->dst.dev);
2278 
2279 	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2280 	rt->rt6i_flags |= RTF_MODIFIED;
2281 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2282 }
2283 
2284 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2285 {
2286 	bool from_set;
2287 
2288 	rcu_read_lock();
2289 	from_set = !!rcu_dereference(rt->from);
2290 	rcu_read_unlock();
2291 
2292 	return !(rt->rt6i_flags & RTF_CACHE) &&
2293 		(rt->rt6i_flags & RTF_PCPU || from_set);
2294 }
2295 
2296 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2297 				 const struct ipv6hdr *iph, u32 mtu)
2298 {
2299 	const struct in6_addr *daddr, *saddr;
2300 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2301 
2302 	if (dst_metric_locked(dst, RTAX_MTU))
2303 		return;
2304 
2305 	if (iph) {
2306 		daddr = &iph->daddr;
2307 		saddr = &iph->saddr;
2308 	} else if (sk) {
2309 		daddr = &sk->sk_v6_daddr;
2310 		saddr = &inet6_sk(sk)->saddr;
2311 	} else {
2312 		daddr = NULL;
2313 		saddr = NULL;
2314 	}
2315 	dst_confirm_neigh(dst, daddr);
2316 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2317 	if (mtu >= dst_mtu(dst))
2318 		return;
2319 
2320 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2321 		rt6_do_update_pmtu(rt6, mtu);
2322 		/* update rt6_ex->stamp for cache */
2323 		if (rt6->rt6i_flags & RTF_CACHE)
2324 			rt6_update_exception_stamp_rt(rt6);
2325 	} else if (daddr) {
2326 		struct fib6_info *from;
2327 		struct rt6_info *nrt6;
2328 
2329 		rcu_read_lock();
2330 		from = rcu_dereference(rt6->from);
2331 		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2332 		if (nrt6) {
2333 			rt6_do_update_pmtu(nrt6, mtu);
2334 			if (rt6_insert_exception(nrt6, from))
2335 				dst_release_immediate(&nrt6->dst);
2336 		}
2337 		rcu_read_unlock();
2338 	}
2339 }
2340 
2341 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2342 			       struct sk_buff *skb, u32 mtu)
2343 {
2344 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2345 }
2346 
2347 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2348 		     int oif, u32 mark, kuid_t uid)
2349 {
2350 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2351 	struct dst_entry *dst;
2352 	struct flowi6 fl6;
2353 
2354 	memset(&fl6, 0, sizeof(fl6));
2355 	fl6.flowi6_oif = oif;
2356 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2357 	fl6.daddr = iph->daddr;
2358 	fl6.saddr = iph->saddr;
2359 	fl6.flowlabel = ip6_flowinfo(iph);
2360 	fl6.flowi6_uid = uid;
2361 
2362 	dst = ip6_route_output(net, NULL, &fl6);
2363 	if (!dst->error)
2364 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2365 	dst_release(dst);
2366 }
2367 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2368 
/* Update PMTU for @sk's flow, then, if the socket's cached dst fails
 * revalidation against its cookie, refresh the datagram socket route.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	/* nothing more to do while the cached dst is still valid */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	/* skip sockets currently owned by a user context and
	 * v4-mapped destinations (handled by the IPv4 path)
	 */
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2387 
/* Store @dst in @sk's dst cache via ip6_dst_store().  The destination
 * (and, under CONFIG_IPV6_SUBTREES, source) address pointer is only
 * passed along when the flow's address matches the socket's own.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2404 
/* Handle redirects: a flowi6 extended with the announcing router's
 * address so __ip6_route_redirect() can match against the gateway.
 * fl6 must stay the first member — the struct is passed around as a
 * plain struct flowi6 and cast back.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2410 
2411 static struct rt6_info *__ip6_route_redirect(struct net *net,
2412 					     struct fib6_table *table,
2413 					     struct flowi6 *fl6,
2414 					     const struct sk_buff *skb,
2415 					     int flags)
2416 {
2417 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2418 	struct rt6_info *ret = NULL, *rt_cache;
2419 	struct fib6_info *rt;
2420 	struct fib6_node *fn;
2421 
2422 	/* Get the "current" route for this destination and
2423 	 * check if the redirect has come from appropriate router.
2424 	 *
2425 	 * RFC 4861 specifies that redirects should only be
2426 	 * accepted if they come from the nexthop to the target.
2427 	 * Due to the way the routes are chosen, this notion
2428 	 * is a bit fuzzy and one might need to check all possible
2429 	 * routes.
2430 	 */
2431 
2432 	rcu_read_lock();
2433 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2434 restart:
2435 	for_each_fib6_node_rt_rcu(fn) {
2436 		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2437 			continue;
2438 		if (fib6_check_expired(rt))
2439 			continue;
2440 		if (rt->fib6_flags & RTF_REJECT)
2441 			break;
2442 		if (!(rt->fib6_flags & RTF_GATEWAY))
2443 			continue;
2444 		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2445 			continue;
2446 		/* rt_cache's gateway might be different from its 'parent'
2447 		 * in the case of an ip redirect.
2448 		 * So we keep searching in the exception table if the gateway
2449 		 * is different.
2450 		 */
2451 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2452 			rt_cache = rt6_find_cached_rt(rt,
2453 						      &fl6->daddr,
2454 						      &fl6->saddr);
2455 			if (rt_cache &&
2456 			    ipv6_addr_equal(&rdfl->gateway,
2457 					    &rt_cache->rt6i_gateway)) {
2458 				ret = rt_cache;
2459 				break;
2460 			}
2461 			continue;
2462 		}
2463 		break;
2464 	}
2465 
2466 	if (!rt)
2467 		rt = net->ipv6.fib6_null_entry;
2468 	else if (rt->fib6_flags & RTF_REJECT) {
2469 		ret = net->ipv6.ip6_null_entry;
2470 		goto out;
2471 	}
2472 
2473 	if (rt == net->ipv6.fib6_null_entry) {
2474 		fn = fib6_backtrack(fn, &fl6->saddr);
2475 		if (fn)
2476 			goto restart;
2477 	}
2478 
2479 out:
2480 	if (ret)
2481 		ip6_hold_safe(net, &ret, true);
2482 	else
2483 		ret = ip6_create_rt_rcu(rt);
2484 
2485 	rcu_read_unlock();
2486 
2487 	trace_fib6_table_lookup(net, rt, table, fl6);
2488 	return ret;
2489 };
2490 
2491 static struct dst_entry *ip6_route_redirect(struct net *net,
2492 					    const struct flowi6 *fl6,
2493 					    const struct sk_buff *skb,
2494 					    const struct in6_addr *gateway)
2495 {
2496 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2497 	struct ip6rd_flowi rdfl;
2498 
2499 	rdfl.fl6 = *fl6;
2500 	rdfl.gateway = *gateway;
2501 
2502 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2503 				flags, __ip6_route_redirect);
2504 }
2505 
2506 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2507 		  kuid_t uid)
2508 {
2509 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2510 	struct dst_entry *dst;
2511 	struct flowi6 fl6;
2512 
2513 	memset(&fl6, 0, sizeof(fl6));
2514 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2515 	fl6.flowi6_oif = oif;
2516 	fl6.flowi6_mark = mark;
2517 	fl6.daddr = iph->daddr;
2518 	fl6.saddr = iph->saddr;
2519 	fl6.flowlabel = ip6_flowinfo(iph);
2520 	fl6.flowi6_uid = uid;
2521 
2522 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2523 	rt6_do_redirect(dst, NULL, skb);
2524 	dst_release(dst);
2525 }
2526 EXPORT_SYMBOL_GPL(ip6_redirect);
2527 
2528 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2529 			    u32 mark)
2530 {
2531 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2532 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2533 	struct dst_entry *dst;
2534 	struct flowi6 fl6;
2535 
2536 	memset(&fl6, 0, sizeof(fl6));
2537 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2538 	fl6.flowi6_oif = oif;
2539 	fl6.flowi6_mark = mark;
2540 	fl6.daddr = msg->dest;
2541 	fl6.saddr = iph->daddr;
2542 	fl6.flowi6_uid = sock_net_uid(net, NULL);
2543 
2544 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2545 	rt6_do_redirect(dst, NULL, skb);
2546 	dst_release(dst);
2547 }
2548 
2549 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2550 {
2551 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2552 		     sk->sk_uid);
2553 }
2554 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2555 
2556 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2557 {
2558 	struct net_device *dev = dst->dev;
2559 	unsigned int mtu = dst_mtu(dst);
2560 	struct net *net = dev_net(dev);
2561 
2562 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2563 
2564 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2565 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2566 
2567 	/*
2568 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2569 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2570 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2571 	 * rely only on pmtu discovery"
2572 	 */
2573 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2574 		mtu = IPV6_MAXPLEN;
2575 	return mtu;
2576 }
2577 
2578 static unsigned int ip6_mtu(const struct dst_entry *dst)
2579 {
2580 	struct inet6_dev *idev;
2581 	unsigned int mtu;
2582 
2583 	mtu = dst_metric_raw(dst, RTAX_MTU);
2584 	if (mtu)
2585 		goto out;
2586 
2587 	mtu = IPV6_MIN_MTU;
2588 
2589 	rcu_read_lock();
2590 	idev = __in6_dev_get(dst->dev);
2591 	if (idev)
2592 		mtu = idev->cnf.mtu6;
2593 	rcu_read_unlock();
2594 
2595 out:
2596 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2597 
2598 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2599 }
2600 
2601 /* MTU selection:
2602  * 1. mtu on route is locked - use it
2603  * 2. mtu from nexthop exception
2604  * 3. mtu from egress device
2605  *
2606  * based on ip6_dst_mtu_forward and exception logic of
2607  * rt6_find_cached_rt; called with rcu_read_lock
2608  */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct in6_addr *src_key;
	struct inet6_dev *idev;
	u32 mtu = 0;

	/* 1. a locked MTU metric on the route always wins */
	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	/* exceptions are keyed by source address only for
	 * source-specific routes
	 */
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	/* 2. MTU from an unexpired nexthop exception entry */
	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	/* 3. fall back to the egress device's IPv6 MTU, floored at
	 * IPV6_MIN_MTU
	 */
	if (likely(!mtu)) {
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	/* leave room for any lightweight-tunnel encapsulation */
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}
2648 
/* Allocate a transient, uncached host route on @dev towards
 * fl6->daddr for an outgoing ndisc/ICMPv6 packet.  The result
 * (possibly wrapped by xfrm) must be released by the caller; errors
 * are returned as ERR_PTR.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	/* next hop is the destination itself (on-link) */
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	/* hands over the in6_dev_get() reference taken above */
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2687 
/* dst_ops->gc: run the fib6 garbage collector once the dst count
 * exceeds ip6_rt_max_size and the minimum GC interval has elapsed.
 * Returns non-zero while the count is still above the limit.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	/* nothing to do while under the limit and inside the minimum
	 * GC interval
	 */
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* GC pressure grows each round so back-to-back passes become
	 * progressively more aggressive; it is reset once the table
	 * drops below gc_thresh
	 */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* exponential decay of the pressure, rate set by elasticity */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2712 
2713 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2714 			       struct fib6_config *cfg)
2715 {
2716 	struct dst_metrics *p;
2717 
2718 	if (!cfg->fc_mx)
2719 		return 0;
2720 
2721 	p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2722 	if (unlikely(!p))
2723 		return -ENOMEM;
2724 
2725 	refcount_set(&p->refcnt, 1);
2726 	rt->fib6_metrics = p;
2727 
2728 	return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2729 }
2730 
2731 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2732 					    struct fib6_config *cfg,
2733 					    const struct in6_addr *gw_addr,
2734 					    u32 tbid, int flags)
2735 {
2736 	struct flowi6 fl6 = {
2737 		.flowi6_oif = cfg->fc_ifindex,
2738 		.daddr = *gw_addr,
2739 		.saddr = cfg->fc_prefsrc,
2740 	};
2741 	struct fib6_table *table;
2742 	struct rt6_info *rt;
2743 
2744 	table = fib6_get_table(net, tbid);
2745 	if (!table)
2746 		return NULL;
2747 
2748 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2749 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2750 
2751 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2752 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2753 
2754 	/* if table lookup failed, fall back to full lookup */
2755 	if (rt == net->ipv6.ip6_null_entry) {
2756 		ip6_rt_put(rt);
2757 		rt = NULL;
2758 	}
2759 
2760 	return rt;
2761 }
2762 
2763 static int ip6_route_check_nh_onlink(struct net *net,
2764 				     struct fib6_config *cfg,
2765 				     const struct net_device *dev,
2766 				     struct netlink_ext_ack *extack)
2767 {
2768 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2769 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2770 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2771 	struct rt6_info *grt;
2772 	int err;
2773 
2774 	err = 0;
2775 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2776 	if (grt) {
2777 		if (!grt->dst.error &&
2778 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2779 			NL_SET_ERR_MSG(extack,
2780 				       "Nexthop has invalid gateway or device mismatch");
2781 			err = -EINVAL;
2782 		}
2783 
2784 		ip6_rt_put(grt);
2785 	}
2786 
2787 	return err;
2788 }
2789 
/* Resolve the egress device for a gateway route by looking up the
 * gateway address itself.  If *_dev was supplied it must match the
 * lookup result; otherwise the resolved device and idev are returned
 * through the out-parameters with references held.  Returns 0 when
 * the gateway resolves through a directly connected (non-gateway)
 * route, -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	/* prefer a lookup confined to the table the route is being
	 * added to, but only accept a directly-connected match there
	 */
	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* hand resolved device/idev back to the caller with
		 * references held
		 */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2840 
/* Validate cfg->fc_gateway for a new route: the gateway must not be a
 * local address, non-link-local gateways must be reachable through an
 * existing route (or pass the onlink checks), and the egress device
 * must exist and not be loopback.  May resolve *_dev/*idev (with
 * references held) when they were not supplied.  Returns 0 or a
 * negative errno with extack set.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2913 
/* Build (but do not insert) a fib6_info from the netlink route config
 * @cfg: validates the request, resolves device/gateway, converts
 * metrics and lwtunnel state.  Returns the new entry (holding a
 * device reference via fib6_nh.nh_dev) or ERR_PTR on failure.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	/* resolve the requested egress device; both references are
	 * dropped on the error path below
	 */
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* onlink nexthops only need the device to exist and be up */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	/* addrconf-generated routes are not accounted in dst entries */
	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
		goto out;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* attach lightweight-tunnel encapsulation state, if any */
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	/* may resolve dev/idev from the gateway when not given above */
	if (cfg->fc_flags & RTF_GATEWAY) {
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	/* a preferred source address must be configured on the device */
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	/* nh_dev keeps the dev reference; only idev is dropped here */
	rt->fib6_nh.nh_dev = dev;
	rt->fib6_table = table;

	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	fib6_info_release(rt);
	return ERR_PTR(err);
}
3134 
3135 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3136 		  struct netlink_ext_ack *extack)
3137 {
3138 	struct fib6_info *rt;
3139 	int err;
3140 
3141 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3142 	if (IS_ERR(rt))
3143 		return PTR_ERR(rt);
3144 
3145 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3146 	fib6_info_release(rt);
3147 
3148 	return err;
3149 }
3150 
3151 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3152 {
3153 	struct net *net = info->nl_net;
3154 	struct fib6_table *table;
3155 	int err;
3156 
3157 	if (rt == net->ipv6.fib6_null_entry) {
3158 		err = -ENOENT;
3159 		goto out;
3160 	}
3161 
3162 	table = rt->fib6_table;
3163 	spin_lock_bh(&table->tb6_lock);
3164 	err = fib6_del(rt, info);
3165 	spin_unlock_bh(&table->tb6_lock);
3166 
3167 out:
3168 	fib6_info_release(rt);
3169 	return err;
3170 }
3171 
3172 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3173 {
3174 	struct nl_info info = { .nl_net = net };
3175 
3176 	return __ip6_del_rt(rt, &info);
3177 }
3178 
/* Delete @rt and, when fc_delete_all_nh is set, all of its multipath
 * siblings under a single table lock.  Tries to emit one RTM_DELROUTE
 * notification covering every hop; if building it fails, fib6_del()'s
 * per-route notifications are used instead.  Consumes the caller's
 * reference on @rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* send the batched notification outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3230 
3231 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3232 {
3233 	int rc = -ESRCH;
3234 
3235 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3236 		goto out;
3237 
3238 	if (cfg->fc_flags & RTF_GATEWAY &&
3239 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3240 		goto out;
3241 	if (dst_hold_safe(&rt->dst))
3242 		rc = rt6_remove_exception_rt(rt);
3243 out:
3244 	return rc;
3245 }
3246 
/* Handle a route delete request: locate the route described by @cfg
 * in its table and remove it.  With RTF_CACHE only the matching
 * cached exception entry is removed.  Returns 0 on success or -ESRCH
 * when no matching route exists.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* exact-match lookup; for RTF_CACHE the covering fib entry is
	 * the one carrying the exception bucket
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}
			/* every attribute supplied in the request must
			 * match this route
			 */
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* hold the entry before dropping RCU; the delete
			 * helpers consume this reference
			 */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3311 
3312 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3313 {
3314 	struct netevent_redirect netevent;
3315 	struct rt6_info *rt, *nrt = NULL;
3316 	struct ndisc_options ndopts;
3317 	struct inet6_dev *in6_dev;
3318 	struct neighbour *neigh;
3319 	struct fib6_info *from;
3320 	struct rd_msg *msg;
3321 	int optlen, on_link;
3322 	u8 *lladdr;
3323 
3324 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3325 	optlen -= sizeof(*msg);
3326 
3327 	if (optlen < 0) {
3328 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3329 		return;
3330 	}
3331 
3332 	msg = (struct rd_msg *)icmp6_hdr(skb);
3333 
3334 	if (ipv6_addr_is_multicast(&msg->dest)) {
3335 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3336 		return;
3337 	}
3338 
3339 	on_link = 0;
3340 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3341 		on_link = 1;
3342 	} else if (ipv6_addr_type(&msg->target) !=
3343 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3344 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3345 		return;
3346 	}
3347 
3348 	in6_dev = __in6_dev_get(skb->dev);
3349 	if (!in6_dev)
3350 		return;
3351 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3352 		return;
3353 
3354 	/* RFC2461 8.1:
3355 	 *	The IP source address of the Redirect MUST be the same as the current
3356 	 *	first-hop router for the specified ICMP Destination Address.
3357 	 */
3358 
3359 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3360 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3361 		return;
3362 	}
3363 
3364 	lladdr = NULL;
3365 	if (ndopts.nd_opts_tgt_lladdr) {
3366 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3367 					     skb->dev);
3368 		if (!lladdr) {
3369 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3370 			return;
3371 		}
3372 	}
3373 
3374 	rt = (struct rt6_info *) dst;
3375 	if (rt->rt6i_flags & RTF_REJECT) {
3376 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3377 		return;
3378 	}
3379 
3380 	/* Redirect received -> path was valid.
3381 	 * Look, redirects are sent only in response to data packets,
3382 	 * so that this nexthop apparently is reachable. --ANK
3383 	 */
3384 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3385 
3386 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3387 	if (!neigh)
3388 		return;
3389 
3390 	/*
3391 	 *	We have finally decided to accept it.
3392 	 */
3393 
3394 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3395 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3396 		     NEIGH_UPDATE_F_OVERRIDE|
3397 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3398 				     NEIGH_UPDATE_F_ISROUTER)),
3399 		     NDISC_REDIRECT, &ndopts);
3400 
3401 	rcu_read_lock();
3402 	from = rcu_dereference(rt->from);
3403 	/* This fib6_info_hold() is safe here because we hold reference to rt
3404 	 * and rt already holds reference to fib6_info.
3405 	 */
3406 	fib6_info_hold(from);
3407 	rcu_read_unlock();
3408 
3409 	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3410 	if (!nrt)
3411 		goto out;
3412 
3413 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3414 	if (on_link)
3415 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3416 
3417 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3418 
3419 	/* No need to remove rt from the exception table if rt is
3420 	 * a cached route because rt6_insert_exception() will
3421 	 * takes care of it
3422 	 */
3423 	if (rt6_insert_exception(nrt, from)) {
3424 		dst_release_immediate(&nrt->dst);
3425 		goto out;
3426 	}
3427 
3428 	netevent.old = &rt->dst;
3429 	netevent.new = &nrt->dst;
3430 	netevent.daddr = &msg->dest;
3431 	netevent.neigh = neigh;
3432 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3433 
3434 out:
3435 	fib6_info_release(from);
3436 	neigh_release(neigh);
3437 }
3438 
3439 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA-learned (RTF_ROUTEINFO) gateway route for @prefix/@prefixlen
 * via @gwaddr on @dev.  Returns the fib6_info with a reference held, or
 * NULL if no matching route exists.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* NB: the macro iterates with the local variable 'rt'; it is left
	 * pointing at the match (reference held) or NULL when none is found.
	 */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3475 
3476 static struct fib6_info *rt6_add_route_info(struct net *net,
3477 					   const struct in6_addr *prefix, int prefixlen,
3478 					   const struct in6_addr *gwaddr,
3479 					   struct net_device *dev,
3480 					   unsigned int pref)
3481 {
3482 	struct fib6_config cfg = {
3483 		.fc_metric	= IP6_RT_PRIO_USER,
3484 		.fc_ifindex	= dev->ifindex,
3485 		.fc_dst_len	= prefixlen,
3486 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3487 				  RTF_UP | RTF_PREF(pref),
3488 		.fc_protocol = RTPROT_RA,
3489 		.fc_type = RTN_UNICAST,
3490 		.fc_nlinfo.portid = 0,
3491 		.fc_nlinfo.nlh = NULL,
3492 		.fc_nlinfo.nl_net = net,
3493 	};
3494 
3495 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3496 	cfg.fc_dst = *prefix;
3497 	cfg.fc_gateway = *gwaddr;
3498 
3499 	/* We should treat it as a default route if prefix length is 0. */
3500 	if (!prefixlen)
3501 		cfg.fc_flags |= RTF_DEFAULT;
3502 
3503 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3504 
3505 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3506 }
3507 #endif
3508 
/* Find the RA-learned (RTF_ADDRCONF|RTF_DEFAULT) default route via @addr
 * on @dev.  Returns the fib6_info with a reference held, or NULL.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* NB: the macro iterates with the local variable 'rt' */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	/* Don't return an entry whose refcount already dropped to zero. */
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
3533 
3534 struct fib6_info *rt6_add_dflt_router(struct net *net,
3535 				     const struct in6_addr *gwaddr,
3536 				     struct net_device *dev,
3537 				     unsigned int pref)
3538 {
3539 	struct fib6_config cfg = {
3540 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3541 		.fc_metric	= IP6_RT_PRIO_USER,
3542 		.fc_ifindex	= dev->ifindex,
3543 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3544 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3545 		.fc_protocol = RTPROT_RA,
3546 		.fc_type = RTN_UNICAST,
3547 		.fc_nlinfo.portid = 0,
3548 		.fc_nlinfo.nlh = NULL,
3549 		.fc_nlinfo.nl_net = net,
3550 	};
3551 
3552 	cfg.fc_gateway = *gwaddr;
3553 
3554 	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3555 		struct fib6_table *table;
3556 
3557 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3558 		if (table)
3559 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3560 	}
3561 
3562 	return rt6_get_dflt_router(net, gwaddr, dev);
3563 }
3564 
/* Delete every RA-learned router route in @table (unless the interface
 * is configured with accept_ra == 2, which means "accept RAs even while
 * forwarding").  ip6_del_rt() cannot run under RCU, so drop the lock for
 * each deletion and restart the walk from scratch.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	/* NB: the macro iterates with the local variable 'rt' */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3588 
3589 void rt6_purge_dflt_routers(struct net *net)
3590 {
3591 	struct fib6_table *table;
3592 	struct hlist_head *head;
3593 	unsigned int h;
3594 
3595 	rcu_read_lock();
3596 
3597 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3598 		head = &net->ipv6.fib_table_hash[h];
3599 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3600 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3601 				__rt6_purge_dflt_routers(net, table);
3602 		}
3603 	}
3604 
3605 	rcu_read_unlock();
3606 }
3607 
3608 static void rtmsg_to_fib6_config(struct net *net,
3609 				 struct in6_rtmsg *rtmsg,
3610 				 struct fib6_config *cfg)
3611 {
3612 	memset(cfg, 0, sizeof(*cfg));
3613 
3614 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3615 			 : RT6_TABLE_MAIN;
3616 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3617 	cfg->fc_metric = rtmsg->rtmsg_metric;
3618 	cfg->fc_expires = rtmsg->rtmsg_info;
3619 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3620 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
3621 	cfg->fc_flags = rtmsg->rtmsg_flags;
3622 	cfg->fc_type = rtmsg->rtmsg_type;
3623 
3624 	cfg->fc_nlinfo.nl_net = net;
3625 
3626 	cfg->fc_dst = rtmsg->rtmsg_dst;
3627 	cfg->fc_src = rtmsg->rtmsg_src;
3628 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
3629 }
3630 
3631 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3632 {
3633 	struct fib6_config cfg;
3634 	struct in6_rtmsg rtmsg;
3635 	int err;
3636 
3637 	switch (cmd) {
3638 	case SIOCADDRT:		/* Add a route */
3639 	case SIOCDELRT:		/* Delete a route */
3640 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3641 			return -EPERM;
3642 		err = copy_from_user(&rtmsg, arg,
3643 				     sizeof(struct in6_rtmsg));
3644 		if (err)
3645 			return -EFAULT;
3646 
3647 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3648 
3649 		rtnl_lock();
3650 		switch (cmd) {
3651 		case SIOCADDRT:
3652 			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3653 			break;
3654 		case SIOCDELRT:
3655 			err = ip6_route_del(&cfg, NULL);
3656 			break;
3657 		default:
3658 			err = -EINVAL;
3659 		}
3660 		rtnl_unlock();
3661 
3662 		return err;
3663 	}
3664 
3665 	return -EINVAL;
3666 }
3667 
3668 /*
3669  *	Drop the packet on the floor
3670  */
3671 
/* Drop @skb, bump the matching "no route" SNMP counter and send an ICMPv6
 * Destination Unreachable with @code back to the sender.  Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		/* Unspecified destination on input: account it as an
		 * address error rather than a missing route.
		 */
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3695 
/* Input-path drop: count as IPSTATS_MIB_INNOROUTES, ICMP "no route". */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3700 
/* Output-path drop: count as IPSTATS_MIB_OUTNOROUTES, ICMP "no route".
 * skb->dev is set to the dst device so the ICMP error has a source.
 */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3706 
/* Input-path drop with ICMP "administratively prohibited". */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3711 
/* Output-path drop with ICMP "administratively prohibited". */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3717 
3718 /*
3719  *	Allocate a dst for local (unicast / anycast) address.
3720  */
3721 
3722 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3723 				     struct inet6_dev *idev,
3724 				     const struct in6_addr *addr,
3725 				     bool anycast, gfp_t gfp_flags)
3726 {
3727 	u32 tb_id;
3728 	struct net_device *dev = idev->dev;
3729 	struct fib6_info *f6i;
3730 
3731 	f6i = fib6_info_alloc(gfp_flags);
3732 	if (!f6i)
3733 		return ERR_PTR(-ENOMEM);
3734 
3735 	f6i->dst_nocount = true;
3736 	f6i->dst_host = true;
3737 	f6i->fib6_protocol = RTPROT_KERNEL;
3738 	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3739 	if (anycast) {
3740 		f6i->fib6_type = RTN_ANYCAST;
3741 		f6i->fib6_flags |= RTF_ANYCAST;
3742 	} else {
3743 		f6i->fib6_type = RTN_LOCAL;
3744 		f6i->fib6_flags |= RTF_LOCAL;
3745 	}
3746 
3747 	f6i->fib6_nh.nh_gw = *addr;
3748 	dev_hold(dev);
3749 	f6i->fib6_nh.nh_dev = dev;
3750 	f6i->fib6_dst.addr = *addr;
3751 	f6i->fib6_dst.plen = 128;
3752 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3753 	f6i->fib6_table = fib6_get_table(net, tb_id);
3754 
3755 	return f6i;
3756 }
3757 
3758 /* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* device being cleaned, NULL matches any */
	struct net *net;	/* namespace owning the FIB */
	struct in6_addr *addr;	/* prefsrc address being removed */
};
3764 
3765 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3766 {
3767 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3768 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3769 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3770 
3771 	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3772 	    rt != net->ipv6.fib6_null_entry &&
3773 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3774 		spin_lock_bh(&rt6_exception_lock);
3775 		/* remove prefsrc entry */
3776 		rt->fib6_prefsrc.plen = 0;
3777 		spin_unlock_bh(&rt6_exception_lock);
3778 	}
3779 	return 0;
3780 }
3781 
3782 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3783 {
3784 	struct net *net = dev_net(ifp->idev->dev);
3785 	struct arg_dev_net_ip adni = {
3786 		.dev = ifp->idev->dev,
3787 		.net = net,
3788 		.addr = &ifp->addr,
3789 	};
3790 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3791 }
3792 
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	/* A non-zero return asks the fib walker to delete this route —
	 * NOTE(review): see fib6_clean_node() for the return-code protocol.
	 */
	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}
3813 
/* Remove RA-learned router routes via @gateway once it stops being a
 * router, and scrub matching exception-table entries.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3818 
/* Argument block shared by the fib6_ifup()/fib6_ifdown() fib walkers. */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;	/* flags to clear, used by rt6_sync_up() */
		unsigned long event;	/* netdev event, used by rt6_sync_down_dev() */
	};
};
3826 
/* Return the first route in @rt's fib6 node that can be an ECMP sibling
 * of @rt (same metric, qualifies for ECMP), or NULL.  Must be called with
 * the table write lock held — enforced via lockdep in each dereference.
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
3846 
3847 static bool rt6_is_dead(const struct fib6_info *rt)
3848 {
3849 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3850 	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3851 	     fib6_ignore_linkdown(rt)))
3852 		return true;
3853 
3854 	return false;
3855 }
3856 
3857 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3858 {
3859 	struct fib6_info *iter;
3860 	int total = 0;
3861 
3862 	if (!rt6_is_dead(rt))
3863 		total += rt->fib6_nh.nh_weight;
3864 
3865 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3866 		if (!rt6_is_dead(iter))
3867 			total += iter->fib6_nh.nh_weight;
3868 	}
3869 
3870 	return total;
3871 }
3872 
/* Assign @rt's hash upper bound for multipath selection.  @*weight is the
 * running sum of live-nexthop weights so far; the bound is the cumulative
 * fraction of @total scaled into the fixed-point [0, 2^31) space.  Dead
 * nexthops get -1 so they are never selected.
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}
3884 
/* Recompute the hash upper bound of every nexthop in @rt's multipath
 * group, accumulating the weights in list order starting from @rt.
 */
static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
	struct fib6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}
3895 
/* Rebalance the hash upper bounds of @rt's multipath group after a
 * nexthop changed state.  No-op for non-multipath routes or groups that
 * are about to be flushed.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
3919 
3920 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3921 {
3922 	const struct arg_netdev_event *arg = p_arg;
3923 	struct net *net = dev_net(arg->dev);
3924 
3925 	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3926 		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3927 		fib6_update_sernum_upto_root(net, rt);
3928 		rt6_multipath_rebalance(rt);
3929 	}
3930 
3931 	return 0;
3932 }
3933 
/* Clear @nh_flags on all routes using @dev, e.g. when the device comes
 * back up.  When reviving a dead nexthop and the carrier is up, the
 * linkdown flag is cleared in the same pass.
 */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
3948 
3949 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3950 				   const struct net_device *dev)
3951 {
3952 	struct fib6_info *iter;
3953 
3954 	if (rt->fib6_nh.nh_dev == dev)
3955 		return true;
3956 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3957 		if (iter->fib6_nh.nh_dev == dev)
3958 			return true;
3959 
3960 	return false;
3961 }
3962 
/* Mark @rt and all of its multipath siblings for deletion by the fib
 * walker (see the should_flush check in fib6_ifdown()).
 */
static void rt6_multipath_flush(struct fib6_info *rt)
{
	struct fib6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		iter->should_flush = 1;
}
3971 
3972 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3973 					     const struct net_device *down_dev)
3974 {
3975 	struct fib6_info *iter;
3976 	unsigned int dead = 0;
3977 
3978 	if (rt->fib6_nh.nh_dev == down_dev ||
3979 	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3980 		dead++;
3981 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3982 		if (iter->fib6_nh.nh_dev == down_dev ||
3983 		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3984 			dead++;
3985 
3986 	return dead;
3987 }
3988 
3989 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3990 				       const struct net_device *dev,
3991 				       unsigned int nh_flags)
3992 {
3993 	struct fib6_info *iter;
3994 
3995 	if (rt->fib6_nh.nh_dev == dev)
3996 		rt->fib6_nh.nh_flags |= nh_flags;
3997 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3998 		if (iter->fib6_nh.nh_dev == dev)
3999 			iter->fib6_nh.nh_flags |= nh_flags;
4000 }
4001 
/* called with write lock held for table with rt
 *
 * fib6_clean_all() callback for rt6_sync_down_dev().  Return values
 * follow the fib walker's protocol: 0 keeps the route, -1 asks for its
 * deletion, -2 is walker-specific handling — NOTE(review): confirm the
 * exact -1/-2 semantics against fib6_clean_node().
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* Device is going away entirely: delete its routes. */
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			/* If every nexthop in the group is now dead, flush
			 * the whole group; otherwise mark the affected
			 * nexthops and rebalance the survivors.
			 */
			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* Carrier change: mark the nexthop link-down but keep it. */
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
4045 
/* Walk the FIB and update/delete routes using @dev according to the
 * netdev @event (see fib6_ifdown()).
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};

	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}
4057 
/* Tear down IPv6 routing state for @dev on @event: sync the FIB, flush
 * uncached routes referencing the device and purge its ndisc neighbour
 * entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4064 
/* Argument block for the rt6_mtu_change_route() fib walker. */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new device MTU */
};
4069 
/* fib6_clean_all() callback: propagate a device MTU change to routes on
 * that device (unless RTAX_MTU is locked) and to their cached exception
 * entries.  Always returns 0 (routes are never deleted here).
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* Lower the stored PMTU on decrease only if it tracked the
		 * old device MTU; always raise it on increase.
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4104 
/* Propagate a device MTU change to every route (and its cached
 * exceptions) using @dev.
 */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
4114 
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE. */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4134 
/* Parse an RTM_{NEW,DEL}ROUTE netlink message into @cfg.
 *
 * Returns 0 on success or a negative errno.  On success @cfg keeps raw
 * pointers into the message attributes (fc_mx, fc_mp, fc_encap), so the
 * skb must stay alive while @cfg is in use.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* All reject-style route types map onto RTF_REJECT. */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* The attribute only needs to carry the prefix bytes. */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* Unknown preference values fall back to medium. */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4260 
/* One pending nexthop while assembling a multipath route. */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route created for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop config (gw, ifindex, ...) */
	struct list_head next;		/* link in the rt6_nh_list */
};
4266 
4267 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4268 {
4269 	struct rt6_nh *nh;
4270 
4271 	list_for_each_entry(nh, rt6_nh_list, next) {
4272 		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4273 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4274 		        nh->r_cfg.fc_ifindex);
4275 	}
4276 }
4277 
4278 static int ip6_route_info_append(struct net *net,
4279 				 struct list_head *rt6_nh_list,
4280 				 struct fib6_info *rt,
4281 				 struct fib6_config *r_cfg)
4282 {
4283 	struct rt6_nh *nh;
4284 	int err = -EEXIST;
4285 
4286 	list_for_each_entry(nh, rt6_nh_list, next) {
4287 		/* check if fib6_info already exists */
4288 		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4289 			return err;
4290 	}
4291 
4292 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4293 	if (!nh)
4294 		return -ENOMEM;
4295 	nh->fib6_info = rt;
4296 	err = ip6_convert_metrics(net, rt, r_cfg);
4297 	if (err) {
4298 		kfree(nh);
4299 		return err;
4300 	}
4301 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4302 	list_add_tail(&nh->next, rt6_nh_list);
4303 
4304 	return 0;
4305 }
4306 
/* Send a single RTM_NEWROUTE notification for a multipath add/append,
 * anchored at the first nexthop of the route so userspace sees a
 * consistent dump.
 */
static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
4327 
4328 static int ip6_route_multipath_add(struct fib6_config *cfg,
4329 				   struct netlink_ext_ack *extack)
4330 {
4331 	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4332 	struct nl_info *info = &cfg->fc_nlinfo;
4333 	struct fib6_config r_cfg;
4334 	struct rtnexthop *rtnh;
4335 	struct fib6_info *rt;
4336 	struct rt6_nh *err_nh;
4337 	struct rt6_nh *nh, *nh_safe;
4338 	__u16 nlflags;
4339 	int remaining;
4340 	int attrlen;
4341 	int err = 1;
4342 	int nhn = 0;
4343 	int replace = (cfg->fc_nlinfo.nlh &&
4344 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4345 	LIST_HEAD(rt6_nh_list);
4346 
4347 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4348 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4349 		nlflags |= NLM_F_APPEND;
4350 
4351 	remaining = cfg->fc_mp_len;
4352 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4353 
4354 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
4355 	 * fib6_info structs per nexthop
4356 	 */
4357 	while (rtnh_ok(rtnh, remaining)) {
4358 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4359 		if (rtnh->rtnh_ifindex)
4360 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4361 
4362 		attrlen = rtnh_attrlen(rtnh);
4363 		if (attrlen > 0) {
4364 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4365 
4366 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4367 			if (nla) {
4368 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4369 				r_cfg.fc_flags |= RTF_GATEWAY;
4370 			}
4371 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4372 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4373 			if (nla)
4374 				r_cfg.fc_encap_type = nla_get_u16(nla);
4375 		}
4376 
4377 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4378 		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4379 		if (IS_ERR(rt)) {
4380 			err = PTR_ERR(rt);
4381 			rt = NULL;
4382 			goto cleanup;
4383 		}
4384 		if (!rt6_qualify_for_ecmp(rt)) {
4385 			err = -EINVAL;
4386 			NL_SET_ERR_MSG(extack,
4387 				       "Device only routes can not be added for IPv6 using the multipath API.");
4388 			fib6_info_release(rt);
4389 			goto cleanup;
4390 		}
4391 
4392 		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4393 
4394 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4395 					    rt, &r_cfg);
4396 		if (err) {
4397 			fib6_info_release(rt);
4398 			goto cleanup;
4399 		}
4400 
4401 		rtnh = rtnh_next(rtnh, &remaining);
4402 	}
4403 
4404 	/* for add and replace send one notification with all nexthops.
4405 	 * Skip the notification in fib6_add_rt2node and send one with
4406 	 * the full route when done
4407 	 */
4408 	info->skip_notify = 1;
4409 
4410 	err_nh = NULL;
4411 	list_for_each_entry(nh, &rt6_nh_list, next) {
4412 		err = __ip6_ins_rt(nh->fib6_info, info, extack);
4413 		fib6_info_release(nh->fib6_info);
4414 
4415 		if (!err) {
4416 			/* save reference to last route successfully inserted */
4417 			rt_last = nh->fib6_info;
4418 
4419 			/* save reference to first route for notification */
4420 			if (!rt_notif)
4421 				rt_notif = nh->fib6_info;
4422 		}
4423 
4424 		/* nh->fib6_info is used or freed at this point, reset to NULL*/
4425 		nh->fib6_info = NULL;
4426 		if (err) {
4427 			if (replace && nhn)
4428 				ip6_print_replace_route_err(&rt6_nh_list);
4429 			err_nh = nh;
4430 			goto add_errout;
4431 		}
4432 
4433 		/* Because each route is added like a single route we remove
4434 		 * these flags after the first nexthop: if there is a collision,
4435 		 * we have already failed to add the first nexthop:
4436 		 * fib6_add_rt2node() has rejected it; when replacing, old
4437 		 * nexthops have been replaced by first new, the rest should
4438 		 * be added to it.
4439 		 */
4440 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4441 						     NLM_F_REPLACE);
4442 		nhn++;
4443 	}
4444 
4445 	/* success ... tell user about new route */
4446 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4447 	goto cleanup;
4448 
4449 add_errout:
4450 	/* send notification for routes that were added so that
4451 	 * the delete notifications sent by ip6_route_del are
4452 	 * coherent
4453 	 */
4454 	if (rt_notif)
4455 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4456 
4457 	/* Delete routes that were already added */
4458 	list_for_each_entry(nh, &rt6_nh_list, next) {
4459 		if (err_nh == nh)
4460 			break;
4461 		ip6_route_del(&nh->r_cfg, extack);
4462 	}
4463 
4464 cleanup:
4465 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4466 		if (nh->fib6_info)
4467 			fib6_info_release(nh->fib6_info);
4468 		list_del(&nh->next);
4469 		kfree(nh);
4470 	}
4471 
4472 	return err;
4473 }
4474 
4475 static int ip6_route_multipath_del(struct fib6_config *cfg,
4476 				   struct netlink_ext_ack *extack)
4477 {
4478 	struct fib6_config r_cfg;
4479 	struct rtnexthop *rtnh;
4480 	int remaining;
4481 	int attrlen;
4482 	int err = 1, last_err = 0;
4483 
4484 	remaining = cfg->fc_mp_len;
4485 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4486 
4487 	/* Parse a Multipath Entry */
4488 	while (rtnh_ok(rtnh, remaining)) {
4489 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4490 		if (rtnh->rtnh_ifindex)
4491 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4492 
4493 		attrlen = rtnh_attrlen(rtnh);
4494 		if (attrlen > 0) {
4495 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4496 
4497 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4498 			if (nla) {
4499 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4500 				r_cfg.fc_flags |= RTF_GATEWAY;
4501 			}
4502 		}
4503 		err = ip6_route_del(&r_cfg, extack);
4504 		if (err)
4505 			last_err = err;
4506 
4507 		rtnh = rtnh_next(rtnh, &remaining);
4508 	}
4509 
4510 	return last_err;
4511 }
4512 
4513 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4514 			      struct netlink_ext_ack *extack)
4515 {
4516 	struct fib6_config cfg;
4517 	int err;
4518 
4519 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4520 	if (err < 0)
4521 		return err;
4522 
4523 	if (cfg.fc_mp)
4524 		return ip6_route_multipath_del(&cfg, extack);
4525 	else {
4526 		cfg.fc_delete_all_nh = 1;
4527 		return ip6_route_del(&cfg, extack);
4528 	}
4529 }
4530 
4531 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4532 			      struct netlink_ext_ack *extack)
4533 {
4534 	struct fib6_config cfg;
4535 	int err;
4536 
4537 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4538 	if (err < 0)
4539 		return err;
4540 
4541 	if (cfg.fc_mp)
4542 		return ip6_route_multipath_add(&cfg, extack);
4543 	else
4544 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
4545 }
4546 
4547 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4548 {
4549 	int nexthop_len = 0;
4550 
4551 	if (rt->fib6_nsiblings) {
4552 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4553 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4554 			    + nla_total_size(16) /* RTA_GATEWAY */
4555 			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4556 
4557 		nexthop_len *= rt->fib6_nsiblings;
4558 	}
4559 
4560 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4561 	       + nla_total_size(16) /* RTA_SRC */
4562 	       + nla_total_size(16) /* RTA_DST */
4563 	       + nla_total_size(16) /* RTA_GATEWAY */
4564 	       + nla_total_size(16) /* RTA_PREFSRC */
4565 	       + nla_total_size(4) /* RTA_TABLE */
4566 	       + nla_total_size(4) /* RTA_IIF */
4567 	       + nla_total_size(4) /* RTA_OIF */
4568 	       + nla_total_size(4) /* RTA_PRIORITY */
4569 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4570 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4571 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4572 	       + nla_total_size(1) /* RTA_PREF */
4573 	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4574 	       + nexthop_len;
4575 }
4576 
/* Append nexthop attributes for @rt to @skb and OR the nexthop's
 * RTNH_F_* state into @*flags.
 *
 * @skip_oif: skip RTA_OIF; used by the multipath encoding, where the
 *	      ifindex is carried in the enclosing rtnexthop header.
 *
 * Returns 0 on success, -EMSGSIZE if @skb has no room left.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		/* a linkdown nexthop is reported dead when the table is
		 * configured to ignore routes on linkdown devices
		 */
		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4615 
/* add multipath next hop: emit one rtnexthop header (hops, ifindex,
 * flags) followed by the nexthop attributes for @rt, then patch the
 * header's rtnh_len with the total encoded length.
 * Returns 0 on success, -EMSGSIZE if @skb has no room left.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	/* rtnh_hops carries weight - 1 on the wire */
	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4643 
/* Build one rtnetlink route message (type @type) for @rt into @skb.
 *
 * @dst:  optional dst_entry (a cached rt6_info clone); when set, the
 *	  dst/src keys, flags, metrics and expiry come from it instead
 *	  of @rt.
 * @dest: exact destination address (route-get replies); forces
 *	  rtm_dst_len to 128.
 * @src:  exact source address (subtree lookups); forces rtm_src_len
 *	  to 128.
 * @iif:  input interface index for route-get replies, 0 if none.
 *
 * Returns 0 on success, -EMSGSIZE if @skb ran out of room (any partial
 * message is cancelled).
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* prefer the cached clone's keys and flags when a dst was given */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* an explicit destination (route-get) always reports a /128 */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved via the mroute
		 * cache; ip6mr_get_route() fills the message itself
		 */
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* cached clones carry their own metrics; otherwise use the FIB's */
	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4794 
4795 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4796 {
4797 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4798 	struct net *net = arg->net;
4799 
4800 	if (rt == net->ipv6.fib6_null_entry)
4801 		return 0;
4802 
4803 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4804 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4805 
4806 		/* user wants prefix routes only */
4807 		if (rtm->rtm_flags & RTM_F_PREFIX &&
4808 		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
4809 			/* success since this is not a prefix route */
4810 			return 1;
4811 		}
4812 	}
4813 
4814 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4815 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4816 			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4817 }
4818 
4819 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4820 			      struct netlink_ext_ack *extack)
4821 {
4822 	struct net *net = sock_net(in_skb->sk);
4823 	struct nlattr *tb[RTA_MAX+1];
4824 	int err, iif = 0, oif = 0;
4825 	struct fib6_info *from;
4826 	struct dst_entry *dst;
4827 	struct rt6_info *rt;
4828 	struct sk_buff *skb;
4829 	struct rtmsg *rtm;
4830 	struct flowi6 fl6;
4831 	bool fibmatch;
4832 
4833 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4834 			  extack);
4835 	if (err < 0)
4836 		goto errout;
4837 
4838 	err = -EINVAL;
4839 	memset(&fl6, 0, sizeof(fl6));
4840 	rtm = nlmsg_data(nlh);
4841 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4842 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4843 
4844 	if (tb[RTA_SRC]) {
4845 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4846 			goto errout;
4847 
4848 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4849 	}
4850 
4851 	if (tb[RTA_DST]) {
4852 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4853 			goto errout;
4854 
4855 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4856 	}
4857 
4858 	if (tb[RTA_IIF])
4859 		iif = nla_get_u32(tb[RTA_IIF]);
4860 
4861 	if (tb[RTA_OIF])
4862 		oif = nla_get_u32(tb[RTA_OIF]);
4863 
4864 	if (tb[RTA_MARK])
4865 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4866 
4867 	if (tb[RTA_UID])
4868 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4869 					   nla_get_u32(tb[RTA_UID]));
4870 	else
4871 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4872 
4873 	if (tb[RTA_SPORT])
4874 		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4875 
4876 	if (tb[RTA_DPORT])
4877 		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4878 
4879 	if (tb[RTA_IP_PROTO]) {
4880 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4881 						  &fl6.flowi6_proto, extack);
4882 		if (err)
4883 			goto errout;
4884 	}
4885 
4886 	if (iif) {
4887 		struct net_device *dev;
4888 		int flags = 0;
4889 
4890 		rcu_read_lock();
4891 
4892 		dev = dev_get_by_index_rcu(net, iif);
4893 		if (!dev) {
4894 			rcu_read_unlock();
4895 			err = -ENODEV;
4896 			goto errout;
4897 		}
4898 
4899 		fl6.flowi6_iif = iif;
4900 
4901 		if (!ipv6_addr_any(&fl6.saddr))
4902 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4903 
4904 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4905 
4906 		rcu_read_unlock();
4907 	} else {
4908 		fl6.flowi6_oif = oif;
4909 
4910 		dst = ip6_route_output(net, NULL, &fl6);
4911 	}
4912 
4913 
4914 	rt = container_of(dst, struct rt6_info, dst);
4915 	if (rt->dst.error) {
4916 		err = rt->dst.error;
4917 		ip6_rt_put(rt);
4918 		goto errout;
4919 	}
4920 
4921 	if (rt == net->ipv6.ip6_null_entry) {
4922 		err = rt->dst.error;
4923 		ip6_rt_put(rt);
4924 		goto errout;
4925 	}
4926 
4927 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4928 	if (!skb) {
4929 		ip6_rt_put(rt);
4930 		err = -ENOBUFS;
4931 		goto errout;
4932 	}
4933 
4934 	skb_dst_set(skb, &rt->dst);
4935 
4936 	rcu_read_lock();
4937 	from = rcu_dereference(rt->from);
4938 
4939 	if (fibmatch)
4940 		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4941 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4942 				    nlh->nlmsg_seq, 0);
4943 	else
4944 		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4945 				    &fl6.saddr, iif, RTM_NEWROUTE,
4946 				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4947 				    0);
4948 	rcu_read_unlock();
4949 
4950 	if (err < 0) {
4951 		kfree_skb(skb);
4952 		goto errout;
4953 	}
4954 
4955 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4956 errout:
4957 	return err;
4958 }
4959 
4960 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4961 		     unsigned int nlm_flags)
4962 {
4963 	struct sk_buff *skb;
4964 	struct net *net = info->nl_net;
4965 	u32 seq;
4966 	int err;
4967 
4968 	err = -ENOBUFS;
4969 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4970 
4971 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4972 	if (!skb)
4973 		goto errout;
4974 
4975 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4976 			    event, info->portid, seq, nlm_flags);
4977 	if (err < 0) {
4978 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4979 		WARN_ON(err == -EMSGSIZE);
4980 		kfree_skb(skb);
4981 		goto errout;
4982 	}
4983 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4984 		    info->nlh, gfp_any());
4985 	return;
4986 errout:
4987 	if (err < 0)
4988 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4989 }
4990 
/* netdevice notifier: wire the per-netns special routes (null, and with
 * multiple tables also prohibit/blackhole) to the loopback device when
 * it registers, and drop their idev references when it unregisters.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	/* only the loopback device backs the special route entries */
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
5024 
5025 /*
5026  *	/proc
5027  */
5028 
5029 #ifdef CONFIG_PROC_FS
/* /proc/net/rt6_stats: one line of hex counters describing the state of
 * this netns's IPv6 FIB (nodes, routes, cache entries, dst entries, ...).
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
5044 #endif	/* CONFIG_PROC_FS */
5045 
5046 #ifdef CONFIG_SYSCTL
5047 
5048 static
5049 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5050 			      void __user *buffer, size_t *lenp, loff_t *ppos)
5051 {
5052 	struct net *net;
5053 	int delay;
5054 	if (!write)
5055 		return -EINVAL;
5056 
5057 	net = (struct net *)ctl->extra1;
5058 	delay = net->ipv6.sysctl.flush_delay;
5059 	proc_dointvec(ctl, write, buffer, lenp, ppos);
5060 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5061 	return 0;
5062 }
5063 
/* Template for the per-netns /proc/sys/net/ipv6/route/ table.
 *
 * NOTE: ipv6_route_sysctl_init() rewrites the .data pointers by array
 * *index* (table[0] .. table[9]); do not reorder or insert entries
 * without updating that function to match.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same variable as gc_min_interval, exposed in ms */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
5137 
/* Clone ipv6_route_table_template for netns @net, retargeting each
 * entry's .data at the per-netns variable.  The indices below must stay
 * in sync with the template's entry order.
 * Returns the table (caller registers/frees it) or NULL on allocation
 * failure.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
5166 #endif
5167 
/* Per-netns setup: dst ops, the special route entries (null, and with
 * multiple tables also prohibit/blackhole) and the routing sysctl
 * defaults.  On failure, everything allocated so far is unwound via the
 * goto chain below.  Returns 0 or -ENOMEM.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* default sysctl values; overridable via /proc/sys/net/ipv6/route */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5242 
/* Per-netns teardown: free the special route entries and the dst entry
 * counter; mirrors ip6_route_net_init().
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5253 
/* Late per-netns init: create the /proc/net/ipv6_route and
 * /proc/net/rt6_stats entries (proc creation failures are tolerated).
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			rt6_stats_seq_show, NULL);
#endif
	return 0;
}
5264 
/* Late per-netns teardown: remove the proc entries created above. */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5272 
/* Core per-netns routing state (dst ops, special entries, sysctls). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5277 
5278 static int __net_init ipv6_inetpeer_init(struct net *net)
5279 {
5280 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5281 
5282 	if (!bp)
5283 		return -ENOMEM;
5284 	inet_peer_base_init(bp);
5285 	net->ipv6.peers = bp;
5286 	return 0;
5287 }
5288 
/* Tear down the per-netns inetpeer base: detach it first, then drop all
 * cached peers before freeing.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
5297 
/* Per-netns inetpeer storage lifecycle. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5302 
/* Late-stage per-netns setup (proc entries); runs after the core ops. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5307 
/* Runs after addrconf's notifier (lower priority) so idev exists when
 * the special entries are wired up.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5312 
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
5328 
/* Boot-time initialization of the IPv6 routing subsystem: dst cache,
 * pernet state, fib6 core, xfrm/policy glue, rtnetlink handlers and the
 * device notifier.  Failures unwind in strict reverse order via the
 * goto chain at the bottom.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* blackhole dsts come from the same slab as regular ones */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* all rtnl registrations share one error path: on any failure,
	 * rtnl_unregister_all() below removes whatever was registered
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* per-CPU lists of uncached (dst-only) routes */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5420 
/* Module unload: undo ip6_route_init() in reverse order. */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
5433