xref: /linux/net/ipv6/route.c (revision 402cb8dda949d9b8c0df20ad2527d139faad7ca1)
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

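/* The uncached list tracks rt6_info entries that live outside the fib
 * tree (e.g. the RTF_CACHE clones created for FLOWI_FLAG_KNOWN_NH in
 * ip6_pol_route()), so that rt6_uncached_list_flush_dev() can retarget
 * them to the loopback device when their device goes away.
 */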
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(&rt->from->dst);
}

static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);
	else if (rt->rt6i_flags & RTF_CACHE)
		return NULL;
	else
		return dst_cow_metrics_generic(dst, old);
}

static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
{
	struct in6_addr *p = &rt->rt6i_gateway;

	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}

struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (!rt->rt6i_pcpu) {
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	rt->from = NULL;
	dst_release(&from->dst);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			rt6_check_expired(rt->from);
	}
	return false;
}

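/* Pick one route out of a set of multipath siblings.  Each sibling
 * carries an upper bound in the hash space (rt6i_nh_upper_bound); the
 * first sibling whose bound covers the flow hash is chosen, so a given
 * flow keeps mapping onto the same nexthop.
 */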
static struct rt6_info *rt6_multipath_select(const struct net *net,
					     struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     const struct sk_buff *skb,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In
	 * such a case it will always be non-zero. Otherwise now is the
	 * time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
				 rt6i_siblings) {
		if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
		struct net_device *dev = sprt->dst.dev;

		if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}

	return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now; however, we need to check if it
	 * really is so, aka Router Reachability Probing.
	 *
	 * A Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;
	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;

	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
		goto out;

	if (idev->cnf.ignore_routes_with_linkdown &&
	    rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

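/* Scan the routes sharing the metric of rr_head, starting at rr_head
 * and wrapping around via the leaf, and return the best-scoring one.
 * Routes with a different metric ("cont") are only considered when no
 * same-metric route matched.
 */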
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

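/* Select the best route in fn for this lookup.  When the chosen router
 * is only probably reachable, find_match() sets do_rr and fn->rr_ptr is
 * advanced so that subsequent lookups round-robin over the same-metric
 * routes (see the changelog at the top of this file).
 */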
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.ip6_null_entry)
		return net->ipv6.ip6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (this might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rcu_dereference(rt0->rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.ip6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* ipv6_addr_prefix() copies only prefix_len bits, so
		 * this is safe even for a truncated prefix field
		 */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif

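/* Walk back up the trie after a failed lookup: climb to the parent
 * node, descending into its source-address subtree first when one
 * exists, until a node carrying routes (RTN_RTINFO) or the tree root
 * is reached.
 */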
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

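/* Try to take a reference on rt.  If rt is already being released,
 * either fall back to a held ip6_null_entry (null_fallback) or clear
 * *prt; true is returned only when the original rt was successfully
 * held.
 */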
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
		rt = net->ipv6.ip6_null_entry;
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
			rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
						  skb, flags);
	}
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any
 * reason, the route is released.
 * The caller must hold a dst reference before calling it.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}

/* called with rcu_read_lock() held */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device itself if it is an L3 master device, the master
		 * device if the device is enslaved, and the loopback
		 * device as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* the remaining case is netif_is_l3_master(dev) being
		 * true, in which case dev itself is what we want returned
		 */
	}

	return dev;
}

static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() held */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
		rt6_dst_from_metrics_check(pcpu_rt);

	return pcpu_rt;
}

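/* Install a freshly allocated per-cpu copy of rt.  cmpxchg() publishes
 * it into this cpu's slot, which must still be NULL here - the caller
 * found it empty and runs with bottom halves disabled - as the BUG_ON()
 * asserts.
 */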
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}

/* exception hash table implementation */
static DEFINE_SPINLOCK(rt6_exception_lock);
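/* rt6_exception_lock serializes writers of the exception tables, while
 * readers walk the hlist chains under rcu_read_lock(); compare
 * __rt6_find_exception_spinlock() with __rt6_find_exception_rcu() below.
 */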

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

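/* Hash the (dst, src) pair into one of the FIB6_EXCEPTION_BUCKET_SIZE
 * buckets.  The source address only contributes under
 * CONFIG_IPV6_SUBTREES, matching how the lookup helpers below choose
 * their src_key.
 */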
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(ort->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->rt6i_table->tb6_lock);
		fib6_update_sernum(ort);
		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find the cached rt in the hash table inside the passed-in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed-in cached rt from the hash table that contains it */
int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find the rt6_ex which contains the passed-in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->dst.from have already
			 * been updated.
			 */
			if (entry->rt6i_pmtu &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				entry->rt6i_pmtu = mtu;
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway
	 * exceptions even if others still hold references to them,
	 * so that on the next dst_check() such references can be
	 * dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned
	 * when expired, independently of their aging, as per RFC 8201
	 * section 4.
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt, *rt_cache;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(net, fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	/* Search through the exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (rt == net->ipv6.ip6_null_entry) {
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (rt->rt6i_flags & RTF_CACHE) {
		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
			rt6_dst_from_metrics_check(rt);
		}
		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create an RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look up the route here.
		 */

		struct rt6_info *uncached_rt;

		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
		} else {
			rcu_read_unlock();
			uncached_rt = rt;
			goto uncached_rt_out;
		}
		rcu_read_unlock();

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

uncached_rt_out:
		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		dst_use_noref(&rt->dst, jiffies);
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (!pcpu_rt) {
			/* atomic_inc_not_zero() is needed when using rcu */
			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
				/* No dst_hold() on rt is needed because grabbing
				 * rt->rt6i_ref makes sure rt can't be released.
				 */
				pcpu_rt = rt6_make_pcpu_route(rt);
				rt6_release(rt);
			} else {
				/* rt is already removed from tree */
				pcpu_rt = net->ipv6.ip6_null_entry;
				dst_hold(&pcpu_rt->dst);
			}
		}
		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

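/* Extract the L3 multipath keys for skb.  For ICMPv6 errors the keys
 * are taken from the inner (offending) header, so that error packets
 * hash onto the same path as the flow they refer to.
 */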
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowinfo(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

/* Compute the multipath hash for a flow.  If skb is set it will be used
 * and fl6 can be NULL.  Policy 0 hashes the L3 addresses and flow label;
 * policy 1 hashes the L4 five-tuple.
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
1927 
1928 void ip6_route_input(struct sk_buff *skb)
1929 {
1930 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1931 	struct net *net = dev_net(skb->dev);
1932 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1933 	struct ip_tunnel_info *tun_info;
1934 	struct flowi6 fl6 = {
1935 		.flowi6_iif = skb->dev->ifindex,
1936 		.daddr = iph->daddr,
1937 		.saddr = iph->saddr,
1938 		.flowlabel = ip6_flowinfo(iph),
1939 		.flowi6_mark = skb->mark,
1940 		.flowi6_proto = iph->nexthdr,
1941 	};
1942 	struct flow_keys *flkeys = NULL, _flkeys;
1943 
1944 	tun_info = skb_tunnel_info(skb);
1945 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1946 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1947 
1948 	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
1949 		flkeys = &_flkeys;
1950 
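	/* For ICMPv6 errors, ip6_multipath_l3_keys() hashes the embedded
	 * (inner) packet, so the error travels the same multipath leg as
	 * the flow that triggered it.
	 */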
1951 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1952 		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
1953 	skb_dst_drop(skb);
1954 	skb_dst_set(skb,
1955 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
1956 }
1957 
1958 static struct rt6_info *ip6_pol_route_output(struct net *net,
1959 					     struct fib6_table *table,
1960 					     struct flowi6 *fl6,
1961 					     const struct sk_buff *skb,
1962 					     int flags)
1963 {
1964 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
1965 }
1966 
1967 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1968 					 struct flowi6 *fl6, int flags)
1969 {
1970 	bool any_src;
1971 
1972 	if (rt6_need_strict(&fl6->daddr)) {
1973 		struct dst_entry *dst;
1974 
1975 		dst = l3mdev_link_scope_lookup(net, fl6);
1976 		if (dst)
1977 			return dst;
1978 	}
1979 
1980 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1981 
1982 	any_src = ipv6_addr_any(&fl6->saddr);
1983 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1984 	    (fl6->flowi6_oif && any_src))
1985 		flags |= RT6_LOOKUP_F_IFACE;
1986 
1987 	if (!any_src)
1988 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1989 	else if (sk)
1990 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1991 
1992 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
1993 }
1994 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1995 
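/* Clone dst_orig into a "blackhole" dst that keeps the original metrics
 * and addressing but silently discards anything sent through it.  This
 * lets a flow be parked without generating errors (presumably for the
 * xfrm layer, which needs such a dst while states are being resolved).
 */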
1996 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1997 {
1998 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1999 	struct net_device *loopback_dev = net->loopback_dev;
2000 	struct dst_entry *new = NULL;
2001 
2002 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2003 		       DST_OBSOLETE_DEAD, 0);
2004 	if (rt) {
2005 		rt6_info_init(rt);
2006 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2007 
2008 		new = &rt->dst;
2009 		new->__use = 1;
2010 		new->input = dst_discard;
2011 		new->output = dst_discard_out;
2012 
2013 		dst_copy_metrics(new, &ort->dst);
2014 
2015 		rt->rt6i_idev = in6_dev_get(loopback_dev);
2016 		rt->rt6i_gateway = ort->rt6i_gateway;
2017 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2018 		rt->rt6i_metric = 0;
2019 
2020 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2021 #ifdef CONFIG_IPV6_SUBTREES
2022 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2023 #endif
2024 	}
2025 
2026 	dst_release(dst_orig);
2027 	return new ? new : ERR_PTR(-ENOMEM);
2028 }
2029 
2030 /*
2031  *	Destination cache support functions
2032  */
2033 
2034 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
2035 {
2036 	if (rt->from &&
2037 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
2038 		dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
2039 }
2040 
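/* Validate a cached dst against @cookie: the cookie is a snapshot of the
 * FIB node's serial number taken when the dst was handed out, so a
 * mismatch (or an expired route) means the tree changed underneath us
 * and the entry can no longer be trusted.
 */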
2041 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2042 {
2043 	u32 rt_cookie = 0;
2044 
2045 	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
2046 		return NULL;
2047 
2048 	if (rt6_check_expired(rt))
2049 		return NULL;
2050 
2051 	return &rt->dst;
2052 }
2053 
2054 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2055 {
2056 	if (!__rt6_check_expired(rt) &&
2057 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2058 	    rt6_check(rt->from, cookie))
2059 		return &rt->dst;
2060 	else
2061 		return NULL;
2062 }
2063 
2064 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2065 {
2066 	struct rt6_info *rt;
2067 
2068 	rt = (struct rt6_info *) dst;
2069 
	/* All IPv6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK, which forces all validation calls down
	 * into this function.
2073 	 */
2074 
2075 	rt6_dst_from_metrics_check(rt);
2076 
2077 	if (rt->rt6i_flags & RTF_PCPU ||
2078 	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2079 		return rt6_dst_from_check(rt, cookie);
2080 	else
2081 		return rt6_check(rt, cookie);
2082 }
2083 
2084 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2085 {
2086 	struct rt6_info *rt = (struct rt6_info *) dst;
2087 
2088 	if (rt) {
2089 		if (rt->rt6i_flags & RTF_CACHE) {
2090 			if (rt6_check_expired(rt)) {
2091 				ip6_del_rt(rt);
2092 				dst = NULL;
2093 			}
2094 		} else {
2095 			dst_release(dst);
2096 			dst = NULL;
2097 		}
2098 	}
2099 	return dst;
2100 }
2101 
2102 static void ip6_link_failure(struct sk_buff *skb)
2103 {
2104 	struct rt6_info *rt;
2105 
2106 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2107 
2108 	rt = (struct rt6_info *) skb_dst(skb);
2109 	if (rt) {
2110 		if (rt->rt6i_flags & RTF_CACHE) {
2111 			if (dst_hold_safe(&rt->dst))
2112 				ip6_del_rt(rt);
2113 		} else {
2114 			struct fib6_node *fn;
2115 
2116 			rcu_read_lock();
2117 			fn = rcu_dereference(rt->rt6i_node);
2118 			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2119 				fn->fn_sernum = -1;
2120 			rcu_read_unlock();
2121 		}
2122 	}
2123 }
2124 
2125 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2126 {
2127 	struct net *net = dev_net(rt->dst.dev);
2128 
2129 	rt->rt6i_flags |= RTF_MODIFIED;
2130 	rt->rt6i_pmtu = mtu;
2131 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2132 }
2133 
2134 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2135 {
2136 	return !(rt->rt6i_flags & RTF_CACHE) &&
2137 		(rt->rt6i_flags & RTF_PCPU ||
2138 		 rcu_access_pointer(rt->rt6i_node));
2139 }
2140 
2141 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2142 				 const struct ipv6hdr *iph, u32 mtu)
2143 {
2144 	const struct in6_addr *daddr, *saddr;
2145 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2146 
2147 	if (rt6->rt6i_flags & RTF_LOCAL)
2148 		return;
2149 
2150 	if (dst_metric_locked(dst, RTAX_MTU))
2151 		return;
2152 
2153 	if (iph) {
2154 		daddr = &iph->daddr;
2155 		saddr = &iph->saddr;
2156 	} else if (sk) {
2157 		daddr = &sk->sk_v6_daddr;
2158 		saddr = &inet6_sk(sk)->saddr;
2159 	} else {
2160 		daddr = NULL;
2161 		saddr = NULL;
2162 	}
2163 	dst_confirm_neigh(dst, daddr);
2164 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2165 	if (mtu >= dst_mtu(dst))
2166 		return;
2167 
2168 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2169 		rt6_do_update_pmtu(rt6, mtu);
2170 		/* update rt6_ex->stamp for cache */
2171 		if (rt6->rt6i_flags & RTF_CACHE)
2172 			rt6_update_exception_stamp_rt(rt6);
2173 	} else if (daddr) {
2174 		struct rt6_info *nrt6;
2175 
2176 		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2177 		if (nrt6) {
2178 			rt6_do_update_pmtu(nrt6, mtu);
2179 			if (rt6_insert_exception(nrt6, rt6))
2180 				dst_release_immediate(&nrt6->dst);
2181 		}
2182 	}
2183 }
2184 
2185 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2186 			       struct sk_buff *skb, u32 mtu)
2187 {
2188 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2189 }
2190 
2191 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2192 		     int oif, u32 mark, kuid_t uid)
2193 {
2194 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2195 	struct dst_entry *dst;
2196 	struct flowi6 fl6;
2197 
2198 	memset(&fl6, 0, sizeof(fl6));
2199 	fl6.flowi6_oif = oif;
2200 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2201 	fl6.daddr = iph->daddr;
2202 	fl6.saddr = iph->saddr;
2203 	fl6.flowlabel = ip6_flowinfo(iph);
2204 	fl6.flowi6_uid = uid;
2205 
2206 	dst = ip6_route_output(net, NULL, &fl6);
2207 	if (!dst->error)
2208 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2209 	dst_release(dst);
2210 }
2211 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2212 
2213 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2214 {
2215 	struct dst_entry *dst;
2216 
2217 	ip6_update_pmtu(skb, sock_net(sk), mtu,
2218 			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2219 
2220 	dst = __sk_dst_get(sk);
2221 	if (!dst || !dst->obsolete ||
2222 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2223 		return;
2224 
2225 	bh_lock_sock(sk);
2226 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2227 		ip6_datagram_dst_update(sk, false);
2228 	bh_unlock_sock(sk);
2229 }
2230 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2231 
2232 /* Handle redirects */
2233 struct ip6rd_flowi {
2234 	struct flowi6 fl6;
2235 	struct in6_addr gateway;
2236 };
2237 
2238 static struct rt6_info *__ip6_route_redirect(struct net *net,
2239 					     struct fib6_table *table,
2240 					     struct flowi6 *fl6,
2241 					     const struct sk_buff *skb,
2242 					     int flags)
2243 {
2244 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2245 	struct rt6_info *rt, *rt_cache;
2246 	struct fib6_node *fn;
2247 
2248 	/* Get the "current" route for this destination and
	 * check if the redirect has come from an appropriate router.
2250 	 *
2251 	 * RFC 4861 specifies that redirects should only be
2252 	 * accepted if they come from the nexthop to the target.
2253 	 * Due to the way the routes are chosen, this notion
2254 	 * is a bit fuzzy and one might need to check all possible
2255 	 * routes.
2256 	 */
2257 
2258 	rcu_read_lock();
2259 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2260 restart:
2261 	for_each_fib6_node_rt_rcu(fn) {
2262 		if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2263 			continue;
2264 		if (rt6_check_expired(rt))
2265 			continue;
2266 		if (rt->dst.error)
2267 			break;
2268 		if (!(rt->rt6i_flags & RTF_GATEWAY))
2269 			continue;
2270 		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2271 			continue;
2272 		/* rt_cache's gateway might be different from its 'parent'
2273 		 * in the case of an ip redirect.
2274 		 * So we keep searching in the exception table if the gateway
2275 		 * is different.
2276 		 */
2277 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2278 			rt_cache = rt6_find_cached_rt(rt,
2279 						      &fl6->daddr,
2280 						      &fl6->saddr);
2281 			if (rt_cache &&
2282 			    ipv6_addr_equal(&rdfl->gateway,
2283 					    &rt_cache->rt6i_gateway)) {
2284 				rt = rt_cache;
2285 				break;
2286 			}
2287 			continue;
2288 		}
2289 		break;
2290 	}
2291 
2292 	if (!rt)
2293 		rt = net->ipv6.ip6_null_entry;
2294 	else if (rt->dst.error) {
2295 		rt = net->ipv6.ip6_null_entry;
2296 		goto out;
2297 	}
2298 
2299 	if (rt == net->ipv6.ip6_null_entry) {
2300 		fn = fib6_backtrack(fn, &fl6->saddr);
2301 		if (fn)
2302 			goto restart;
2303 	}
2304 
2305 out:
2306 	ip6_hold_safe(net, &rt, true);
2307 
2308 	rcu_read_unlock();
2309 
2310 	trace_fib6_table_lookup(net, rt, table, fl6);
2311 	return rt;
2312 };
2313 
2314 static struct dst_entry *ip6_route_redirect(struct net *net,
2315 					    const struct flowi6 *fl6,
2316 					    const struct sk_buff *skb,
2317 					    const struct in6_addr *gateway)
2318 {
2319 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2320 	struct ip6rd_flowi rdfl;
2321 
2322 	rdfl.fl6 = *fl6;
2323 	rdfl.gateway = *gateway;
2324 
2325 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2326 				flags, __ip6_route_redirect);
2327 }
2328 
2329 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2330 		  kuid_t uid)
2331 {
2332 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2333 	struct dst_entry *dst;
2334 	struct flowi6 fl6;
2335 
2336 	memset(&fl6, 0, sizeof(fl6));
2337 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2338 	fl6.flowi6_oif = oif;
2339 	fl6.flowi6_mark = mark;
2340 	fl6.daddr = iph->daddr;
2341 	fl6.saddr = iph->saddr;
2342 	fl6.flowlabel = ip6_flowinfo(iph);
2343 	fl6.flowi6_uid = uid;
2344 
2345 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2346 	rt6_do_redirect(dst, NULL, skb);
2347 	dst_release(dst);
2348 }
2349 EXPORT_SYMBOL_GPL(ip6_redirect);
2350 
2351 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2352 			    u32 mark)
2353 {
2354 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2355 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2356 	struct dst_entry *dst;
2357 	struct flowi6 fl6;
2358 
2359 	memset(&fl6, 0, sizeof(fl6));
2360 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2361 	fl6.flowi6_oif = oif;
2362 	fl6.flowi6_mark = mark;
2363 	fl6.daddr = msg->dest;
2364 	fl6.saddr = iph->daddr;
2365 	fl6.flowi6_uid = sock_net_uid(net, NULL);
2366 
2367 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2368 	rt6_do_redirect(dst, NULL, skb);
2369 	dst_release(dst);
2370 }
2371 
2372 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2373 {
2374 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2375 		     sk->sk_uid);
2376 }
2377 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2378 
2379 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2380 {
2381 	struct net_device *dev = dst->dev;
2382 	unsigned int mtu = dst_mtu(dst);
2383 	struct net *net = dev_net(dev);
2384 
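	/* e.g. a 1500-byte MTU yields an advertised MSS of
	 * 1500 - 40 (IPv6 header) - 20 (TCP header) = 1440 bytes.
	 */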
2385 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2386 
2387 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2388 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2389 
2390 	/*
	 * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN, and the
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * Returning IPV6_MAXPLEN itself is also valid and means: "any
	 * MSS, rely only on pmtu discovery".
2395 	 */
2396 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2397 		mtu = IPV6_MAXPLEN;
2398 	return mtu;
2399 }
2400 
2401 static unsigned int ip6_mtu(const struct dst_entry *dst)
2402 {
2403 	const struct rt6_info *rt = (const struct rt6_info *)dst;
2404 	unsigned int mtu = rt->rt6i_pmtu;
2405 	struct inet6_dev *idev;
2406 
2407 	if (mtu)
2408 		goto out;
2409 
2410 	mtu = dst_metric_raw(dst, RTAX_MTU);
2411 	if (mtu)
2412 		goto out;
2413 
2414 	mtu = IPV6_MIN_MTU;
2415 
2416 	rcu_read_lock();
2417 	idev = __in6_dev_get(dst->dev);
2418 	if (idev)
2419 		mtu = idev->cnf.mtu6;
2420 	rcu_read_unlock();
2421 
2422 out:
2423 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2424 
2425 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2426 }
2427 
2428 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2429 				  struct flowi6 *fl6)
2430 {
2431 	struct dst_entry *dst;
2432 	struct rt6_info *rt;
2433 	struct inet6_dev *idev = in6_dev_get(dev);
2434 	struct net *net = dev_net(dev);
2435 
2436 	if (unlikely(!idev))
2437 		return ERR_PTR(-ENODEV);
2438 
2439 	rt = ip6_dst_alloc(net, dev, 0);
2440 	if (unlikely(!rt)) {
2441 		in6_dev_put(idev);
2442 		dst = ERR_PTR(-ENOMEM);
2443 		goto out;
2444 	}
2445 
2446 	rt->dst.flags |= DST_HOST;
2447 	rt->dst.input = ip6_input;
2448 	rt->dst.output  = ip6_output;
2449 	rt->rt6i_gateway  = fl6->daddr;
2450 	rt->rt6i_dst.addr = fl6->daddr;
2451 	rt->rt6i_dst.plen = 128;
2452 	rt->rt6i_idev     = idev;
2453 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2454 
	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * properly release the net_device
2457 	 */
2458 	rt6_uncached_list_add(rt);
2459 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2460 
2461 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2462 
2463 out:
2464 	return dst;
2465 }
2466 
2467 static int ip6_dst_gc(struct dst_ops *ops)
2468 {
2469 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2470 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2471 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2472 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2473 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2474 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2475 	int entries;
2476 
2477 	entries = dst_entries_get_fast(ops);
2478 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2479 	    entries <= rt_max_size)
2480 		goto out;
2481 
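	/* Each pass through here makes the next GC run more aggressive by
	 * bumping ip6_rt_gc_expire; if the slow entry count then drops
	 * below gc_thresh, the expiry is reset to half of ip6_rt_gc_timeout.
	 * The out: path decays ip6_rt_gc_expire by 1/2^elasticity per call.
	 */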
2482 	net->ipv6.ip6_rt_gc_expire++;
2483 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2484 	entries = dst_entries_get_slow(ops);
2485 	if (entries < ops->gc_thresh)
2486 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2487 out:
2488 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2489 	return entries > rt_max_size;
2490 }
2491 
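/* Convert the RTA_METRICS netlink attributes in @cfg into the RTAX_*
 * metrics array carried by @mxc: congestion-control algorithm names are
 * translated into keys, hop limits are clamped to 255, and a bit in
 * mx_valid records each metric that was actually supplied.
 */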
2492 static int ip6_convert_metrics(struct mx6_config *mxc,
2493 			       const struct fib6_config *cfg)
2494 {
2495 	struct net *net = cfg->fc_nlinfo.nl_net;
2496 	bool ecn_ca = false;
2497 	struct nlattr *nla;
2498 	int remaining;
2499 	u32 *mp;
2500 
2501 	if (!cfg->fc_mx)
2502 		return 0;
2503 
2504 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2505 	if (unlikely(!mp))
2506 		return -ENOMEM;
2507 
2508 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2509 		int type = nla_type(nla);
2510 		u32 val;
2511 
2512 		if (!type)
2513 			continue;
2514 		if (unlikely(type > RTAX_MAX))
2515 			goto err;
2516 
2517 		if (type == RTAX_CC_ALGO) {
2518 			char tmp[TCP_CA_NAME_MAX];
2519 
2520 			nla_strlcpy(tmp, nla, sizeof(tmp));
2521 			val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2522 			if (val == TCP_CA_UNSPEC)
2523 				goto err;
2524 		} else {
2525 			val = nla_get_u32(nla);
2526 		}
2527 		if (type == RTAX_HOPLIMIT && val > 255)
2528 			val = 255;
2529 		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2530 			goto err;
2531 
2532 		mp[type - 1] = val;
2533 		__set_bit(type - 1, mxc->mx_valid);
2534 	}
2535 
2536 	if (ecn_ca) {
2537 		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2538 		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2539 	}
2540 
2541 	mxc->mx = mp;
2542 	return 0;
2543  err:
2544 	kfree(mp);
2545 	return -EINVAL;
2546 }
2547 
2548 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2549 					    struct fib6_config *cfg,
2550 					    const struct in6_addr *gw_addr,
2551 					    u32 tbid, int flags)
2552 {
2553 	struct flowi6 fl6 = {
2554 		.flowi6_oif = cfg->fc_ifindex,
2555 		.daddr = *gw_addr,
2556 		.saddr = cfg->fc_prefsrc,
2557 	};
2558 	struct fib6_table *table;
2559 	struct rt6_info *rt;
2560 
2561 	table = fib6_get_table(net, tbid);
2562 	if (!table)
2563 		return NULL;
2564 
2565 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2566 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2567 
2568 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2569 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2570 
2571 	/* if table lookup failed, fall back to full lookup */
2572 	if (rt == net->ipv6.ip6_null_entry) {
2573 		ip6_rt_put(rt);
2574 		rt = NULL;
2575 	}
2576 
2577 	return rt;
2578 }
2579 
2580 static int ip6_route_check_nh_onlink(struct net *net,
2581 				     struct fib6_config *cfg,
2582 				     const struct net_device *dev,
2583 				     struct netlink_ext_ack *extack)
2584 {
2585 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2586 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2587 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2588 	struct rt6_info *grt;
2589 	int err;
2590 
2591 	err = 0;
2592 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2593 	if (grt) {
2594 		if (!grt->dst.error &&
2595 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2596 			NL_SET_ERR_MSG(extack,
2597 				       "Nexthop has invalid gateway or device mismatch");
2598 			err = -EINVAL;
2599 		}
2600 
2601 		ip6_rt_put(grt);
2602 	}
2603 
2604 	return err;
2605 }
2606 
2607 static int ip6_route_check_nh(struct net *net,
2608 			      struct fib6_config *cfg,
2609 			      struct net_device **_dev,
2610 			      struct inet6_dev **idev)
2611 {
2612 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2613 	struct net_device *dev = _dev ? *_dev : NULL;
2614 	struct rt6_info *grt = NULL;
2615 	int err = -EHOSTUNREACH;
2616 
2617 	if (cfg->fc_table) {
2618 		int flags = RT6_LOOKUP_F_IFACE;
2619 
2620 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2621 					  cfg->fc_table, flags);
2622 		if (grt) {
2623 			if (grt->rt6i_flags & RTF_GATEWAY ||
2624 			    (dev && dev != grt->dst.dev)) {
2625 				ip6_rt_put(grt);
2626 				grt = NULL;
2627 			}
2628 		}
2629 	}
2630 
2631 	if (!grt)
2632 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2633 
2634 	if (!grt)
2635 		goto out;
2636 
2637 	if (dev) {
2638 		if (dev != grt->dst.dev) {
2639 			ip6_rt_put(grt);
2640 			goto out;
2641 		}
2642 	} else {
2643 		*_dev = dev = grt->dst.dev;
2644 		*idev = grt->rt6i_idev;
2645 		dev_hold(dev);
2646 		in6_dev_hold(grt->rt6i_idev);
2647 	}
2648 
2649 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2650 		err = 0;
2651 
2652 	ip6_rt_put(grt);
2653 
2654 out:
2655 	return err;
2656 }
2657 
2658 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2659 			   struct net_device **_dev, struct inet6_dev **idev,
2660 			   struct netlink_ext_ack *extack)
2661 {
2662 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2663 	int gwa_type = ipv6_addr_type(gw_addr);
	bool skip_dev = !(gwa_type & IPV6_ADDR_LINKLOCAL);
2665 	const struct net_device *dev = *_dev;
2666 	bool need_addr_check = !dev;
2667 	int err = -EINVAL;
2668 
	/* if gw_addr is local we will fail to detect this in case the
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return the already-added prefix route via the interface
	 * the prefix route was assigned to, which might be non-loopback.
2673 	 */
2674 	if (dev &&
2675 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2676 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2677 		goto out;
2678 	}
2679 
2680 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using non-link-local
		 * addresses as nexthop addresses.
		 * Otherwise, the router will not be able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-style
		 * addressing.
2689 		 */
2690 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2691 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2692 			goto out;
2693 		}
2694 
2695 		if (cfg->fc_flags & RTNH_F_ONLINK)
2696 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2697 		else
2698 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2699 
2700 		if (err)
2701 			goto out;
2702 	}
2703 
2704 	/* reload in case device was changed */
2705 	dev = *_dev;
2706 
2707 	err = -EINVAL;
2708 	if (!dev) {
2709 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2710 		goto out;
2711 	} else if (dev->flags & IFF_LOOPBACK) {
2712 		NL_SET_ERR_MSG(extack,
2713 			       "Egress device can not be loopback device for this route");
2714 		goto out;
2715 	}
2716 
2717 	/* if we did not check gw_addr above, do so now that the
2718 	 * egress device has been resolved.
2719 	 */
2720 	if (need_addr_check &&
2721 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2722 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2723 		goto out;
2724 	}
2725 
2726 	err = 0;
2727 out:
2728 	return err;
2729 }
2730 
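/* Build (but do not insert) an rt6_info from a fib6_config.  Validation
 * failures return an ERR_PTR; the caller (e.g. ip6_route_add()) is
 * responsible for converting the metrics and inserting the finished
 * route into the FIB.
 */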
2731 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2732 					      struct netlink_ext_ack *extack)
2733 {
2734 	struct net *net = cfg->fc_nlinfo.nl_net;
2735 	struct rt6_info *rt = NULL;
2736 	struct net_device *dev = NULL;
2737 	struct inet6_dev *idev = NULL;
2738 	struct fib6_table *table;
2739 	int addr_type;
2740 	int err = -EINVAL;
2741 
2742 	/* RTF_PCPU is an internal flag; can not be set by userspace */
2743 	if (cfg->fc_flags & RTF_PCPU) {
2744 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2745 		goto out;
2746 	}
2747 
2748 	/* RTF_CACHE is an internal flag; can not be set by userspace */
2749 	if (cfg->fc_flags & RTF_CACHE) {
2750 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2751 		goto out;
2752 	}
2753 
2754 	if (cfg->fc_dst_len > 128) {
2755 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
2756 		goto out;
2757 	}
2758 	if (cfg->fc_src_len > 128) {
2759 		NL_SET_ERR_MSG(extack, "Invalid source address length");
2760 		goto out;
2761 	}
2762 #ifndef CONFIG_IPV6_SUBTREES
2763 	if (cfg->fc_src_len) {
2764 		NL_SET_ERR_MSG(extack,
2765 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
2766 		goto out;
2767 	}
2768 #endif
2769 	if (cfg->fc_ifindex) {
2770 		err = -ENODEV;
2771 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2772 		if (!dev)
2773 			goto out;
2774 		idev = in6_dev_get(dev);
2775 		if (!idev)
2776 			goto out;
2777 	}
2778 
2779 	if (cfg->fc_metric == 0)
2780 		cfg->fc_metric = IP6_RT_PRIO_USER;
2781 
2782 	if (cfg->fc_flags & RTNH_F_ONLINK) {
2783 		if (!dev) {
2784 			NL_SET_ERR_MSG(extack,
2785 				       "Nexthop device required for onlink");
2786 			err = -ENODEV;
2787 			goto out;
2788 		}
2789 
2790 		if (!(dev->flags & IFF_UP)) {
2791 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2792 			err = -ENETDOWN;
2793 			goto out;
2794 		}
2795 	}
2796 
2797 	err = -ENOBUFS;
2798 	if (cfg->fc_nlinfo.nlh &&
2799 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2800 		table = fib6_get_table(net, cfg->fc_table);
2801 		if (!table) {
2802 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2803 			table = fib6_new_table(net, cfg->fc_table);
2804 		}
2805 	} else {
2806 		table = fib6_new_table(net, cfg->fc_table);
2807 	}
2808 
2809 	if (!table)
2810 		goto out;
2811 
2812 	rt = ip6_dst_alloc(net, NULL,
2813 			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2814 
2815 	if (!rt) {
2816 		err = -ENOMEM;
2817 		goto out;
2818 	}
2819 
2820 	if (cfg->fc_flags & RTF_EXPIRES)
2821 		rt6_set_expires(rt, jiffies +
2822 				clock_t_to_jiffies(cfg->fc_expires));
2823 	else
2824 		rt6_clean_expires(rt);
2825 
2826 	if (cfg->fc_protocol == RTPROT_UNSPEC)
2827 		cfg->fc_protocol = RTPROT_BOOT;
2828 	rt->rt6i_protocol = cfg->fc_protocol;
2829 
2830 	addr_type = ipv6_addr_type(&cfg->fc_dst);
2831 
2832 	if (addr_type & IPV6_ADDR_MULTICAST)
2833 		rt->dst.input = ip6_mc_input;
2834 	else if (cfg->fc_flags & RTF_LOCAL)
2835 		rt->dst.input = ip6_input;
2836 	else
2837 		rt->dst.input = ip6_forward;
2838 
2839 	rt->dst.output = ip6_output;
2840 
2841 	if (cfg->fc_encap) {
2842 		struct lwtunnel_state *lwtstate;
2843 
2844 		err = lwtunnel_build_state(cfg->fc_encap_type,
2845 					   cfg->fc_encap, AF_INET6, cfg,
2846 					   &lwtstate, extack);
2847 		if (err)
2848 			goto out;
2849 		rt->dst.lwtstate = lwtstate_get(lwtstate);
2850 		lwtunnel_set_redirect(&rt->dst);
2851 	}
2852 
2853 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2854 	rt->rt6i_dst.plen = cfg->fc_dst_len;
2855 	if (rt->rt6i_dst.plen == 128)
2856 		rt->dst.flags |= DST_HOST;
2857 
2858 #ifdef CONFIG_IPV6_SUBTREES
2859 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2860 	rt->rt6i_src.plen = cfg->fc_src_len;
2861 #endif
2862 
2863 	rt->rt6i_metric = cfg->fc_metric;
2864 	rt->rt6i_nh_weight = 1;
2865 
	/* We cannot add true routes via loopback here;
	   they would result in kernel looping. Promote them to reject routes.
2868 	 */
2869 	if ((cfg->fc_flags & RTF_REJECT) ||
2870 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2871 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2872 	     !(cfg->fc_flags & RTF_LOCAL))) {
2873 		/* hold loopback dev/idev if we haven't done so. */
2874 		if (dev != net->loopback_dev) {
2875 			if (dev) {
2876 				dev_put(dev);
2877 				in6_dev_put(idev);
2878 			}
2879 			dev = net->loopback_dev;
2880 			dev_hold(dev);
2881 			idev = in6_dev_get(dev);
2882 			if (!idev) {
2883 				err = -ENODEV;
2884 				goto out;
2885 			}
2886 		}
2887 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2888 		switch (cfg->fc_type) {
2889 		case RTN_BLACKHOLE:
2890 			rt->dst.error = -EINVAL;
2891 			rt->dst.output = dst_discard_out;
2892 			rt->dst.input = dst_discard;
2893 			break;
2894 		case RTN_PROHIBIT:
2895 			rt->dst.error = -EACCES;
2896 			rt->dst.output = ip6_pkt_prohibit_out;
2897 			rt->dst.input = ip6_pkt_prohibit;
2898 			break;
2899 		case RTN_THROW:
2900 		case RTN_UNREACHABLE:
2901 		default:
2902 			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2903 					: (cfg->fc_type == RTN_UNREACHABLE)
2904 					? -EHOSTUNREACH : -ENETUNREACH;
2905 			rt->dst.output = ip6_pkt_discard_out;
2906 			rt->dst.input = ip6_pkt_discard;
2907 			break;
2908 		}
2909 		goto install_route;
2910 	}
2911 
2912 	if (cfg->fc_flags & RTF_GATEWAY) {
2913 		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2914 		if (err)
2915 			goto out;
2916 
2917 		rt->rt6i_gateway = cfg->fc_gateway;
2918 	}
2919 
2920 	err = -ENODEV;
2921 	if (!dev)
2922 		goto out;
2923 
2924 	if (idev->cnf.disable_ipv6) {
2925 		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
2926 		err = -EACCES;
2927 		goto out;
2928 	}
2929 
2930 	if (!(dev->flags & IFF_UP)) {
2931 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2932 		err = -ENETDOWN;
2933 		goto out;
2934 	}
2935 
2936 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2937 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2938 			NL_SET_ERR_MSG(extack, "Invalid source address");
2939 			err = -EINVAL;
2940 			goto out;
2941 		}
2942 		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2943 		rt->rt6i_prefsrc.plen = 128;
2944 	} else
2945 		rt->rt6i_prefsrc.plen = 0;
2946 
2947 	rt->rt6i_flags = cfg->fc_flags;
2948 
2949 install_route:
2950 	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2951 	    !netif_carrier_ok(dev))
2952 		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
2953 	rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
2954 	rt->dst.dev = dev;
2955 	rt->rt6i_idev = idev;
2956 	rt->rt6i_table = table;
2957 
2958 	cfg->fc_nlinfo.nl_net = dev_net(dev);
2959 
2960 	return rt;
2961 out:
2962 	if (dev)
2963 		dev_put(dev);
2964 	if (idev)
2965 		in6_dev_put(idev);
2966 	if (rt)
2967 		dst_release_immediate(&rt->dst);
2968 
2969 	return ERR_PTR(err);
2970 }
2971 
2972 int ip6_route_add(struct fib6_config *cfg,
2973 		  struct netlink_ext_ack *extack)
2974 {
2975 	struct mx6_config mxc = { .mx = NULL, };
2976 	struct rt6_info *rt;
2977 	int err;
2978 
2979 	rt = ip6_route_info_create(cfg, extack);
2980 	if (IS_ERR(rt)) {
2981 		err = PTR_ERR(rt);
2982 		rt = NULL;
2983 		goto out;
2984 	}
2985 
2986 	err = ip6_convert_metrics(&mxc, cfg);
2987 	if (err)
2988 		goto out;
2989 
2990 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2991 
2992 	kfree(mxc.mx);
2993 
2994 	return err;
2995 out:
2996 	if (rt)
2997 		dst_release_immediate(&rt->dst);
2998 
2999 	return err;
3000 }
3001 
3002 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
3003 {
3004 	int err;
3005 	struct fib6_table *table;
3006 	struct net *net = dev_net(rt->dst.dev);
3007 
3008 	if (rt == net->ipv6.ip6_null_entry) {
3009 		err = -ENOENT;
3010 		goto out;
3011 	}
3012 
3013 	table = rt->rt6i_table;
3014 	spin_lock_bh(&table->tb6_lock);
3015 	err = fib6_del(rt, info);
3016 	spin_unlock_bh(&table->tb6_lock);
3017 
3018 out:
3019 	ip6_rt_put(rt);
3020 	return err;
3021 }
3022 
3023 int ip6_del_rt(struct rt6_info *rt)
3024 {
3025 	struct nl_info info = {
3026 		.nl_net = dev_net(rt->dst.dev),
3027 	};
3028 	return __ip6_del_rt(rt, &info);
3029 }
3030 
3031 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
3032 {
3033 	struct nl_info *info = &cfg->fc_nlinfo;
3034 	struct net *net = info->nl_net;
3035 	struct sk_buff *skb = NULL;
3036 	struct fib6_table *table;
3037 	int err = -ENOENT;
3038 
3039 	if (rt == net->ipv6.ip6_null_entry)
3040 		goto out_put;
3041 	table = rt->rt6i_table;
3042 	spin_lock_bh(&table->tb6_lock);
3043 
3044 	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
3045 		struct rt6_info *sibling, *next_sibling;
3046 
3047 		/* prefer to send a single notification with all hops */
3048 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3049 		if (skb) {
3050 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3051 
3052 			if (rt6_fill_node(net, skb, rt,
3053 					  NULL, NULL, 0, RTM_DELROUTE,
3054 					  info->portid, seq, 0) < 0) {
3055 				kfree_skb(skb);
3056 				skb = NULL;
3057 			} else
3058 				info->skip_notify = 1;
3059 		}
3060 
3061 		list_for_each_entry_safe(sibling, next_sibling,
3062 					 &rt->rt6i_siblings,
3063 					 rt6i_siblings) {
3064 			err = fib6_del(sibling, info);
3065 			if (err)
3066 				goto out_unlock;
3067 		}
3068 	}
3069 
3070 	err = fib6_del(rt, info);
3071 out_unlock:
3072 	spin_unlock_bh(&table->tb6_lock);
3073 out_put:
3074 	ip6_rt_put(rt);
3075 
3076 	if (skb) {
3077 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3078 			    info->nlh, gfp_any());
3079 	}
3080 	return err;
3081 }
3082 
3083 static int ip6_route_del(struct fib6_config *cfg,
3084 			 struct netlink_ext_ack *extack)
3085 {
3086 	struct rt6_info *rt, *rt_cache;
3087 	struct fib6_table *table;
3088 	struct fib6_node *fn;
3089 	int err = -ESRCH;
3090 
3091 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3092 	if (!table) {
3093 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3094 		return err;
3095 	}
3096 
3097 	rcu_read_lock();
3098 
3099 	fn = fib6_locate(&table->tb6_root,
3100 			 &cfg->fc_dst, cfg->fc_dst_len,
3101 			 &cfg->fc_src, cfg->fc_src_len,
3102 			 !(cfg->fc_flags & RTF_CACHE));
3103 
3104 	if (fn) {
3105 		for_each_fib6_node_rt_rcu(fn) {
3106 			if (cfg->fc_flags & RTF_CACHE) {
3107 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3108 							      &cfg->fc_src);
3109 				if (!rt_cache)
3110 					continue;
3111 				rt = rt_cache;
3112 			}
3113 			if (cfg->fc_ifindex &&
3114 			    (!rt->dst.dev ||
3115 			     rt->dst.dev->ifindex != cfg->fc_ifindex))
3116 				continue;
3117 			if (cfg->fc_flags & RTF_GATEWAY &&
3118 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3119 				continue;
3120 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
3121 				continue;
3122 			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
3123 				continue;
3124 			if (!dst_hold_safe(&rt->dst))
3125 				break;
3126 			rcu_read_unlock();
3127 
			/* if a gateway was specified, only delete that one hop */
3129 			if (cfg->fc_flags & RTF_GATEWAY)
3130 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3131 
3132 			return __ip6_del_rt_siblings(rt, cfg);
3133 		}
3134 	}
3135 	rcu_read_unlock();
3136 
3137 	return err;
3138 }
3139 
3140 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3141 {
3142 	struct netevent_redirect netevent;
3143 	struct rt6_info *rt, *nrt = NULL;
3144 	struct ndisc_options ndopts;
3145 	struct inet6_dev *in6_dev;
3146 	struct neighbour *neigh;
3147 	struct rd_msg *msg;
3148 	int optlen, on_link;
3149 	u8 *lladdr;
3150 
3151 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3152 	optlen -= sizeof(*msg);
3153 
3154 	if (optlen < 0) {
3155 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3156 		return;
3157 	}
3158 
3159 	msg = (struct rd_msg *)icmp6_hdr(skb);
3160 
3161 	if (ipv6_addr_is_multicast(&msg->dest)) {
3162 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3163 		return;
3164 	}
3165 
3166 	on_link = 0;
3167 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3168 		on_link = 1;
3169 	} else if (ipv6_addr_type(&msg->target) !=
3170 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3171 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3172 		return;
3173 	}
3174 
3175 	in6_dev = __in6_dev_get(skb->dev);
3176 	if (!in6_dev)
3177 		return;
3178 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3179 		return;
3180 
3181 	/* RFC2461 8.1:
3182 	 *	The IP source address of the Redirect MUST be the same as the current
3183 	 *	first-hop router for the specified ICMP Destination Address.
3184 	 */
3185 
3186 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3187 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3188 		return;
3189 	}
3190 
3191 	lladdr = NULL;
3192 	if (ndopts.nd_opts_tgt_lladdr) {
3193 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3194 					     skb->dev);
3195 		if (!lladdr) {
3196 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3197 			return;
3198 		}
3199 	}
3200 
3201 	rt = (struct rt6_info *) dst;
3202 	if (rt->rt6i_flags & RTF_REJECT) {
3203 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3204 		return;
3205 	}
3206 
3207 	/* Redirect received -> path was valid.
3208 	 * Look, redirects are sent only in response to data packets,
3209 	 * so that this nexthop apparently is reachable. --ANK
3210 	 */
3211 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3212 
3213 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3214 	if (!neigh)
3215 		return;
3216 
3217 	/*
3218 	 *	We have finally decided to accept it.
3219 	 */
3220 
3221 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3222 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3223 		     NEIGH_UPDATE_F_OVERRIDE|
3224 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3225 				     NEIGH_UPDATE_F_ISROUTER)),
3226 		     NDISC_REDIRECT, &ndopts);
3227 
3228 	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3229 	if (!nrt)
3230 		goto out;
3231 
3232 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3233 	if (on_link)
3234 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3235 
3236 	nrt->rt6i_protocol = RTPROT_REDIRECT;
3237 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3238 
3239 	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * take care of it
3242 	 */
3243 	if (rt6_insert_exception(nrt, rt)) {
3244 		dst_release_immediate(&nrt->dst);
3245 		goto out;
3246 	}
3247 
3248 	netevent.old = &rt->dst;
3249 	netevent.new = &nrt->dst;
3250 	netevent.daddr = &msg->dest;
3251 	netevent.neigh = neigh;
3252 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3253 
3254 out:
3255 	neigh_release(neigh);
3256 }
3257 
3258 /*
3259  *	Misc support functions
3260  */
3261 
3262 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3263 {
3264 	BUG_ON(from->from);
3265 
3266 	rt->rt6i_flags &= ~RTF_EXPIRES;
3267 	dst_hold(&from->dst);
3268 	rt->from = from;
3269 	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3270 }
3271 
3272 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3273 {
3274 	rt->dst.input = ort->dst.input;
3275 	rt->dst.output = ort->dst.output;
3276 	rt->rt6i_dst = ort->rt6i_dst;
3277 	rt->dst.error = ort->dst.error;
3278 	rt->rt6i_idev = ort->rt6i_idev;
3279 	if (rt->rt6i_idev)
3280 		in6_dev_hold(rt->rt6i_idev);
3281 	rt->dst.lastuse = jiffies;
3282 	rt->rt6i_gateway = ort->rt6i_gateway;
3283 	rt->rt6i_flags = ort->rt6i_flags;
3284 	rt6_set_from(rt, ort);
3285 	rt->rt6i_metric = ort->rt6i_metric;
3286 #ifdef CONFIG_IPV6_SUBTREES
3287 	rt->rt6i_src = ort->rt6i_src;
3288 #endif
3289 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3290 	rt->rt6i_table = ort->rt6i_table;
3291 	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3292 }
3293 
3294 #ifdef CONFIG_IPV6_ROUTE_INFO
3295 static struct rt6_info *rt6_get_route_info(struct net *net,
3296 					   const struct in6_addr *prefix, int prefixlen,
3297 					   const struct in6_addr *gwaddr,
3298 					   struct net_device *dev)
3299 {
3300 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3301 	int ifindex = dev->ifindex;
3302 	struct fib6_node *fn;
3303 	struct rt6_info *rt = NULL;
3304 	struct fib6_table *table;
3305 
3306 	table = fib6_get_table(net, tb_id);
3307 	if (!table)
3308 		return NULL;
3309 
3310 	rcu_read_lock();
3311 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3312 	if (!fn)
3313 		goto out;
3314 
3315 	for_each_fib6_node_rt_rcu(fn) {
3316 		if (rt->dst.dev->ifindex != ifindex)
3317 			continue;
3318 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3319 			continue;
3320 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3321 			continue;
3322 		ip6_hold_safe(NULL, &rt, false);
3323 		break;
3324 	}
3325 out:
3326 	rcu_read_unlock();
3327 	return rt;
3328 }
3329 
3330 static struct rt6_info *rt6_add_route_info(struct net *net,
3331 					   const struct in6_addr *prefix, int prefixlen,
3332 					   const struct in6_addr *gwaddr,
3333 					   struct net_device *dev,
3334 					   unsigned int pref)
3335 {
3336 	struct fib6_config cfg = {
3337 		.fc_metric	= IP6_RT_PRIO_USER,
3338 		.fc_ifindex	= dev->ifindex,
3339 		.fc_dst_len	= prefixlen,
3340 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3341 				  RTF_UP | RTF_PREF(pref),
3342 		.fc_protocol = RTPROT_RA,
3343 		.fc_nlinfo.portid = 0,
3344 		.fc_nlinfo.nlh = NULL,
3345 		.fc_nlinfo.nl_net = net,
3346 	};
3347 
	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3349 	cfg.fc_dst = *prefix;
3350 	cfg.fc_gateway = *gwaddr;
3351 
3352 	/* We should treat it as a default route if prefix length is 0. */
3353 	if (!prefixlen)
3354 		cfg.fc_flags |= RTF_DEFAULT;
3355 
3356 	ip6_route_add(&cfg, NULL);
3357 
3358 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3359 }
3360 #endif
3361 
3362 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3363 {
3364 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3365 	struct rt6_info *rt;
3366 	struct fib6_table *table;
3367 
3368 	table = fib6_get_table(dev_net(dev), tb_id);
3369 	if (!table)
3370 		return NULL;
3371 
3372 	rcu_read_lock();
3373 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3374 		if (dev == rt->dst.dev &&
3375 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3376 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
3377 			break;
3378 	}
3379 	if (rt)
3380 		ip6_hold_safe(NULL, &rt, false);
3381 	rcu_read_unlock();
3382 	return rt;
3383 }
3384 
3385 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3386 				     struct net_device *dev,
3387 				     unsigned int pref)
3388 {
3389 	struct fib6_config cfg = {
3390 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3391 		.fc_metric	= IP6_RT_PRIO_USER,
3392 		.fc_ifindex	= dev->ifindex,
3393 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3394 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3395 		.fc_protocol = RTPROT_RA,
3396 		.fc_nlinfo.portid = 0,
3397 		.fc_nlinfo.nlh = NULL,
3398 		.fc_nlinfo.nl_net = dev_net(dev),
3399 	};
3400 
3401 	cfg.fc_gateway = *gwaddr;
3402 
3403 	if (!ip6_route_add(&cfg, NULL)) {
3404 		struct fib6_table *table;
3405 
3406 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3407 		if (table)
3408 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3409 	}
3410 
3411 	return rt6_get_dflt_router(gwaddr, dev);
3412 }
3413 
3414 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3415 {
3416 	struct rt6_info *rt;
3417 
3418 restart:
3419 	rcu_read_lock();
3420 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3421 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3422 		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3423 			if (dst_hold_safe(&rt->dst)) {
3424 				rcu_read_unlock();
3425 				ip6_del_rt(rt);
3426 			} else {
3427 				rcu_read_unlock();
3428 			}
3429 			goto restart;
3430 		}
3431 	}
3432 	rcu_read_unlock();
3433 
3434 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3435 }
3436 
3437 void rt6_purge_dflt_routers(struct net *net)
3438 {
3439 	struct fib6_table *table;
3440 	struct hlist_head *head;
3441 	unsigned int h;
3442 
3443 	rcu_read_lock();
3444 
3445 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3446 		head = &net->ipv6.fib_table_hash[h];
3447 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3448 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3449 				__rt6_purge_dflt_routers(table);
3450 		}
3451 	}
3452 
3453 	rcu_read_unlock();
3454 }
3455 
3456 static void rtmsg_to_fib6_config(struct net *net,
3457 				 struct in6_rtmsg *rtmsg,
3458 				 struct fib6_config *cfg)
3459 {
3460 	memset(cfg, 0, sizeof(*cfg));
3461 
3462 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3463 			 : RT6_TABLE_MAIN;
3464 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3465 	cfg->fc_metric = rtmsg->rtmsg_metric;
3466 	cfg->fc_expires = rtmsg->rtmsg_info;
3467 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3468 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
3469 	cfg->fc_flags = rtmsg->rtmsg_flags;
3470 
3471 	cfg->fc_nlinfo.nl_net = net;
3472 
3473 	cfg->fc_dst = rtmsg->rtmsg_dst;
3474 	cfg->fc_src = rtmsg->rtmsg_src;
3475 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
3476 }
3477 
3478 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3479 {
3480 	struct fib6_config cfg;
3481 	struct in6_rtmsg rtmsg;
3482 	int err;
3483 
3484 	switch (cmd) {
3485 	case SIOCADDRT:		/* Add a route */
3486 	case SIOCDELRT:		/* Delete a route */
3487 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3488 			return -EPERM;
3489 		err = copy_from_user(&rtmsg, arg,
3490 				     sizeof(struct in6_rtmsg));
3491 		if (err)
3492 			return -EFAULT;
3493 
3494 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3495 
3496 		rtnl_lock();
3497 		switch (cmd) {
3498 		case SIOCADDRT:
3499 			err = ip6_route_add(&cfg, NULL);
3500 			break;
3501 		case SIOCDELRT:
3502 			err = ip6_route_del(&cfg, NULL);
3503 			break;
3504 		default:
3505 			err = -EINVAL;
3506 		}
3507 		rtnl_unlock();
3508 
3509 		return err;
3510 	}
3511 
3512 	return -EINVAL;
3513 }
3514 
3515 /*
3516  *	Drop the packet on the floor
3517  */
3518 
3519 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3520 {
3521 	int type;
3522 	struct dst_entry *dst = skb_dst(skb);
3523 	switch (ipstats_mib_noroutes) {
3524 	case IPSTATS_MIB_INNOROUTES:
3525 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3526 		if (type == IPV6_ADDR_ANY) {
3527 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3528 				      IPSTATS_MIB_INADDRERRORS);
3529 			break;
3530 		}
3531 		/* FALLTHROUGH */
3532 	case IPSTATS_MIB_OUTNOROUTES:
3533 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3534 			      ipstats_mib_noroutes);
3535 		break;
3536 	}
3537 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3538 	kfree_skb(skb);
3539 	return 0;
3540 }
3541 
3542 static int ip6_pkt_discard(struct sk_buff *skb)
3543 {
3544 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3545 }
3546 
3547 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3548 {
3549 	skb->dev = skb_dst(skb)->dev;
3550 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3551 }
3552 
3553 static int ip6_pkt_prohibit(struct sk_buff *skb)
3554 {
3555 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3556 }
3557 
3558 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3559 {
3560 	skb->dev = skb_dst(skb)->dev;
3561 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3562 }
3563 
3564 /*
3565  *	Allocate a dst for local (unicast / anycast) address.
3566  */
3567 
3568 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3569 				    const struct in6_addr *addr,
3570 				    bool anycast)
3571 {
3572 	u32 tb_id;
3573 	struct net *net = dev_net(idev->dev);
3574 	struct net_device *dev = idev->dev;
3575 	struct rt6_info *rt;
3576 
3577 	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3578 	if (!rt)
3579 		return ERR_PTR(-ENOMEM);
3580 
3581 	in6_dev_hold(idev);
3582 
3583 	rt->dst.flags |= DST_HOST;
3584 	rt->dst.input = ip6_input;
3585 	rt->dst.output = ip6_output;
3586 	rt->rt6i_idev = idev;
3587 
3588 	rt->rt6i_protocol = RTPROT_KERNEL;
3589 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3590 	if (anycast)
3591 		rt->rt6i_flags |= RTF_ANYCAST;
3592 	else
3593 		rt->rt6i_flags |= RTF_LOCAL;
3594 
3595 	rt->rt6i_gateway  = *addr;
3596 	rt->rt6i_dst.addr = *addr;
3597 	rt->rt6i_dst.plen = 128;
3598 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3599 	rt->rt6i_table = fib6_get_table(net, tb_id);
3600 
3601 	return rt;
3602 }
3603 
/* remove a deleted IP from prefsrc entries */
3605 struct arg_dev_net_ip {
3606 	struct net_device *dev;
3607 	struct net *net;
3608 	struct in6_addr *addr;
3609 };
3610 
3611 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3612 {
3613 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3614 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3615 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3616 
	if ((rt->dst.dev == dev || !dev) &&
3618 	    rt != net->ipv6.ip6_null_entry &&
3619 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3620 		spin_lock_bh(&rt6_exception_lock);
3621 		/* remove prefsrc entry */
3622 		rt->rt6i_prefsrc.plen = 0;
3623 		/* need to update cache as well */
3624 		rt6_exceptions_remove_prefsrc(rt);
3625 		spin_unlock_bh(&rt6_exception_lock);
3626 	}
3627 	return 0;
3628 }
3629 
3630 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3631 {
3632 	struct net *net = dev_net(ifp->idev->dev);
3633 	struct arg_dev_net_ip adni = {
3634 		.dev = ifp->idev->dev,
3635 		.net = net,
3636 		.addr = &ifp->addr,
3637 	};
3638 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3639 }
3640 
3641 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3642 
/* Remove routers and update dst entries when a gateway turns into a host. */
3644 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3645 {
3646 	struct in6_addr *gateway = (struct in6_addr *)arg;
3647 
3648 	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3649 	    ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3650 		return -1;
3651 	}
3652 
3653 	/* Further clean up cached routes in exception table.
3654 	 * This is needed because cached route may have a different
3655 	 * gateway than its 'parent' in the case of an ip redirect.
3656 	 */
3657 	rt6_exceptions_clean_tohost(rt, gateway);
3658 
3659 	return 0;
3660 }
3661 
3662 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3663 {
3664 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3665 }
3666 
3667 struct arg_netdev_event {
3668 	const struct net_device *dev;
3669 	union {
3670 		unsigned int nh_flags;
3671 		unsigned long event;
3672 	};
3673 };
3674 
3675 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3676 {
3677 	struct rt6_info *iter;
3678 	struct fib6_node *fn;
3679 
3680 	fn = rcu_dereference_protected(rt->rt6i_node,
3681 			lockdep_is_held(&rt->rt6i_table->tb6_lock));
3682 	iter = rcu_dereference_protected(fn->leaf,
3683 			lockdep_is_held(&rt->rt6i_table->tb6_lock));
3684 	while (iter) {
3685 		if (iter->rt6i_metric == rt->rt6i_metric &&
3686 		    rt6_qualify_for_ecmp(iter))
3687 			return iter;
3688 		iter = rcu_dereference_protected(iter->rt6_next,
3689 				lockdep_is_held(&rt->rt6i_table->tb6_lock));
3690 	}
3691 
3692 	return NULL;
3693 }
3694 
3695 static bool rt6_is_dead(const struct rt6_info *rt)
3696 {
3697 	if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3698 	    (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3699 	     rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3700 		return true;
3701 
3702 	return false;
3703 }
3704 
3705 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3706 {
3707 	struct rt6_info *iter;
3708 	int total = 0;
3709 
3710 	if (!rt6_is_dead(rt))
3711 		total += rt->rt6i_nh_weight;
3712 
3713 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3714 		if (!rt6_is_dead(iter))
3715 			total += iter->rt6i_nh_weight;
3716 	}
3717 
3718 	return total;
3719 }
3720 
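/* Give rt its slice of the 31-bit hash space in proportion to its
 * weight.  E.g. with sibling weights 1 and 3 (total 4), the first
 * nexthop gets upper bound (1 << 31) / 4 - 1 and the second
 * (4 << 31) / 4 - 1; a flow hash at or below the first bound then maps
 * to nexthop one, anything above it to nexthop two.  Dead nexthops get
 * -1 and are never selected.
 */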
3721 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3722 {
3723 	int upper_bound = -1;
3724 
3725 	if (!rt6_is_dead(rt)) {
3726 		*weight += rt->rt6i_nh_weight;
3727 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3728 						    total) - 1;
3729 	}
3730 	atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3731 }
3732 
3733 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3734 {
3735 	struct rt6_info *iter;
3736 	int weight = 0;
3737 
3738 	rt6_upper_bound_set(rt, &weight, total);
3739 
3740 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3741 		rt6_upper_bound_set(iter, &weight, total);
3742 }
3743 
3744 void rt6_multipath_rebalance(struct rt6_info *rt)
3745 {
3746 	struct rt6_info *first;
3747 	int total;
3748 
3749 	/* In case the entire multipath route was marked for flushing,
3750 	 * then there is no need to rebalance upon the removal of every
3751 	 * sibling route.
3752 	 */
3753 	if (!rt->rt6i_nsiblings || rt->should_flush)
3754 		return;
3755 
3756 	/* During lookup routes are evaluated in order, so we need to
3757 	 * make sure upper bounds are assigned from the first sibling
3758 	 * onwards.
3759 	 */
3760 	first = rt6_multipath_first_sibling(rt);
3761 	if (WARN_ON_ONCE(!first))
3762 		return;
3763 
3764 	total = rt6_multipath_total_weight(first);
3765 	rt6_multipath_upper_bound_set(first, total);
3766 }
3767 
3768 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3769 {
3770 	const struct arg_netdev_event *arg = p_arg;
3771 	const struct net *net = dev_net(arg->dev);
3772 
3773 	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3774 		rt->rt6i_nh_flags &= ~arg->nh_flags;
3775 		fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3776 		rt6_multipath_rebalance(rt);
3777 	}
3778 
3779 	return 0;
3780 }
3781 
3782 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3783 {
3784 	struct arg_netdev_event arg = {
3785 		.dev = dev,
3786 		{
3787 			.nh_flags = nh_flags,
3788 		},
3789 	};
3790 
3791 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3792 		arg.nh_flags |= RTNH_F_LINKDOWN;
3793 
3794 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3795 }
3796 
3797 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3798 				   const struct net_device *dev)
3799 {
3800 	struct rt6_info *iter;
3801 
3802 	if (rt->dst.dev == dev)
3803 		return true;
3804 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3805 		if (iter->dst.dev == dev)
3806 			return true;
3807 
3808 	return false;
3809 }
3810 
3811 static void rt6_multipath_flush(struct rt6_info *rt)
3812 {
3813 	struct rt6_info *iter;
3814 
3815 	rt->should_flush = 1;
3816 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3817 		iter->should_flush = 1;
3818 }
3819 
3820 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3821 					     const struct net_device *down_dev)
3822 {
3823 	struct rt6_info *iter;
3824 	unsigned int dead = 0;
3825 
3826 	if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3827 		dead++;
3828 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3829 		if (iter->dst.dev == down_dev ||
3830 		    iter->rt6i_nh_flags & RTNH_F_DEAD)
3831 			dead++;
3832 
3833 	return dead;
3834 }
3835 
3836 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3837 				       const struct net_device *dev,
3838 				       unsigned int nh_flags)
3839 {
3840 	struct rt6_info *iter;
3841 
3842 	if (rt->dst.dev == dev)
3843 		rt->rt6i_nh_flags |= nh_flags;
3844 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3845 		if (iter->dst.dev == dev)
3846 			iter->rt6i_nh_flags |= nh_flags;
3847 }
3848 
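/* Walker callback used by rt6_sync_down_dev(): returning 0 keeps a
 * route and -1 asks the fib6 walker to delete it; -2 marks a multipath
 * group that was already handled as a unit (the -2 convention is
 * inferred from the walker's callers, not spelled out here).
 */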
/* called with the table lock held for the table containing rt */
3850 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3851 {
3852 	const struct arg_netdev_event *arg = p_arg;
3853 	const struct net_device *dev = arg->dev;
3854 	const struct net *net = dev_net(dev);
3855 
3856 	if (rt == net->ipv6.ip6_null_entry)
3857 		return 0;
3858 
3859 	switch (arg->event) {
3860 	case NETDEV_UNREGISTER:
3861 		return rt->dst.dev == dev ? -1 : 0;
3862 	case NETDEV_DOWN:
3863 		if (rt->should_flush)
3864 			return -1;
3865 		if (!rt->rt6i_nsiblings)
3866 			return rt->dst.dev == dev ? -1 : 0;
3867 		if (rt6_multipath_uses_dev(rt, dev)) {
3868 			unsigned int count;
3869 
3870 			count = rt6_multipath_dead_count(rt, dev);
3871 			if (rt->rt6i_nsiblings + 1 == count) {
3872 				rt6_multipath_flush(rt);
3873 				return -1;
3874 			}
3875 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3876 						   RTNH_F_LINKDOWN);
3877 			fib6_update_sernum(rt);
3878 			rt6_multipath_rebalance(rt);
3879 		}
3880 		return -2;
3881 	case NETDEV_CHANGE:
3882 		if (rt->dst.dev != dev ||
3883 		    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3884 			break;
3885 		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3886 		rt6_multipath_rebalance(rt);
3887 		break;
3888 	}
3889 
3890 	return 0;
3891 }
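
/* Worked example for the NETDEV_DOWN arm above: rt6i_nsiblings counts
 * the *other* legs of a multipath route, so a three-nexthop route has
 * rt6i_nsiblings == 2.  If two legs are already RTNH_F_DEAD and the
 * third uses the downed device, rt6_multipath_dead_count() returns
 * 3 == rt6i_nsiblings + 1 and the whole route is flushed; otherwise
 * only the legs through that device are marked dead and the weights
 * are rebalanced across the surviving nexthops.
 */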
3892 
3893 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3894 {
3895 	struct arg_netdev_event arg = {
3896 		.dev = dev,
3897 		{
3898 			.event = event,
3899 		},
3900 	};
3901 
3902 	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3903 }
3904 
3905 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3906 {
3907 	rt6_sync_down_dev(dev, event);
3908 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
3909 	neigh_ifdown(&nd_tbl, dev);
3910 }
3911 
3912 struct rt6_mtu_change_arg {
3913 	struct net_device *dev;
3914 	unsigned int mtu;
3915 };
3916 
3917 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3918 {
3919 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3920 	struct inet6_dev *idev;
3921 
3922 	/* In IPv6, PMTU discovery is not optional, so a locked
3923 	   RTAX_MTU metric cannot disable it.  We still use the lock
3924 	   to block changes caused by addrconf/ndisc.
3925 	*/
3927 
3928 	idev = __in6_dev_get(arg->dev);
3929 	if (!idev)
3930 		return 0;
3931 
3932 	/* An administrative MTU increase cannot be discovered by IPv6
3933 	   PMTU discovery, so an increased PMTU must be recorded here.
3934 	   RFC 1981 does not cover administrative MTU increases at all,
3935 	   which matters e.g. when enabling jumbo frames.
3936 	 */
3937 	if (rt->dst.dev == arg->dev &&
3938 	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3939 		spin_lock_bh(&rt6_exception_lock);
3940 		if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
3941 		    rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
3942 			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3943 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
3944 		spin_unlock_bh(&rt6_exception_lock);
3945 	}
3946 	return 0;
3947 }
3948 
3949 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3950 {
3951 	struct rt6_mtu_change_arg arg = {
3952 		.dev = dev,
3953 		.mtu = mtu,
3954 	};
3955 
3956 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3957 }
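
/* Caller sketch (an assumption; the notifier lives outside this
 * file): the NETDEV_CHANGEMTU event is the expected entry point,
 *
 *	case NETDEV_CHANGEMTU:
 *		rt6_mtu_change(dev, dev->mtu);
 *		break;
 *
 * so an administrative "ip link set eth0 mtu 9000" walks the FIB once
 * and lifts RTAX_MTU on every unlocked route through the device.
 */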
3958 
3959 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3960 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3961 	[RTA_OIF]               = { .type = NLA_U32 },
3962 	[RTA_IIF]		= { .type = NLA_U32 },
3963 	[RTA_PRIORITY]          = { .type = NLA_U32 },
3964 	[RTA_METRICS]           = { .type = NLA_NESTED },
3965 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
3966 	[RTA_PREF]              = { .type = NLA_U8 },
3967 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
3968 	[RTA_ENCAP]		= { .type = NLA_NESTED },
3969 	[RTA_EXPIRES]		= { .type = NLA_U32 },
3970 	[RTA_UID]		= { .type = NLA_U32 },
3971 	[RTA_MARK]		= { .type = NLA_U32 },
3972 };
3973 
3974 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3975 			      struct fib6_config *cfg,
3976 			      struct netlink_ext_ack *extack)
3977 {
3978 	struct rtmsg *rtm;
3979 	struct nlattr *tb[RTA_MAX+1];
3980 	unsigned int pref;
3981 	int err;
3982 
3983 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3984 			  NULL);
3985 	if (err < 0)
3986 		goto errout;
3987 
3988 	err = -EINVAL;
3989 	rtm = nlmsg_data(nlh);
3990 	memset(cfg, 0, sizeof(*cfg));
3991 
3992 	cfg->fc_table = rtm->rtm_table;
3993 	cfg->fc_dst_len = rtm->rtm_dst_len;
3994 	cfg->fc_src_len = rtm->rtm_src_len;
3995 	cfg->fc_flags = RTF_UP;
3996 	cfg->fc_protocol = rtm->rtm_protocol;
3997 	cfg->fc_type = rtm->rtm_type;
3998 
3999 	if (rtm->rtm_type == RTN_UNREACHABLE ||
4000 	    rtm->rtm_type == RTN_BLACKHOLE ||
4001 	    rtm->rtm_type == RTN_PROHIBIT ||
4002 	    rtm->rtm_type == RTN_THROW)
4003 		cfg->fc_flags |= RTF_REJECT;
4004 
4005 	if (rtm->rtm_type == RTN_LOCAL)
4006 		cfg->fc_flags |= RTF_LOCAL;
4007 
4008 	if (rtm->rtm_flags & RTM_F_CLONED)
4009 		cfg->fc_flags |= RTF_CACHE;
4010 
4011 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4012 
4013 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4014 	cfg->fc_nlinfo.nlh = nlh;
4015 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4016 
4017 	if (tb[RTA_GATEWAY]) {
4018 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4019 		cfg->fc_flags |= RTF_GATEWAY;
4020 	}
4021 
4022 	if (tb[RTA_DST]) {
4023 		int plen = (rtm->rtm_dst_len + 7) >> 3;
4024 
4025 		if (nla_len(tb[RTA_DST]) < plen)
4026 			goto errout;
4027 
4028 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4029 	}
4030 
4031 	if (tb[RTA_SRC]) {
4032 		int plen = (rtm->rtm_src_len + 7) >> 3;
4033 
4034 		if (nla_len(tb[RTA_SRC]) < plen)
4035 			goto errout;
4036 
4037 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4038 	}
4039 
4040 	if (tb[RTA_PREFSRC])
4041 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4042 
4043 	if (tb[RTA_OIF])
4044 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4045 
4046 	if (tb[RTA_PRIORITY])
4047 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4048 
4049 	if (tb[RTA_METRICS]) {
4050 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4051 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4052 	}
4053 
4054 	if (tb[RTA_TABLE])
4055 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4056 
4057 	if (tb[RTA_MULTIPATH]) {
4058 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4059 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4060 
4061 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4062 						     cfg->fc_mp_len, extack);
4063 		if (err < 0)
4064 			goto errout;
4065 	}
4066 
4067 	if (tb[RTA_PREF]) {
4068 		pref = nla_get_u8(tb[RTA_PREF]);
4069 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
4070 		    pref != ICMPV6_ROUTER_PREF_HIGH)
4071 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
4072 		cfg->fc_flags |= RTF_PREF(pref);
4073 	}
4074 
4075 	if (tb[RTA_ENCAP])
4076 		cfg->fc_encap = tb[RTA_ENCAP];
4077 
4078 	if (tb[RTA_ENCAP_TYPE]) {
4079 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4080 
4081 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4082 		if (err < 0)
4083 			goto errout;
4084 	}
4085 
4086 	if (tb[RTA_EXPIRES]) {
4087 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4088 
4089 		if (addrconf_finite_timeout(timeout)) {
4090 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4091 			cfg->fc_flags |= RTF_EXPIRES;
4092 		}
4093 	}
4094 
4095 	err = 0;
4096 errout:
4097 	return err;
4098 }
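
/* Attribute sketch (illustrative): a request such as
 *
 *	ip -6 route add 2001:db8::/64 via fe80::1 dev eth0 metric 512
 *
 * arrives here with rtm_dst_len = 64 plus RTA_DST (2001:db8::),
 * RTA_GATEWAY (fe80::1, which sets RTF_GATEWAY), RTA_OIF (eth0's
 * ifindex) and RTA_PRIORITY (512), filling the matching fc_* fields.
 */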
4099 
4100 struct rt6_nh {
4101 	struct rt6_info *rt6_info;
4102 	struct fib6_config r_cfg;
4103 	struct mx6_config mxc;
4104 	struct list_head next;
4105 };
4106 
4107 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4108 {
4109 	struct rt6_nh *nh;
4110 
4111 	list_for_each_entry(nh, rt6_nh_list, next) {
4112 		pr_warn("multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4113 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4114 		        nh->r_cfg.fc_ifindex);
4115 	}
4116 }
4117 
4118 static int ip6_route_info_append(struct list_head *rt6_nh_list,
4119 				 struct rt6_info *rt, struct fib6_config *r_cfg)
4120 {
4121 	struct rt6_nh *nh;
4122 	int err = -EEXIST;
4123 
4124 	list_for_each_entry(nh, rt6_nh_list, next) {
4125 		/* check if rt6_info already exists */
4126 		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4127 			return err;
4128 	}
4129 
4130 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4131 	if (!nh)
4132 		return -ENOMEM;
4133 	nh->rt6_info = rt;
4134 	err = ip6_convert_metrics(&nh->mxc, r_cfg);
4135 	if (err) {
4136 		kfree(nh);
4137 		return err;
4138 	}
4139 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4140 	list_add_tail(&nh->next, rt6_nh_list);
4141 
4142 	return 0;
4143 }
4144 
4145 static void ip6_route_mpath_notify(struct rt6_info *rt,
4146 				   struct rt6_info *rt_last,
4147 				   struct nl_info *info,
4148 				   __u16 nlflags)
4149 {
4150 	/* if this is an APPEND route, then rt points to the first route
4151 	 * inserted and rt_last points to the last route inserted. Userspace
4152 	 * wants a consistent dump of the route which starts at the first
4153 	 * nexthop. Since sibling routes are always added at the end of
4154 	 * the list, find the first sibling of the last route appended.
4155 	 */
4156 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4157 		rt = list_first_entry(&rt_last->rt6i_siblings,
4158 				      struct rt6_info,
4159 				      rt6i_siblings);
4160 	}
4161 
4162 	if (rt)
4163 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4164 }
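
/* Worked example: appending nexthops C and D to an existing route
 * with legs A and B leaves the circular siblings list ordered
 * A, B, C, D.  rt_last is then D, and list_first_entry() on D's
 * rt6i_siblings wraps around to A, so the RTM_NEWROUTE dump always
 * starts at the first nexthop no matter where the append began.
 */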
4165 
4166 static int ip6_route_multipath_add(struct fib6_config *cfg,
4167 				   struct netlink_ext_ack *extack)
4168 {
4169 	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4170 	struct nl_info *info = &cfg->fc_nlinfo;
4171 	struct fib6_config r_cfg;
4172 	struct rtnexthop *rtnh;
4173 	struct rt6_info *rt;
4174 	struct rt6_nh *err_nh;
4175 	struct rt6_nh *nh, *nh_safe;
4176 	__u16 nlflags;
4177 	int remaining;
4178 	int attrlen;
4179 	int err = 1;
4180 	int nhn = 0;
4181 	int replace = (cfg->fc_nlinfo.nlh &&
4182 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4183 	LIST_HEAD(rt6_nh_list);
4184 
4185 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4186 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4187 		nlflags |= NLM_F_APPEND;
4188 
4189 	remaining = cfg->fc_mp_len;
4190 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4191 
4192 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
4193 	 * rt6_info structs per nexthop
4194 	 */
4195 	while (rtnh_ok(rtnh, remaining)) {
4196 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4197 		if (rtnh->rtnh_ifindex)
4198 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4199 
4200 		attrlen = rtnh_attrlen(rtnh);
4201 		if (attrlen > 0) {
4202 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4203 
4204 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4205 			if (nla) {
4206 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4207 				r_cfg.fc_flags |= RTF_GATEWAY;
4208 			}
4209 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4210 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4211 			if (nla)
4212 				r_cfg.fc_encap_type = nla_get_u16(nla);
4213 		}
4214 
4215 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4216 		rt = ip6_route_info_create(&r_cfg, extack);
4217 		if (IS_ERR(rt)) {
4218 			err = PTR_ERR(rt);
4219 			rt = NULL;
4220 			goto cleanup;
4221 		}
4222 
4223 		rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4224 
4225 		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
4226 		if (err) {
4227 			dst_release_immediate(&rt->dst);
4228 			goto cleanup;
4229 		}
4230 
4231 		rtnh = rtnh_next(rtnh, &remaining);
4232 	}
4233 
4234 	/* for add and replace send one notification with all nexthops.
4235 	 * Skip the notification in fib6_add_rt2node and send one with
4236 	 * the full route when done
4237 	 */
4238 	info->skip_notify = 1;
4239 
4240 	err_nh = NULL;
4241 	list_for_each_entry(nh, &rt6_nh_list, next) {
4242 		rt_last = nh->rt6_info;
4243 		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
4244 		/* save reference to first route for notification */
4245 		if (!rt_notif && !err)
4246 			rt_notif = nh->rt6_info;
4247 
4248 		/* nh->rt6_info is used or freed at this point, reset to NULL */
4249 		nh->rt6_info = NULL;
4250 		if (err) {
4251 			if (replace && nhn)
4252 				ip6_print_replace_route_err(&rt6_nh_list);
4253 			err_nh = nh;
4254 			goto add_errout;
4255 		}
4256 
4257 		/* Because each route is added as if it were a single route,
4258 		 * drop these flags after the first nexthop: on a collision
4259 		 * the first nexthop has already failed to be added, since
4260 		 * fib6_add_rt2node() rejected it; on replace, the old
4261 		 * nexthops have been displaced by the first new one, and the
4262 		 * remaining nexthops should simply be appended to it.
4263 		 */
4264 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4265 						     NLM_F_REPLACE);
4266 		nhn++;
4267 	}
4268 
4269 	/* success ... tell user about new route */
4270 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4271 	goto cleanup;
4272 
4273 add_errout:
4274 	/* send notification for routes that were added so that
4275 	 * the delete notifications sent by ip6_route_del are
4276 	 * coherent
4277 	 */
4278 	if (rt_notif)
4279 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4280 
4281 	/* Delete routes that were already added */
4282 	list_for_each_entry(nh, &rt6_nh_list, next) {
4283 		if (err_nh == nh)
4284 			break;
4285 		ip6_route_del(&nh->r_cfg, extack);
4286 	}
4287 
4288 cleanup:
4289 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4290 		if (nh->rt6_info)
4291 			dst_release_immediate(&nh->rt6_info->dst);
4292 		kfree(nh->mxc.mx);
4293 		list_del(&nh->next);
4294 		kfree(nh);
4295 	}
4296 
4297 	return err;
4298 }
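
/* Failure-handling sketch for the function above: if leg k of an
 * N-nexthop request fails in __ip6_ins_rt(), err_nh marks leg k, one
 * RTM_NEWROUTE notification is sent for the legs that did make it in,
 * and the add_errout loop deletes legs 0..k-1 again, so the FIB ends
 * up unchanged and the later delete notifications stay coherent with
 * the earlier add.
 */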
4299 
4300 static int ip6_route_multipath_del(struct fib6_config *cfg,
4301 				   struct netlink_ext_ack *extack)
4302 {
4303 	struct fib6_config r_cfg;
4304 	struct rtnexthop *rtnh;
4305 	int remaining;
4306 	int attrlen;
4307 	int err = 1, last_err = 0;
4308 
4309 	remaining = cfg->fc_mp_len;
4310 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4311 
4312 	/* Parse a Multipath Entry */
4313 	while (rtnh_ok(rtnh, remaining)) {
4314 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4315 		if (rtnh->rtnh_ifindex)
4316 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4317 
4318 		attrlen = rtnh_attrlen(rtnh);
4319 		if (attrlen > 0) {
4320 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4321 
4322 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4323 			if (nla) {
4324 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4325 				r_cfg.fc_flags |= RTF_GATEWAY;
4326 			}
4327 		}
4328 		err = ip6_route_del(&r_cfg, extack);
4329 		if (err)
4330 			last_err = err;
4331 
4332 		rtnh = rtnh_next(rtnh, &remaining);
4333 	}
4334 
4335 	return last_err;
4336 }
4337 
4338 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4339 			      struct netlink_ext_ack *extack)
4340 {
4341 	struct fib6_config cfg;
4342 	int err;
4343 
4344 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4345 	if (err < 0)
4346 		return err;
4347 
4348 	if (cfg.fc_mp)
4349 		return ip6_route_multipath_del(&cfg, extack);
4350 
4351 	cfg.fc_delete_all_nh = 1;
4352 	return ip6_route_del(&cfg, extack);
4354 }
4355 
4356 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4357 			      struct netlink_ext_ack *extack)
4358 {
4359 	struct fib6_config cfg;
4360 	int err;
4361 
4362 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4363 	if (err < 0)
4364 		return err;
4365 
4366 	if (cfg.fc_mp)
4367 		return ip6_route_multipath_add(&cfg, extack);
4368 	else
4369 		return ip6_route_add(&cfg, extack);
4370 }
4371 
4372 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4373 {
4374 	int nexthop_len = 0;
4375 
4376 	if (rt->rt6i_nsiblings) {
4377 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4378 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4379 			    + nla_total_size(16) /* RTA_GATEWAY */
4380 			    + lwtunnel_get_encap_size(rt->dst.lwtstate);
4381 
4382 		nexthop_len *= rt->rt6i_nsiblings;
4383 	}
4384 
4385 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4386 	       + nla_total_size(16) /* RTA_SRC */
4387 	       + nla_total_size(16) /* RTA_DST */
4388 	       + nla_total_size(16) /* RTA_GATEWAY */
4389 	       + nla_total_size(16) /* RTA_PREFSRC */
4390 	       + nla_total_size(4) /* RTA_TABLE */
4391 	       + nla_total_size(4) /* RTA_IIF */
4392 	       + nla_total_size(4) /* RTA_OIF */
4393 	       + nla_total_size(4) /* RTA_PRIORITY */
4394 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4395 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4396 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4397 	       + nla_total_size(1) /* RTA_PREF */
4398 	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
4399 	       + nexthop_len;
4400 }
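
/* Sizing sketch (approximate, assuming no lwtunnel encap): a route
 * with rt6i_nsiblings == 2 reserves two extra per-nexthop chunks of
 *
 *	nla_total_size(0) + NLA_ALIGN(sizeof(struct rtnexthop))
 *	+ nla_total_size(16)
 *
 * on top of the fixed header, addresses, metrics and cacheinfo.  The
 * result is meant as an upper bound; rt6_fill_node() may emit less,
 * while an underestimate would trip the -EMSGSIZE warning in
 * inet6_rt_notify().
 */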
4401 
4402 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4403 			    unsigned int *flags, bool skip_oif)
4404 {
4405 	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4406 		*flags |= RTNH_F_DEAD;
4407 
4408 	if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4409 		*flags |= RTNH_F_LINKDOWN;
4410 		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4411 			*flags |= RTNH_F_DEAD;
4412 	}
4413 
4414 	if (rt->rt6i_flags & RTF_GATEWAY) {
4415 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4416 			goto nla_put_failure;
4417 	}
4418 
4419 	*flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
4420 	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4421 		*flags |= RTNH_F_OFFLOAD;
4422 
4423 	/* not needed for multipath encoding because rtnexthop carries the ifindex */
4424 	if (!skip_oif && rt->dst.dev &&
4425 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4426 		goto nla_put_failure;
4427 
4428 	if (rt->dst.lwtstate &&
4429 	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4430 		goto nla_put_failure;
4431 
4432 	return 0;
4433 
4434 nla_put_failure:
4435 	return -EMSGSIZE;
4436 }
4437 
4438 /* add multipath next hop */
4439 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4440 {
4441 	struct rtnexthop *rtnh;
4442 	unsigned int flags = 0;
4443 
4444 	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4445 	if (!rtnh)
4446 		goto nla_put_failure;
4447 
4448 	rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
4449 	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4450 
4451 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4452 		goto nla_put_failure;
4453 
4454 	rtnh->rtnh_flags = flags;
4455 
4456 	/* length of rtnetlink header + attributes */
4457 	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4458 
4459 	return 0;
4460 
4461 nla_put_failure:
4462 	return -EMSGSIZE;
4463 }
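
/* Wire-layout sketch of one RTA_MULTIPATH element built above:
 *
 *	struct rtnexthop	rtnh_len, rtnh_flags, rtnh_hops, ifindex
 *	RTA_GATEWAY		16-byte in6_addr, RTF_GATEWAY routes only
 *	RTA_ENCAP/RTA_ENCAP_TYPE	optional lwtunnel state
 *
 * rtnh_len is patched in last because the element's total size is
 * only known once rt6_nexthop_info() has emitted its attributes.
 */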
4464 
4465 static int rt6_fill_node(struct net *net,
4466 			 struct sk_buff *skb, struct rt6_info *rt,
4467 			 struct in6_addr *dst, struct in6_addr *src,
4468 			 int iif, int type, u32 portid, u32 seq,
4469 			 unsigned int flags)
4470 {
4471 	u32 metrics[RTAX_MAX];
4472 	struct rtmsg *rtm;
4473 	struct nlmsghdr *nlh;
4474 	long expires;
4475 	u32 table;
4476 
4477 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4478 	if (!nlh)
4479 		return -EMSGSIZE;
4480 
4481 	rtm = nlmsg_data(nlh);
4482 	rtm->rtm_family = AF_INET6;
4483 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
4484 	rtm->rtm_src_len = rt->rt6i_src.plen;
4485 	rtm->rtm_tos = 0;
4486 	if (rt->rt6i_table)
4487 		table = rt->rt6i_table->tb6_id;
4488 	else
4489 		table = RT6_TABLE_UNSPEC;
4490 	rtm->rtm_table = table;
4491 	if (nla_put_u32(skb, RTA_TABLE, table))
4492 		goto nla_put_failure;
4493 	if (rt->rt6i_flags & RTF_REJECT) {
4494 		switch (rt->dst.error) {
4495 		case -EINVAL:
4496 			rtm->rtm_type = RTN_BLACKHOLE;
4497 			break;
4498 		case -EACCES:
4499 			rtm->rtm_type = RTN_PROHIBIT;
4500 			break;
4501 		case -EAGAIN:
4502 			rtm->rtm_type = RTN_THROW;
4503 			break;
4504 		default:
4505 			rtm->rtm_type = RTN_UNREACHABLE;
4506 			break;
4507 		}
4508 	} else if (rt->rt6i_flags & RTF_LOCAL)
4510 		rtm->rtm_type = RTN_LOCAL;
4511 	else if (rt->rt6i_flags & RTF_ANYCAST)
4512 		rtm->rtm_type = RTN_ANYCAST;
4513 	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4514 		rtm->rtm_type = RTN_LOCAL;
4515 	else
4516 		rtm->rtm_type = RTN_UNICAST;
4517 	rtm->rtm_flags = 0;
4518 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4519 	rtm->rtm_protocol = rt->rt6i_protocol;
4520 
4521 	if (rt->rt6i_flags & RTF_CACHE)
4522 		rtm->rtm_flags |= RTM_F_CLONED;
4523 
4524 	if (dst) {
4525 		if (nla_put_in6_addr(skb, RTA_DST, dst))
4526 			goto nla_put_failure;
4527 		rtm->rtm_dst_len = 128;
4528 	} else if (rtm->rtm_dst_len)
4529 		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4530 			goto nla_put_failure;
4531 #ifdef CONFIG_IPV6_SUBTREES
4532 	if (src) {
4533 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4534 			goto nla_put_failure;
4535 		rtm->rtm_src_len = 128;
4536 	} else if (rtm->rtm_src_len &&
4537 		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4538 		goto nla_put_failure;
4539 #endif
4540 	if (iif) {
4541 #ifdef CONFIG_IPV6_MROUTE
4542 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4543 			int err = ip6mr_get_route(net, skb, rtm, portid);
4544 
4545 			if (err == 0)
4546 				return 0;
4547 			if (err < 0)
4548 				goto nla_put_failure;
4549 		} else
4550 #endif
4551 			if (nla_put_u32(skb, RTA_IIF, iif))
4552 				goto nla_put_failure;
4553 	} else if (dst) {
4554 		struct in6_addr saddr_buf;
4555 		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4556 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4557 			goto nla_put_failure;
4558 	}
4559 
4560 	if (rt->rt6i_prefsrc.plen) {
4561 		struct in6_addr saddr_buf;
4562 		saddr_buf = rt->rt6i_prefsrc.addr;
4563 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4564 			goto nla_put_failure;
4565 	}
4566 
4567 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4568 	if (rt->rt6i_pmtu)
4569 		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4570 	if (rtnetlink_put_metrics(skb, metrics) < 0)
4571 		goto nla_put_failure;
4572 
4573 	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4574 		goto nla_put_failure;
4575 
4576 	/* For multipath routes, walk the siblings list and add
4577 	 * each as a nexthop within RTA_MULTIPATH.
4578 	 */
4579 	if (rt->rt6i_nsiblings) {
4580 		struct rt6_info *sibling, *next_sibling;
4581 		struct nlattr *mp;
4582 
4583 		mp = nla_nest_start(skb, RTA_MULTIPATH);
4584 		if (!mp)
4585 			goto nla_put_failure;
4586 
4587 		if (rt6_add_nexthop(skb, rt) < 0)
4588 			goto nla_put_failure;
4589 
4590 		list_for_each_entry_safe(sibling, next_sibling,
4591 					 &rt->rt6i_siblings, rt6i_siblings) {
4592 			if (rt6_add_nexthop(skb, sibling) < 0)
4593 				goto nla_put_failure;
4594 		}
4595 
4596 		nla_nest_end(skb, mp);
4597 	} else {
4598 		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4599 			goto nla_put_failure;
4600 	}
4601 
4602 	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4603 
4604 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4605 		goto nla_put_failure;
4606 
4607 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4608 		goto nla_put_failure;
4609 
4611 	nlmsg_end(skb, nlh);
4612 	return 0;
4613 
4614 nla_put_failure:
4615 	nlmsg_cancel(skb, nlh);
4616 	return -EMSGSIZE;
4617 }
4618 
4619 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4620 {
4621 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4622 	struct net *net = arg->net;
4623 
4624 	if (rt == net->ipv6.ip6_null_entry)
4625 		return 0;
4626 
4627 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4628 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4629 
4630 		/* user wants prefix routes only */
4631 		if (rtm->rtm_flags & RTM_F_PREFIX &&
4632 		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4633 			/* success since this is not a prefix route */
4634 			return 1;
4635 		}
4636 	}
4637 
4638 	return rt6_fill_node(net,
4639 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4640 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4641 		     NLM_F_MULTI);
4642 }
4643 
4644 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4645 			      struct netlink_ext_ack *extack)
4646 {
4647 	struct net *net = sock_net(in_skb->sk);
4648 	struct nlattr *tb[RTA_MAX+1];
4649 	int err, iif = 0, oif = 0;
4650 	struct dst_entry *dst;
4651 	struct rt6_info *rt;
4652 	struct sk_buff *skb;
4653 	struct rtmsg *rtm;
4654 	struct flowi6 fl6;
4655 	bool fibmatch;
4656 
4657 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4658 			  extack);
4659 	if (err < 0)
4660 		goto errout;
4661 
4662 	err = -EINVAL;
4663 	memset(&fl6, 0, sizeof(fl6));
4664 	rtm = nlmsg_data(nlh);
4665 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4666 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4667 
4668 	if (tb[RTA_SRC]) {
4669 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4670 			goto errout;
4671 
4672 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4673 	}
4674 
4675 	if (tb[RTA_DST]) {
4676 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4677 			goto errout;
4678 
4679 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4680 	}
4681 
4682 	if (tb[RTA_IIF])
4683 		iif = nla_get_u32(tb[RTA_IIF]);
4684 
4685 	if (tb[RTA_OIF])
4686 		oif = nla_get_u32(tb[RTA_OIF]);
4687 
4688 	if (tb[RTA_MARK])
4689 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4690 
4691 	if (tb[RTA_UID])
4692 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4693 					   nla_get_u32(tb[RTA_UID]));
4694 	else
4695 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4696 
4697 	if (iif) {
4698 		struct net_device *dev;
4699 		int flags = 0;
4700 
4701 		rcu_read_lock();
4702 
4703 		dev = dev_get_by_index_rcu(net, iif);
4704 		if (!dev) {
4705 			rcu_read_unlock();
4706 			err = -ENODEV;
4707 			goto errout;
4708 		}
4709 
4710 		fl6.flowi6_iif = iif;
4711 
4712 		if (!ipv6_addr_any(&fl6.saddr))
4713 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4714 
4715 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4716 
4717 		rcu_read_unlock();
4718 	} else {
4719 		fl6.flowi6_oif = oif;
4720 
4721 		dst = ip6_route_output(net, NULL, &fl6);
4722 	}
4723 
4725 	rt = container_of(dst, struct rt6_info, dst);
4726 	if (rt->dst.error) {
4727 		err = rt->dst.error;
4728 		ip6_rt_put(rt);
4729 		goto errout;
4730 	}
4731 
4732 	if (rt == net->ipv6.ip6_null_entry) {
4733 		err = rt->dst.error;
4734 		ip6_rt_put(rt);
4735 		goto errout;
4736 	}
4737 
4738 	if (fibmatch && rt->from) {
4739 		struct rt6_info *ort = rt->from;
4740 
4741 		dst_hold(&ort->dst);
4742 		ip6_rt_put(rt);
4743 		rt = ort;
4744 	}
4745 
4746 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4747 	if (!skb) {
4748 		ip6_rt_put(rt);
4749 		err = -ENOBUFS;
4750 		goto errout;
4751 	}
4752 
4753 	skb_dst_set(skb, &rt->dst);
4754 	if (fibmatch)
4755 		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4756 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4757 				    nlh->nlmsg_seq, 0);
4758 	else
4759 		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4760 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4761 				    nlh->nlmsg_seq, 0);
4762 	if (err < 0) {
4763 		kfree_skb(skb);
4764 		goto errout;
4765 	}
4766 
4767 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4768 errout:
4769 	return err;
4770 }
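
/* Usage sketch: "ip -6 route get 2001:db8::1" exercises the plain
 * lookup above and reports the (possibly cloned) dst entry, while
 * "ip -6 route get fibmatch 2001:db8::1" sets RTM_F_FIB_MATCH so the
 * rt->from swap returns the FIB entry the lookup matched instead.
 */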
4771 
4772 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4773 		     unsigned int nlm_flags)
4774 {
4775 	struct sk_buff *skb;
4776 	struct net *net = info->nl_net;
4777 	u32 seq;
4778 	int err;
4779 
4780 	err = -ENOBUFS;
4781 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4782 
4783 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4784 	if (!skb)
4785 		goto errout;
4786 
4787 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4788 				event, info->portid, seq, nlm_flags);
4789 	if (err < 0) {
4790 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4791 		WARN_ON(err == -EMSGSIZE);
4792 		kfree_skb(skb);
4793 		goto errout;
4794 	}
4795 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4796 		    info->nlh, gfp_any());
4797 	return;
4798 errout:
4799 	if (err < 0)
4800 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4801 }
4802 
4803 static int ip6_route_dev_notify(struct notifier_block *this,
4804 				unsigned long event, void *ptr)
4805 {
4806 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4807 	struct net *net = dev_net(dev);
4808 
4809 	if (!(dev->flags & IFF_LOOPBACK))
4810 		return NOTIFY_OK;
4811 
4812 	if (event == NETDEV_REGISTER) {
4813 		net->ipv6.ip6_null_entry->dst.dev = dev;
4814 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4815 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4816 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4817 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4818 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4819 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4820 #endif
4821 	} else if (event == NETDEV_UNREGISTER &&
4822 		   dev->reg_state != NETREG_UNREGISTERED) {
4823 		/* NETDEV_UNREGISTER can be fired multiple times by
4824 		 * netdev_wait_allrefs(). Make sure we only call this once.
4825 		 */
4826 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4827 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4828 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4829 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4830 #endif
4831 	}
4832 
4833 	return NOTIFY_OK;
4834 }
4835 
4836 /*
4837  *	/proc
4838  */
4839 
4840 #ifdef CONFIG_PROC_FS
4841 
4842 static const struct file_operations ipv6_route_proc_fops = {
4843 	.open		= ipv6_route_open,
4844 	.read		= seq_read,
4845 	.llseek		= seq_lseek,
4846 	.release	= seq_release_net,
4847 };
4848 
4849 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4850 {
4851 	struct net *net = (struct net *)seq->private;
4852 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4853 		   net->ipv6.rt6_stats->fib_nodes,
4854 		   net->ipv6.rt6_stats->fib_route_nodes,
4855 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4856 		   net->ipv6.rt6_stats->fib_rt_entries,
4857 		   net->ipv6.rt6_stats->fib_rt_cache,
4858 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4859 		   net->ipv6.rt6_stats->fib_discarded_routes);
4860 
4861 	return 0;
4862 }
4863 
4864 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4865 {
4866 	return single_open_net(inode, file, rt6_stats_seq_show);
4867 }
4868 
4869 static const struct file_operations rt6_stats_seq_fops = {
4870 	.open	 = rt6_stats_seq_open,
4871 	.read	 = seq_read,
4872 	.llseek	 = seq_lseek,
4873 	.release = single_release_net,
4874 };
4875 #endif	/* CONFIG_PROC_FS */
4876 
4877 #ifdef CONFIG_SYSCTL
4878 
4879 static
4880 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4881 			      void __user *buffer, size_t *lenp, loff_t *ppos)
4882 {
4883 	struct net *net;
4884 	int delay;
4885 	if (!write)
4886 		return -EINVAL;
4887 
4888 	net = (struct net *)ctl->extra1;
4889 	delay = net->ipv6.sysctl.flush_delay;
4890 	proc_dointvec(ctl, write, buffer, lenp, ppos);
4891 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4892 	return 0;
4893 }
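
/* Usage sketch: any write such as
 *
 *	echo 0 > /proc/sys/net/ipv6/route/flush
 *
 * triggers a gc pass.  Note that delay is sampled before
 * proc_dointvec() stores the newly written value, so the pass runs
 * with the previously stored flush_delay; a sampled value <= 0 forces
 * immediate expiry of flushable entries.
 */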
4894 
4895 struct ctl_table ipv6_route_table_template[] = {
4896 	{
4897 		.procname	=	"flush",
4898 		.data		=	&init_net.ipv6.sysctl.flush_delay,
4899 		.maxlen		=	sizeof(int),
4900 		.mode		=	0200,
4901 		.proc_handler	=	ipv6_sysctl_rtcache_flush
4902 	},
4903 	{
4904 		.procname	=	"gc_thresh",
4905 		.data		=	&ip6_dst_ops_template.gc_thresh,
4906 		.maxlen		=	sizeof(int),
4907 		.mode		=	0644,
4908 		.proc_handler	=	proc_dointvec,
4909 	},
4910 	{
4911 		.procname	=	"max_size",
4912 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
4913 		.maxlen		=	sizeof(int),
4914 		.mode		=	0644,
4915 		.proc_handler	=	proc_dointvec,
4916 	},
4917 	{
4918 		.procname	=	"gc_min_interval",
4919 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4920 		.maxlen		=	sizeof(int),
4921 		.mode		=	0644,
4922 		.proc_handler	=	proc_dointvec_jiffies,
4923 	},
4924 	{
4925 		.procname	=	"gc_timeout",
4926 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4927 		.maxlen		=	sizeof(int),
4928 		.mode		=	0644,
4929 		.proc_handler	=	proc_dointvec_jiffies,
4930 	},
4931 	{
4932 		.procname	=	"gc_interval",
4933 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
4934 		.maxlen		=	sizeof(int),
4935 		.mode		=	0644,
4936 		.proc_handler	=	proc_dointvec_jiffies,
4937 	},
4938 	{
4939 		.procname	=	"gc_elasticity",
4940 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4941 		.maxlen		=	sizeof(int),
4942 		.mode		=	0644,
4943 		.proc_handler	=	proc_dointvec,
4944 	},
4945 	{
4946 		.procname	=	"mtu_expires",
4947 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4948 		.maxlen		=	sizeof(int),
4949 		.mode		=	0644,
4950 		.proc_handler	=	proc_dointvec_jiffies,
4951 	},
4952 	{
4953 		.procname	=	"min_adv_mss",
4954 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
4955 		.maxlen		=	sizeof(int),
4956 		.mode		=	0644,
4957 		.proc_handler	=	proc_dointvec,
4958 	},
4959 	{
4960 		.procname	=	"gc_min_interval_ms",
4961 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4962 		.maxlen		=	sizeof(int),
4963 		.mode		=	0644,
4964 		.proc_handler	=	proc_dointvec_ms_jiffies,
4965 	},
4966 	{ }
4967 };
4968 
4969 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4970 {
4971 	struct ctl_table *table;
4972 
4973 	table = kmemdup(ipv6_route_table_template,
4974 			sizeof(ipv6_route_table_template),
4975 			GFP_KERNEL);
4976 
4977 	if (table) {
4978 		table[0].data = &net->ipv6.sysctl.flush_delay;
4979 		table[0].extra1 = net;
4980 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4981 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4982 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4983 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4984 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4985 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4986 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4987 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4988 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4989 
4990 		/* Don't export sysctls to unprivileged users */
4991 		if (net->user_ns != &init_user_ns)
4992 			table[0].procname = NULL;
4993 	}
4994 
4995 	return table;
4996 }
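
/* Index map for the table[i].data fixups above; it must stay in step
 * with the order of ipv6_route_table_template:
 *
 *	[0] flush		[5] gc_interval
 *	[1] gc_thresh		[6] gc_elasticity
 *	[2] max_size		[7] mtu_expires
 *	[3] gc_min_interval	[8] min_adv_mss
 *	[4] gc_timeout		[9] gc_min_interval_ms
 */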
4997 #endif
4998 
4999 static int __net_init ip6_route_net_init(struct net *net)
5000 {
5001 	int ret = -ENOMEM;
5002 
5003 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5004 	       sizeof(net->ipv6.ip6_dst_ops));
5005 
5006 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5007 		goto out_ip6_dst_ops;
5008 
5009 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5010 					   sizeof(*net->ipv6.ip6_null_entry),
5011 					   GFP_KERNEL);
5012 	if (!net->ipv6.ip6_null_entry)
5013 		goto out_ip6_dst_entries;
5014 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5015 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5016 			 ip6_template_metrics, true);
5017 
5018 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5019 	net->ipv6.fib6_has_custom_rules = false;
5020 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5021 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5022 					       GFP_KERNEL);
5023 	if (!net->ipv6.ip6_prohibit_entry)
5024 		goto out_ip6_null_entry;
5025 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5026 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5027 			 ip6_template_metrics, true);
5028 
5029 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5030 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5031 					       GFP_KERNEL);
5032 	if (!net->ipv6.ip6_blk_hole_entry)
5033 		goto out_ip6_prohibit_entry;
5034 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5035 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5036 			 ip6_template_metrics, true);
5037 #endif
5038 
5039 	net->ipv6.sysctl.flush_delay = 0;
5040 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5041 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5042 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5043 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5044 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5045 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5046 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5047 
5048 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5049 
5050 	ret = 0;
5051 out:
5052 	return ret;
5053 
5054 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5055 out_ip6_prohibit_entry:
5056 	kfree(net->ipv6.ip6_prohibit_entry);
5057 out_ip6_null_entry:
5058 	kfree(net->ipv6.ip6_null_entry);
5059 #endif
5060 out_ip6_dst_entries:
5061 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5062 out_ip6_dst_ops:
5063 	goto out;
5064 }
5065 
5066 static void __net_exit ip6_route_net_exit(struct net *net)
5067 {
5068 	kfree(net->ipv6.ip6_null_entry);
5069 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5070 	kfree(net->ipv6.ip6_prohibit_entry);
5071 	kfree(net->ipv6.ip6_blk_hole_entry);
5072 #endif
5073 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5074 }
5075 
5076 static int __net_init ip6_route_net_init_late(struct net *net)
5077 {
5078 #ifdef CONFIG_PROC_FS
5079 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5080 	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5081 #endif
5082 	return 0;
5083 }
5084 
5085 static void __net_exit ip6_route_net_exit_late(struct net *net)
5086 {
5087 #ifdef CONFIG_PROC_FS
5088 	remove_proc_entry("ipv6_route", net->proc_net);
5089 	remove_proc_entry("rt6_stats", net->proc_net);
5090 #endif
5091 }
5092 
5093 static struct pernet_operations ip6_route_net_ops = {
5094 	.init = ip6_route_net_init,
5095 	.exit = ip6_route_net_exit,
5096 };
5097 
5098 static int __net_init ipv6_inetpeer_init(struct net *net)
5099 {
5100 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5101 
5102 	if (!bp)
5103 		return -ENOMEM;
5104 	inet_peer_base_init(bp);
5105 	net->ipv6.peers = bp;
5106 	return 0;
5107 }
5108 
5109 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5110 {
5111 	struct inet_peer_base *bp = net->ipv6.peers;
5112 
5113 	net->ipv6.peers = NULL;
5114 	inetpeer_invalidate_tree(bp);
5115 	kfree(bp);
5116 }
5117 
5118 static struct pernet_operations ipv6_inetpeer_ops = {
5119 	.init	=	ipv6_inetpeer_init,
5120 	.exit	=	ipv6_inetpeer_exit,
5121 };
5122 
5123 static struct pernet_operations ip6_route_net_late_ops = {
5124 	.init = ip6_route_net_init_late,
5125 	.exit = ip6_route_net_exit_late,
5126 };
5127 
5128 static struct notifier_block ip6_route_dev_notifier = {
5129 	.notifier_call = ip6_route_dev_notify,
5130 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5131 };
5132 
5133 void __init ip6_route_init_special_entries(void)
5134 {
5135 	/* The loopback device is registered before this code runs, so the
5136 	 * loopback reference in rt6_info is never taken automatically;
5137 	 * take it manually for init_net. */
5138 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5139 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5140 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5141 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5142 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5143 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5144 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5145 #endif
5146 }
5147 
5148 int __init ip6_route_init(void)
5149 {
5150 	int ret;
5151 	int cpu;
5152 
5153 	ret = -ENOMEM;
5154 	ip6_dst_ops_template.kmem_cachep =
5155 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5156 				  SLAB_HWCACHE_ALIGN, NULL);
5157 	if (!ip6_dst_ops_template.kmem_cachep)
5158 		goto out;
5159 
5160 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5161 	if (ret)
5162 		goto out_kmem_cache;
5163 
5164 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5165 	if (ret)
5166 		goto out_dst_entries;
5167 
5168 	ret = register_pernet_subsys(&ip6_route_net_ops);
5169 	if (ret)
5170 		goto out_register_inetpeer;
5171 
5172 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5173 
5174 	ret = fib6_init();
5175 	if (ret)
5176 		goto out_register_subsys;
5177 
5178 	ret = xfrm6_init();
5179 	if (ret)
5180 		goto out_fib6_init;
5181 
5182 	ret = fib6_rules_init();
5183 	if (ret)
5184 		goto xfrm6_init;
5185 
5186 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5187 	if (ret)
5188 		goto fib6_rules_init;
5189 
5190 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5191 				   inet6_rtm_newroute, NULL, 0);
5192 	if (ret < 0)
5193 		goto out_register_late_subsys;
5194 
5195 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5196 				   inet6_rtm_delroute, NULL, 0);
5197 	if (ret < 0)
5198 		goto out_register_late_subsys;
5199 
5200 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5201 				   inet6_rtm_getroute, NULL,
5202 				   RTNL_FLAG_DOIT_UNLOCKED);
5203 	if (ret < 0)
5204 		goto out_register_late_subsys;
5205 
5206 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5207 	if (ret)
5208 		goto out_register_late_subsys;
5209 
5210 	for_each_possible_cpu(cpu) {
5211 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5212 
5213 		INIT_LIST_HEAD(&ul->head);
5214 		spin_lock_init(&ul->lock);
5215 	}
5216 
5217 out:
5218 	return ret;
5219 
5220 out_register_late_subsys:
5221 	rtnl_unregister_all(PF_INET6);
5222 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5223 fib6_rules_init:
5224 	fib6_rules_cleanup();
5225 xfrm6_init:
5226 	xfrm6_fini();
5227 out_fib6_init:
5228 	fib6_gc_cleanup();
5229 out_register_subsys:
5230 	unregister_pernet_subsys(&ip6_route_net_ops);
5231 out_register_inetpeer:
5232 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5233 out_dst_entries:
5234 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5235 out_kmem_cache:
5236 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5237 	goto out;
5238 }
5239 
5240 void ip6_route_cleanup(void)
5241 {
5242 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5243 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5244 	fib6_rules_cleanup();
5245 	xfrm6_fini();
5246 	fib6_gc_cleanup();
5247 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5248 	unregister_pernet_subsys(&ip6_route_net_ops);
5249 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5250 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5251 }
5252