xref: /linux/net/ipv6/route.c (revision b0d5c81e872ed21de1e56feb0fa6e4161da7be61)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67 
68 #include <linux/uaccess.h>
69 
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73 
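/* Outcomes of next-hop neighbour reachability checks used when scoring
 * routes: the negative values are failures (-1 asks for round-robin
 * fallback, -2 marks a gateway in NUD_FAILED, -3 rejects the route
 * outright).
 */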
74 enum rt6_nud_state {
75 	RT6_NUD_FAIL_HARD = -3,
76 	RT6_NUD_FAIL_PROBE = -2,
77 	RT6_NUD_FAIL_DO_RR = -1,
78 	RT6_NUD_SUCCEED = 1
79 };
80 
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void		ip6_dst_destroy(struct dst_entry *);
87 static void		ip6_dst_ifdown(struct dst_entry *,
88 				       struct net_device *dev, int how);
89 static int		 ip6_dst_gc(struct dst_ops *ops);
90 
91 static int		ip6_pkt_discard(struct sk_buff *skb);
92 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int		ip6_pkt_prohibit(struct sk_buff *skb);
94 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void		ip6_link_failure(struct sk_buff *skb);
96 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97 					   struct sk_buff *skb, u32 mtu);
98 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99 					struct sk_buff *skb);
100 static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104 			 struct sk_buff *skb, struct rt6_info *rt,
105 			 struct in6_addr *dst, struct in6_addr *src,
106 			 int iif, int type, u32 portid, u32 seq,
107 			 unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109 					   struct in6_addr *daddr,
110 					   struct in6_addr *saddr);
111 
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114 					   const struct in6_addr *prefix, int prefixlen,
115 					   const struct in6_addr *gwaddr,
116 					   struct net_device *dev,
117 					   unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119 					   const struct in6_addr *prefix, int prefixlen,
120 					   const struct in6_addr *gwaddr,
121 					   struct net_device *dev);
122 #endif
123 
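/* Per-cpu list of dst entries that are not owned by the fib6 tree (e.g.
 * the RTF_CACHE clones created for FLOWI_FLAG_KNOWN_NH lookups), kept so
 * they can be re-homed to the loopback device when their device goes away.
 */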
124 struct uncached_list {
125 	spinlock_t		lock;
126 	struct list_head	head;
127 };
128 
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130 
131 void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134 
135 	rt->rt6i_uncached_list = ul;
136 
137 	spin_lock_bh(&ul->lock);
138 	list_add_tail(&rt->rt6i_uncached, &ul->head);
139 	spin_unlock_bh(&ul->lock);
140 }
141 
142 void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144 	if (!list_empty(&rt->rt6i_uncached)) {
145 		struct uncached_list *ul = rt->rt6i_uncached_list;
146 		struct net *net = dev_net(rt->dst.dev);
147 
148 		spin_lock_bh(&ul->lock);
149 		list_del(&rt->rt6i_uncached);
150 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151 		spin_unlock_bh(&ul->lock);
152 	}
153 }
154 
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157 	struct net_device *loopback_dev = net->loopback_dev;
158 	int cpu;
159 
160 	if (dev == loopback_dev)
161 		return;
162 
163 	for_each_possible_cpu(cpu) {
164 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165 		struct rt6_info *rt;
166 
167 		spin_lock_bh(&ul->lock);
168 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169 			struct inet6_dev *rt_idev = rt->rt6i_idev;
170 			struct net_device *rt_dev = rt->dst.dev;
171 
172 			if (rt_idev->dev == dev) {
173 				rt->rt6i_idev = in6_dev_get(loopback_dev);
174 				in6_dev_put(rt_idev);
175 			}
176 
177 			if (rt_dev == dev) {
178 				rt->dst.dev = loopback_dev;
179 				dev_hold(rt->dst.dev);
180 				dev_put(rt_dev);
181 			}
182 		}
183 		spin_unlock_bh(&ul->lock);
184 	}
185 }
186 
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189 	return dst_metrics_write_ptr(&rt->from->dst);
190 }
191 
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194 	struct rt6_info *rt = (struct rt6_info *)dst;
195 
196 	if (rt->rt6i_flags & RTF_PCPU)
197 		return rt6_pcpu_cow_metrics(rt);
198 	else if (rt->rt6i_flags & RTF_CACHE)
199 		return NULL;
200 	else
201 		return dst_cow_metrics_generic(dst, old);
202 }
203 
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205 					     struct sk_buff *skb,
206 					     const void *daddr)
207 {
208 	struct in6_addr *p = &rt->rt6i_gateway;
209 
210 	if (!ipv6_addr_any(p))
211 		return (const void *) p;
212 	else if (skb)
213 		return &ipv6_hdr(skb)->daddr;
214 	return daddr;
215 }
216 
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218 					  struct sk_buff *skb,
219 					  const void *daddr)
220 {
221 	struct rt6_info *rt = (struct rt6_info *) dst;
222 	struct neighbour *n;
223 
224 	daddr = choose_neigh_daddr(rt, skb, daddr);
225 	n = __ipv6_neigh_lookup(dst->dev, daddr);
226 	if (n)
227 		return n;
228 	return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230 
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233 	struct net_device *dev = dst->dev;
234 	struct rt6_info *rt = (struct rt6_info *)dst;
235 
236 	daddr = choose_neigh_daddr(rt, NULL, daddr);
237 	if (!daddr)
238 		return;
239 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240 		return;
241 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242 		return;
243 	__ipv6_confirm_neigh(dev, daddr);
244 }
245 
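/* Template dst_ops; each network namespace gets its own copy (installed
 * by ip6_route_net_init()) which dst_alloc() then operates on.
 */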
246 static struct dst_ops ip6_dst_ops_template = {
247 	.family			=	AF_INET6,
248 	.gc			=	ip6_dst_gc,
249 	.gc_thresh		=	1024,
250 	.check			=	ip6_dst_check,
251 	.default_advmss		=	ip6_default_advmss,
252 	.mtu			=	ip6_mtu,
253 	.cow_metrics		=	ipv6_cow_metrics,
254 	.destroy		=	ip6_dst_destroy,
255 	.ifdown			=	ip6_dst_ifdown,
256 	.negative_advice	=	ip6_negative_advice,
257 	.link_failure		=	ip6_link_failure,
258 	.update_pmtu		=	ip6_rt_update_pmtu,
259 	.redirect		=	rt6_do_redirect,
260 	.local_out		=	__ip6_local_out,
261 	.neigh_lookup		=	ip6_neigh_lookup,
262 	.confirm_neigh		=	ip6_confirm_neigh,
263 };
264 
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268 
269 	return mtu ? : dst->dev->mtu;
270 }
271 
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273 					 struct sk_buff *skb, u32 mtu)
274 {
275 }
276 
277 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
278 				      struct sk_buff *skb)
279 {
280 }
281 
282 static struct dst_ops ip6_dst_blackhole_ops = {
283 	.family			=	AF_INET6,
284 	.destroy		=	ip6_dst_destroy,
285 	.check			=	ip6_dst_check,
286 	.mtu			=	ip6_blackhole_mtu,
287 	.default_advmss		=	ip6_default_advmss,
288 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
289 	.redirect		=	ip6_rt_blackhole_redirect,
290 	.cow_metrics		=	dst_cow_metrics_generic,
291 	.neigh_lookup		=	ip6_neigh_lookup,
292 };
293 
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295 	[RTAX_HOPLIMIT - 1] = 0,
296 };
297 
298 static const struct rt6_info ip6_null_entry_template = {
299 	.dst = {
300 		.__refcnt	= ATOMIC_INIT(1),
301 		.__use		= 1,
302 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
303 		.error		= -ENETUNREACH,
304 		.input		= ip6_pkt_discard,
305 		.output		= ip6_pkt_discard_out,
306 	},
307 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
308 	.rt6i_protocol  = RTPROT_KERNEL,
309 	.rt6i_metric	= ~(u32) 0,
310 	.rt6i_ref	= ATOMIC_INIT(1),
311 };
312 
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314 
315 static const struct rt6_info ip6_prohibit_entry_template = {
316 	.dst = {
317 		.__refcnt	= ATOMIC_INIT(1),
318 		.__use		= 1,
319 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
320 		.error		= -EACCES,
321 		.input		= ip6_pkt_prohibit,
322 		.output		= ip6_pkt_prohibit_out,
323 	},
324 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
325 	.rt6i_protocol  = RTPROT_KERNEL,
326 	.rt6i_metric	= ~(u32) 0,
327 	.rt6i_ref	= ATOMIC_INIT(1),
328 };
329 
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331 	.dst = {
332 		.__refcnt	= ATOMIC_INIT(1),
333 		.__use		= 1,
334 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
335 		.error		= -EINVAL,
336 		.input		= dst_discard,
337 		.output		= dst_discard_out,
338 	},
339 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
340 	.rt6i_protocol  = RTPROT_KERNEL,
341 	.rt6i_metric	= ~(u32) 0,
342 	.rt6i_ref	= ATOMIC_INIT(1),
343 };
344 
345 #endif
346 
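/* dst_alloc() has already initialised the embedded dst_entry, so zero
 * only the rt6_info fields that follow it, then reset the list heads.
 */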
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349 	struct dst_entry *dst = &rt->dst;
350 
351 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352 	INIT_LIST_HEAD(&rt->rt6i_siblings);
353 	INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355 
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358 					struct net_device *dev,
359 					int flags)
360 {
361 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362 					1, DST_OBSOLETE_FORCE_CHK, flags);
363 
364 	if (rt) {
365 		rt6_info_init(rt);
366 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367 	}
368 
369 	return rt;
370 }
371 
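/* Like __ip6_dst_alloc() but additionally allocates the per-cpu slots
 * used to hand out RTF_PCPU copies of this route on the fast path.
 */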
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373 			       struct net_device *dev,
374 			       int flags)
375 {
376 	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377 
378 	if (rt) {
379 		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380 		if (!rt->rt6i_pcpu) {
381 			dst_release_immediate(&rt->dst);
382 			return NULL;
383 		}
384 	}
385 
386 	return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389 
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392 	struct rt6_info *rt = (struct rt6_info *)dst;
393 	struct rt6_exception_bucket *bucket;
394 	struct rt6_info *from = rt->from;
395 	struct inet6_dev *idev;
396 
397 	dst_destroy_metrics_generic(dst);
398 	free_percpu(rt->rt6i_pcpu);
399 	rt6_uncached_list_del(rt);
400 
401 	idev = rt->rt6i_idev;
402 	if (idev) {
403 		rt->rt6i_idev = NULL;
404 		in6_dev_put(idev);
405 	}
406 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407 	if (bucket) {
408 		rt->rt6i_exception_bucket = NULL;
409 		kfree(bucket);
410 	}
411 
412 	rt->from = NULL;
413 	dst_release(&from->dst);
414 }
415 
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417 			   int how)
418 {
419 	struct rt6_info *rt = (struct rt6_info *)dst;
420 	struct inet6_dev *idev = rt->rt6i_idev;
421 	struct net_device *loopback_dev =
422 		dev_net(dev)->loopback_dev;
423 
424 	if (idev && idev->dev != loopback_dev) {
425 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426 		if (loopback_idev) {
427 			rt->rt6i_idev = loopback_idev;
428 			in6_dev_put(idev);
429 		}
430 	}
431 }
432 
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435 	if (rt->rt6i_flags & RTF_EXPIRES)
436 		return time_after(jiffies, rt->dst.expires);
437 	else
438 		return false;
439 }
440 
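/* A route has expired either directly (RTF_EXPIRES set and dst.expires in
 * the past) or, for clones, when the rt->from entry it was derived from
 * has been obsoleted or has itself expired.
 */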
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443 	if (rt->rt6i_flags & RTF_EXPIRES) {
444 		if (time_after(jiffies, rt->dst.expires))
445 			return true;
446 	} else if (rt->from) {
447 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448 			rt6_check_expired(rt->from);
449 	}
450 	return false;
451 }
452 
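/* Hash-threshold multipath selection (cf. RFC 2992): every sibling owns a
 * slice of the 31-bit hash space bounded by its rt6i_nh_upper_bound, and a
 * flow is routed via the first sibling whose bound covers its hash.
 */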
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454 					     struct flowi6 *fl6, int oif,
455 					     int strict)
456 {
457 	struct rt6_info *sibling, *next_sibling;
458 
459 	/* We might have already computed the hash for ICMPv6 errors. In such
460 	 * a case it will always be non-zero. Otherwise, now is the time to do it.
461 	 */
462 	if (!fl6->mp_hash)
463 		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
464 
465 	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
466 		return match;
467 
468 	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
469 				 rt6i_siblings) {
470 		if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
471 			continue;
472 		if (rt6_score_route(sibling, oif, strict) < 0)
473 			break;
474 		match = sibling;
475 		break;
476 	}
477 
478 	return match;
479 }
480 
481 /*
482  *	Route lookup. rcu_read_lock() should be held.
483  */
484 
485 static inline struct rt6_info *rt6_device_match(struct net *net,
486 						    struct rt6_info *rt,
487 						    const struct in6_addr *saddr,
488 						    int oif,
489 						    int flags)
490 {
491 	struct rt6_info *local = NULL;
492 	struct rt6_info *sprt;
493 
494 	if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
495 		return rt;
496 
497 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
498 		struct net_device *dev = sprt->dst.dev;
499 
500 		if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
501 			continue;
502 
503 		if (oif) {
504 			if (dev->ifindex == oif)
505 				return sprt;
506 			if (dev->flags & IFF_LOOPBACK) {
507 				if (!sprt->rt6i_idev ||
508 				    sprt->rt6i_idev->dev->ifindex != oif) {
509 					if (flags & RT6_LOOKUP_F_IFACE)
510 						continue;
511 					if (local &&
512 					    local->rt6i_idev->dev->ifindex == oif)
513 						continue;
514 				}
515 				local = sprt;
516 			}
517 		} else {
518 			if (ipv6_chk_addr(net, saddr, dev,
519 					  flags & RT6_LOOKUP_F_IFACE))
520 				return sprt;
521 		}
522 	}
523 
524 	if (oif) {
525 		if (local)
526 			return local;
527 
528 		if (flags & RT6_LOOKUP_F_IFACE)
529 			return net->ipv6.ip6_null_entry;
530 	}
531 
532 	return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
533 }
534 
535 #ifdef CONFIG_IPV6_ROUTER_PREF
536 struct __rt6_probe_work {
537 	struct work_struct work;
538 	struct in6_addr target;
539 	struct net_device *dev;
540 };
541 
542 static void rt6_probe_deferred(struct work_struct *w)
543 {
544 	struct in6_addr mcaddr;
545 	struct __rt6_probe_work *work =
546 		container_of(w, struct __rt6_probe_work, work);
547 
548 	addrconf_addr_solict_mult(&work->target, &mcaddr);
549 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
550 	dev_put(work->dev);
551 	kfree(work);
552 }
553 
554 static void rt6_probe(struct rt6_info *rt)
555 {
556 	struct __rt6_probe_work *work;
557 	struct neighbour *neigh;
558 	/*
559 	 * Okay, this does not seem to be appropriate
560 	 * for now; however, we need to check whether it
561 	 * really is so, aka Router Reachability Probing.
562 	 *
563 	 * Router Reachability Probe MUST be rate-limited
564 	 * to no more than one per minute.
565 	 */
566 	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
567 		return;
568 	rcu_read_lock_bh();
569 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
570 	if (neigh) {
571 		if (neigh->nud_state & NUD_VALID)
572 			goto out;
573 
574 		work = NULL;
575 		write_lock(&neigh->lock);
576 		if (!(neigh->nud_state & NUD_VALID) &&
577 		    time_after(jiffies,
578 			       neigh->updated +
579 			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
580 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
581 			if (work)
582 				__neigh_set_probe_once(neigh);
583 		}
584 		write_unlock(&neigh->lock);
585 	} else {
586 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
587 	}
588 
589 	if (work) {
590 		INIT_WORK(&work->work, rt6_probe_deferred);
591 		work->target = rt->rt6i_gateway;
592 		dev_hold(rt->dst.dev);
593 		work->dev = rt->dst.dev;
594 		schedule_work(&work->work);
595 	}
596 
597 out:
598 	rcu_read_unlock_bh();
599 }
600 #else
601 static inline void rt6_probe(struct rt6_info *rt)
602 {
603 }
604 #endif
605 
606 /*
607  * Default Router Selection (RFC 2461 6.3.6)
608  */
609 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
610 {
611 	struct net_device *dev = rt->dst.dev;
612 	if (!oif || dev->ifindex == oif)
613 		return 2;
614 	if ((dev->flags & IFF_LOOPBACK) &&
615 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
616 		return 1;
617 	return 0;
618 }
619 
620 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
621 {
622 	struct neighbour *neigh;
623 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
624 
625 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
626 	    !(rt->rt6i_flags & RTF_GATEWAY))
627 		return RT6_NUD_SUCCEED;
628 
629 	rcu_read_lock_bh();
630 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
631 	if (neigh) {
632 		read_lock(&neigh->lock);
633 		if (neigh->nud_state & NUD_VALID)
634 			ret = RT6_NUD_SUCCEED;
635 #ifdef CONFIG_IPV6_ROUTER_PREF
636 		else if (!(neigh->nud_state & NUD_FAILED))
637 			ret = RT6_NUD_SUCCEED;
638 		else
639 			ret = RT6_NUD_FAIL_PROBE;
640 #endif
641 		read_unlock(&neigh->lock);
642 	} else {
643 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
644 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
645 	}
646 	rcu_read_unlock_bh();
647 
648 	return ret;
649 }
650 
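/* Combine the device match (low two bits) with the RFC 4191 router
 * preference (shifted above them) into one score; negative values
 * propagate rt6_check_neigh() failures to the caller.
 */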
651 static int rt6_score_route(struct rt6_info *rt, int oif,
652 			   int strict)
653 {
654 	int m;
655 
656 	m = rt6_check_dev(rt, oif);
657 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
658 		return RT6_NUD_FAIL_HARD;
659 #ifdef CONFIG_IPV6_ROUTER_PREF
660 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
661 #endif
662 	if (strict & RT6_LOOKUP_F_REACHABLE) {
663 		int n = rt6_check_neigh(rt);
664 		if (n < 0)
665 			return n;
666 	}
667 	return m;
668 }
669 
670 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
671 				   int *mpri, struct rt6_info *match,
672 				   bool *do_rr)
673 {
674 	int m;
675 	bool match_do_rr = false;
676 	struct inet6_dev *idev = rt->rt6i_idev;
677 
678 	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
679 		goto out;
680 
681 	if (idev->cnf.ignore_routes_with_linkdown &&
682 	    rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
683 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
684 		goto out;
685 
686 	if (rt6_check_expired(rt))
687 		goto out;
688 
689 	m = rt6_score_route(rt, oif, strict);
690 	if (m == RT6_NUD_FAIL_DO_RR) {
691 		match_do_rr = true;
692 		m = 0; /* lowest valid score */
693 	} else if (m == RT6_NUD_FAIL_HARD) {
694 		goto out;
695 	}
696 
697 	if (strict & RT6_LOOKUP_F_REACHABLE)
698 		rt6_probe(rt);
699 
700 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
701 	if (m > *mpri) {
702 		*do_rr = match_do_rr;
703 		*mpri = m;
704 		match = rt;
705 	}
706 out:
707 	return match;
708 }
709 
710 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
711 				     struct rt6_info *leaf,
712 				     struct rt6_info *rr_head,
713 				     u32 metric, int oif, int strict,
714 				     bool *do_rr)
715 {
716 	struct rt6_info *rt, *match, *cont;
717 	int mpri = -1;
718 
719 	match = NULL;
720 	cont = NULL;
721 	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
722 		if (rt->rt6i_metric != metric) {
723 			cont = rt;
724 			break;
725 		}
726 
727 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
728 	}
729 
730 	for (rt = leaf; rt && rt != rr_head;
731 	     rt = rcu_dereference(rt->rt6_next)) {
732 		if (rt->rt6i_metric != metric) {
733 			cont = rt;
734 			break;
735 		}
736 
737 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
738 	}
739 
740 	if (match || !cont)
741 		return match;
742 
743 	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
744 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
745 
746 	return match;
747 }
748 
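/* Pick the best route under fn among the entries sharing rr_head's metric;
 * when the winner merely asked for round-robin (RT6_NUD_FAIL_DO_RR),
 * advance fn->rr_ptr so the next lookup starts from another sibling.
 */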
749 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
750 				   int oif, int strict)
751 {
752 	struct rt6_info *leaf = rcu_dereference(fn->leaf);
753 	struct rt6_info *match, *rt0;
754 	bool do_rr = false;
755 	int key_plen;
756 
757 	if (!leaf || leaf == net->ipv6.ip6_null_entry)
758 		return net->ipv6.ip6_null_entry;
759 
760 	rt0 = rcu_dereference(fn->rr_ptr);
761 	if (!rt0)
762 		rt0 = leaf;
763 
764 	/* Double check to make sure fn is not an intermediate node
765 	 * and fn->leaf does not point to its child's leaf
766 	 * (This might happen if all routes under fn are deleted from
767 	 * the tree and fib6_repair_tree() is called on the node.)
768 	 */
769 	key_plen = rt0->rt6i_dst.plen;
770 #ifdef CONFIG_IPV6_SUBTREES
771 	if (rt0->rt6i_src.plen)
772 		key_plen = rt0->rt6i_src.plen;
773 #endif
774 	if (fn->fn_bit != key_plen)
775 		return net->ipv6.ip6_null_entry;
776 
777 	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
778 			     &do_rr);
779 
780 	if (do_rr) {
781 		struct rt6_info *next = rcu_dereference(rt0->rt6_next);
782 
783 		/* no entries matched; do round-robin */
784 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
785 			next = leaf;
786 
787 		if (next != rt0) {
788 			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
789 			/* make sure next is not being deleted from the tree */
790 			if (next->rt6i_node)
791 				rcu_assign_pointer(fn->rr_ptr, next);
792 			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
793 		}
794 	}
795 
796 	return match ? match : net->ipv6.ip6_null_entry;
797 }
798 
799 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
800 {
801 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
802 }
803 
804 #ifdef CONFIG_IPV6_ROUTE_INFO
805 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
806 		  const struct in6_addr *gwaddr)
807 {
808 	struct net *net = dev_net(dev);
809 	struct route_info *rinfo = (struct route_info *) opt;
810 	struct in6_addr prefix_buf, *prefix;
811 	unsigned int pref;
812 	unsigned long lifetime;
813 	struct rt6_info *rt;
814 
815 	if (len < sizeof(struct route_info)) {
816 		return -EINVAL;
817 	}
818 
819 	/* Sanity check for prefix_len and length */
820 	if (rinfo->length > 3) {
821 		return -EINVAL;
822 	} else if (rinfo->prefix_len > 128) {
823 		return -EINVAL;
824 	} else if (rinfo->prefix_len > 64) {
825 		if (rinfo->length < 2) {
826 			return -EINVAL;
827 		}
828 	} else if (rinfo->prefix_len > 0) {
829 		if (rinfo->length < 1) {
830 			return -EINVAL;
831 		}
832 	}
833 
834 	pref = rinfo->route_pref;
835 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
836 		return -EINVAL;
837 
838 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
839 
840 	if (rinfo->length == 3)
841 		prefix = (struct in6_addr *)rinfo->prefix;
842 	else {
843 		/* ipv6_addr_prefix() is safe: it copies only prefix_len bits and zeroes the rest */
844 		ipv6_addr_prefix(&prefix_buf,
845 				 (struct in6_addr *)rinfo->prefix,
846 				 rinfo->prefix_len);
847 		prefix = &prefix_buf;
848 	}
849 
850 	if (rinfo->prefix_len == 0)
851 		rt = rt6_get_dflt_router(gwaddr, dev);
852 	else
853 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
854 					gwaddr, dev);
855 
856 	if (rt && !lifetime) {
857 		ip6_del_rt(rt);
858 		rt = NULL;
859 	}
860 
861 	if (!rt && lifetime)
862 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
863 					dev, pref);
864 	else if (rt)
865 		rt->rt6i_flags = RTF_ROUTEINFO |
866 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
867 
868 	if (rt) {
869 		if (!addrconf_finite_timeout(lifetime))
870 			rt6_clean_expires(rt);
871 		else
872 			rt6_set_expires(rt, jiffies + HZ * lifetime);
873 
874 		ip6_rt_put(rt);
875 	}
876 	return 0;
877 }
878 #endif
879 
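/* Walk back up the tree from fn until a node carrying route information
 * (RTN_RTINFO) is found, descending into source-address subtrees along
 * the way; returns NULL once the tree root is reached.
 */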
880 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
881 					struct in6_addr *saddr)
882 {
883 	struct fib6_node *pn, *sn;
884 	while (1) {
885 		if (fn->fn_flags & RTN_TL_ROOT)
886 			return NULL;
887 		pn = rcu_dereference(fn->parent);
888 		sn = FIB6_SUBTREE(pn);
889 		if (sn && sn != fn)
890 			fn = fib6_lookup(sn, NULL, saddr);
891 		else
892 			fn = pn;
893 		if (fn->fn_flags & RTN_RTINFO)
894 			return fn;
895 	}
896 }
897 
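/* Try to take a reference on *prt; if the dst is already on its way to
 * being released, optionally fall back to the per-netns null entry so the
 * caller still ends up with a valid, held route.
 */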
898 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
899 			  bool null_fallback)
900 {
901 	struct rt6_info *rt = *prt;
902 
903 	if (dst_hold_safe(&rt->dst))
904 		return true;
905 	if (null_fallback) {
906 		rt = net->ipv6.ip6_null_entry;
907 		dst_hold(&rt->dst);
908 	} else {
909 		rt = NULL;
910 	}
911 	*prt = rt;
912 	return false;
913 }
914 
915 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
916 					     struct fib6_table *table,
917 					     struct flowi6 *fl6, int flags)
918 {
919 	struct rt6_info *rt, *rt_cache;
920 	struct fib6_node *fn;
921 
922 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
923 		flags &= ~RT6_LOOKUP_F_IFACE;
924 
925 	rcu_read_lock();
926 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
927 restart:
928 	rt = rcu_dereference(fn->leaf);
929 	if (!rt) {
930 		rt = net->ipv6.ip6_null_entry;
931 	} else {
932 		rt = rt6_device_match(net, rt, &fl6->saddr,
933 				      fl6->flowi6_oif, flags);
934 		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
935 			rt = rt6_multipath_select(rt, fl6,
936 						  fl6->flowi6_oif, flags);
937 	}
938 	if (rt == net->ipv6.ip6_null_entry) {
939 		fn = fib6_backtrack(fn, &fl6->saddr);
940 		if (fn)
941 			goto restart;
942 	}
943 	/* Search through exception table */
944 	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
945 	if (rt_cache)
946 		rt = rt_cache;
947 
948 	if (ip6_hold_safe(net, &rt, true))
949 		dst_use_noref(&rt->dst, jiffies);
950 
951 	rcu_read_unlock();
952 
953 	trace_fib6_table_lookup(net, rt, table, fl6);
954 
955 	return rt;
956 
957 }
958 
959 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
960 				    int flags)
961 {
962 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
963 }
964 EXPORT_SYMBOL_GPL(ip6_route_lookup);
965 
966 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
967 			    const struct in6_addr *saddr, int oif, int strict)
968 {
969 	struct flowi6 fl6 = {
970 		.flowi6_oif = oif,
971 		.daddr = *daddr,
972 	};
973 	struct dst_entry *dst;
974 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
975 
976 	if (saddr) {
977 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
978 		flags |= RT6_LOOKUP_F_HAS_SADDR;
979 	}
980 
981 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
982 	if (dst->error == 0)
983 		return (struct rt6_info *) dst;
984 
985 	dst_release(dst);
986 
987 	return NULL;
988 }
989 EXPORT_SYMBOL(rt6_lookup);
990 
991 /* ip6_ins_rt is called with table->tb6_lock free (not held).
992  * It takes a new route entry; if the addition fails for any reason,
993  * the route is released.
994  * The caller must hold dst before calling it.
995  */
996 
997 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
998 			struct mx6_config *mxc,
999 			struct netlink_ext_ack *extack)
1000 {
1001 	int err;
1002 	struct fib6_table *table;
1003 
1004 	table = rt->rt6i_table;
1005 	spin_lock_bh(&table->tb6_lock);
1006 	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1007 	spin_unlock_bh(&table->tb6_lock);
1008 
1009 	return err;
1010 }
1011 
1012 int ip6_ins_rt(struct rt6_info *rt)
1013 {
1014 	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
1015 	struct mx6_config mxc = { .mx = NULL, };
1016 
1017 	/* Hold dst to account for the reference from the fib6 tree */
1018 	dst_hold(&rt->dst);
1019 	return __ip6_ins_rt(rt, &info, &mxc, NULL);
1020 }
1021 
1022 /* called with rcu_read_lock() held */
1023 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1024 {
1025 	struct net_device *dev = rt->dst.dev;
1026 
1027 	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1028 		/* for copies of local routes, dst->dev needs to be the
1029 		 * device itself if it is a master device, the master device
1030 		 * if the device is enslaved, and the loopback device by default
1031 		 */
1032 		if (netif_is_l3_slave(dev) &&
1033 		    !rt6_need_strict(&rt->rt6i_dst.addr))
1034 			dev = l3mdev_master_dev_rcu(dev);
1035 		else if (!netif_is_l3_master(dev))
1036 			dev = dev_net(dev)->loopback_dev;
1037 		/* the remaining case is netif_is_l3_master(dev) being true,
1038 		 * in which case dev itself is what we want returned
1039 		 */
1040 	}
1041 
1042 	return dev;
1043 }
1044 
1045 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1046 					   const struct in6_addr *daddr,
1047 					   const struct in6_addr *saddr)
1048 {
1049 	struct net_device *dev;
1050 	struct rt6_info *rt;
1051 
1052 	/*
1053 	 *	Clone the route.
1054 	 */
1055 
1056 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1057 		ort = ort->from;
1058 
1059 	rcu_read_lock();
1060 	dev = ip6_rt_get_dev_rcu(ort);
1061 	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1062 	rcu_read_unlock();
1063 	if (!rt)
1064 		return NULL;
1065 
1066 	ip6_rt_copy_init(rt, ort);
1067 	rt->rt6i_flags |= RTF_CACHE;
1068 	rt->rt6i_metric = 0;
1069 	rt->dst.flags |= DST_HOST;
1070 	rt->rt6i_dst.addr = *daddr;
1071 	rt->rt6i_dst.plen = 128;
1072 
1073 	if (!rt6_is_gw_or_nonexthop(ort)) {
1074 		if (ort->rt6i_dst.plen != 128 &&
1075 		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1076 			rt->rt6i_flags |= RTF_ANYCAST;
1077 #ifdef CONFIG_IPV6_SUBTREES
1078 		if (rt->rt6i_src.plen && saddr) {
1079 			rt->rt6i_src.addr = *saddr;
1080 			rt->rt6i_src.plen = 128;
1081 		}
1082 #endif
1083 	}
1084 
1085 	return rt;
1086 }
1087 
1088 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1089 {
1090 	struct net_device *dev;
1091 	struct rt6_info *pcpu_rt;
1092 
1093 	rcu_read_lock();
1094 	dev = ip6_rt_get_dev_rcu(rt);
1095 	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1096 	rcu_read_unlock();
1097 	if (!pcpu_rt)
1098 		return NULL;
1099 	ip6_rt_copy_init(pcpu_rt, rt);
1100 	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1101 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1102 	return pcpu_rt;
1103 }
1104 
1105 /* It should be called with rcu_read_lock() acquired */
1106 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1107 {
1108 	struct rt6_info *pcpu_rt, **p;
1109 
1110 	p = this_cpu_ptr(rt->rt6i_pcpu);
1111 	pcpu_rt = *p;
1112 
1113 	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1114 		rt6_dst_from_metrics_check(pcpu_rt);
1115 
1116 	return pcpu_rt;
1117 }
1118 
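/* Allocate and publish this CPU's pcpu copy of rt.  The slot is expected
 * to be empty (the caller found no copy and runs with BHs disabled), so a
 * non-NULL cmpxchg() result would be a bug.
 */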
1119 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1120 {
1121 	struct rt6_info *pcpu_rt, *prev, **p;
1122 
1123 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1124 	if (!pcpu_rt) {
1125 		struct net *net = dev_net(rt->dst.dev);
1126 
1127 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1128 		return net->ipv6.ip6_null_entry;
1129 	}
1130 
1131 	dst_hold(&pcpu_rt->dst);
1132 	p = this_cpu_ptr(rt->rt6i_pcpu);
1133 	prev = cmpxchg(p, NULL, pcpu_rt);
1134 	BUG_ON(prev);
1135 
1136 	rt6_dst_from_metrics_check(pcpu_rt);
1137 	return pcpu_rt;
1138 }
1139 
1140 /* Exception (RTF_CACHE clone) hash table implementation.
1141  */
1142 static DEFINE_SPINLOCK(rt6_exception_lock);
1143 
1144 /* Remove rt6_ex from hash table and free the memory
1145  * Caller must hold rt6_exception_lock
1146  */
1147 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1148 				 struct rt6_exception *rt6_ex)
1149 {
1150 	struct net *net;
1151 
1152 	if (!bucket || !rt6_ex)
1153 		return;
1154 
1155 	net = dev_net(rt6_ex->rt6i->dst.dev);
1156 	rt6_ex->rt6i->rt6i_node = NULL;
1157 	hlist_del_rcu(&rt6_ex->hlist);
1158 	rt6_release(rt6_ex->rt6i);
1159 	kfree_rcu(rt6_ex, rcu);
1160 	WARN_ON_ONCE(!bucket->depth);
1161 	bucket->depth--;
1162 	net->ipv6.rt6_stats->fib_rt_cache--;
1163 }
1164 
1165 /* Remove oldest rt6_ex in bucket and free the memory
1166  * Caller must hold rt6_exception_lock
1167  */
1168 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1169 {
1170 	struct rt6_exception *rt6_ex, *oldest = NULL;
1171 
1172 	if (!bucket)
1173 		return;
1174 
1175 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1176 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1177 			oldest = rt6_ex;
1178 	}
1179 	rt6_remove_exception(bucket, oldest);
1180 }
1181 
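/* Hash a (dst, src) pair into one of the FIB6_EXCEPTION_BUCKET_SIZE
 * buckets, seeding jhash once with random bytes so bucket placement is
 * not predictable across boots.
 */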
1182 static u32 rt6_exception_hash(const struct in6_addr *dst,
1183 			      const struct in6_addr *src)
1184 {
1185 	static u32 seed __read_mostly;
1186 	u32 val;
1187 
1188 	net_get_random_once(&seed, sizeof(seed));
1189 	val = jhash(dst, sizeof(*dst), seed);
1190 
1191 #ifdef CONFIG_IPV6_SUBTREES
1192 	if (src)
1193 		val = jhash(src, sizeof(*src), val);
1194 #endif
1195 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1196 }
1197 
1198 /* Helper function to find the cached rt in the hash table
1199  * and update bucket pointer to point to the bucket for this
1200  * (daddr, saddr) pair
1201  * Caller must hold rt6_exception_lock
1202  */
1203 static struct rt6_exception *
1204 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1205 			      const struct in6_addr *daddr,
1206 			      const struct in6_addr *saddr)
1207 {
1208 	struct rt6_exception *rt6_ex;
1209 	u32 hval;
1210 
1211 	if (!(*bucket) || !daddr)
1212 		return NULL;
1213 
1214 	hval = rt6_exception_hash(daddr, saddr);
1215 	*bucket += hval;
1216 
1217 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1218 		struct rt6_info *rt6 = rt6_ex->rt6i;
1219 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1220 
1221 #ifdef CONFIG_IPV6_SUBTREES
1222 		if (matched && saddr)
1223 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1224 #endif
1225 		if (matched)
1226 			return rt6_ex;
1227 	}
1228 	return NULL;
1229 }
1230 
1231 /* Helper function to find the cached rt in the hash table
1232  * and update bucket pointer to point to the bucket for this
1233  * (daddr, saddr) pair
1234  * Caller must hold rcu_read_lock()
1235  */
1236 static struct rt6_exception *
1237 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1238 			 const struct in6_addr *daddr,
1239 			 const struct in6_addr *saddr)
1240 {
1241 	struct rt6_exception *rt6_ex;
1242 	u32 hval;
1243 
1244 	WARN_ON_ONCE(!rcu_read_lock_held());
1245 
1246 	if (!(*bucket) || !daddr)
1247 		return NULL;
1248 
1249 	hval = rt6_exception_hash(daddr, saddr);
1250 	*bucket += hval;
1251 
1252 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1253 		struct rt6_info *rt6 = rt6_ex->rt6i;
1254 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1255 
1256 #ifdef CONFIG_IPV6_SUBTREES
1257 		if (matched && saddr)
1258 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1259 #endif
1260 		if (matched)
1261 			return rt6_ex;
1262 	}
1263 	return NULL;
1264 }
1265 
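/* Insert nrt as a cached exception (pmtu/redirect clone) of ort, replacing
 * any previous entry for the same (daddr, saddr) key and evicting the
 * oldest entry once a bucket grows beyond FIB6_MAX_DEPTH.
 */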
1266 static int rt6_insert_exception(struct rt6_info *nrt,
1267 				struct rt6_info *ort)
1268 {
1269 	struct net *net = dev_net(ort->dst.dev);
1270 	struct rt6_exception_bucket *bucket;
1271 	struct in6_addr *src_key = NULL;
1272 	struct rt6_exception *rt6_ex;
1273 	int err = 0;
1274 
1275 	/* ort can't be a cache or pcpu route */
1276 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1277 		ort = ort->from;
1278 	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1279 
1280 	spin_lock_bh(&rt6_exception_lock);
1281 
1282 	if (ort->exception_bucket_flushed) {
1283 		err = -EINVAL;
1284 		goto out;
1285 	}
1286 
1287 	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1288 					lockdep_is_held(&rt6_exception_lock));
1289 	if (!bucket) {
1290 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1291 				 GFP_ATOMIC);
1292 		if (!bucket) {
1293 			err = -ENOMEM;
1294 			goto out;
1295 		}
1296 		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1297 	}
1298 
1299 #ifdef CONFIG_IPV6_SUBTREES
1300 	/* rt6i_src.plen != 0 indicates ort is in subtree
1301 	 * and exception table is indexed by a hash of
1302 	 * both rt6i_dst and rt6i_src.
1303 	 * Otherwise, the exception table is indexed by
1304 	 * a hash of only rt6i_dst.
1305 	 */
1306 	if (ort->rt6i_src.plen)
1307 		src_key = &nrt->rt6i_src.addr;
1308 #endif
1309 
1310 	/* Update rt6i_prefsrc as it could be changed
1311 	 * in rt6_remove_prefsrc()
1312 	 */
1313 	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1314 	/* rt6_mtu_change() might lower mtu on ort.
1315 	 * Only insert this exception route if its mtu
1316 	 * is less than ort's mtu value.
1317 	 */
1318 	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1319 		err = -EINVAL;
1320 		goto out;
1321 	}
1322 
1323 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1324 					       src_key);
1325 	if (rt6_ex)
1326 		rt6_remove_exception(bucket, rt6_ex);
1327 
1328 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1329 	if (!rt6_ex) {
1330 		err = -ENOMEM;
1331 		goto out;
1332 	}
1333 	rt6_ex->rt6i = nrt;
1334 	rt6_ex->stamp = jiffies;
1335 	atomic_inc(&nrt->rt6i_ref);
1336 	nrt->rt6i_node = ort->rt6i_node;
1337 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1338 	bucket->depth++;
1339 	net->ipv6.rt6_stats->fib_rt_cache++;
1340 
1341 	if (bucket->depth > FIB6_MAX_DEPTH)
1342 		rt6_exception_remove_oldest(bucket);
1343 
1344 out:
1345 	spin_unlock_bh(&rt6_exception_lock);
1346 
1347 	/* Update fn->fn_sernum to invalidate all cached dst */
1348 	if (!err) {
1349 		spin_lock_bh(&ort->rt6i_table->tb6_lock);
1350 		fib6_update_sernum(ort);
1351 		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
1352 		fib6_force_start_gc(net);
1353 	}
1354 
1355 	return err;
1356 }
1357 
1358 void rt6_flush_exceptions(struct rt6_info *rt)
1359 {
1360 	struct rt6_exception_bucket *bucket;
1361 	struct rt6_exception *rt6_ex;
1362 	struct hlist_node *tmp;
1363 	int i;
1364 
1365 	spin_lock_bh(&rt6_exception_lock);
1366 	/* Prevent rt6_insert_exception() from recreating the bucket list */
1367 	rt->exception_bucket_flushed = 1;
1368 
1369 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1370 				    lockdep_is_held(&rt6_exception_lock));
1371 	if (!bucket)
1372 		goto out;
1373 
1374 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1375 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1376 			rt6_remove_exception(bucket, rt6_ex);
1377 		WARN_ON_ONCE(bucket->depth);
1378 		bucket++;
1379 	}
1380 
1381 out:
1382 	spin_unlock_bh(&rt6_exception_lock);
1383 }
1384 
1385 /* Find the cached rt in the hash table inside the passed-in rt
1386  * Caller has to hold rcu_read_lock()
1387  */
1388 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1389 					   struct in6_addr *daddr,
1390 					   struct in6_addr *saddr)
1391 {
1392 	struct rt6_exception_bucket *bucket;
1393 	struct in6_addr *src_key = NULL;
1394 	struct rt6_exception *rt6_ex;
1395 	struct rt6_info *res = NULL;
1396 
1397 	bucket = rcu_dereference(rt->rt6i_exception_bucket);
1398 
1399 #ifdef CONFIG_IPV6_SUBTREES
1400 	/* rt6i_src.plen != 0 indicates rt is in subtree
1401 	 * and exception table is indexed by a hash of
1402 	 * both rt6i_dst and rt6i_src.
1403 	 * Otherwise, the exception table is indexed by
1404 	 * a hash of only rt6i_dst.
1405 	 */
1406 	if (rt->rt6i_src.plen)
1407 		src_key = saddr;
1408 #endif
1409 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1410 
1411 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1412 		res = rt6_ex->rt6i;
1413 
1414 	return res;
1415 }
1416 
1417 /* Remove the passed in cached rt from the hash table that contains it */
1418 int rt6_remove_exception_rt(struct rt6_info *rt)
1419 {
1420 	struct rt6_exception_bucket *bucket;
1421 	struct rt6_info *from = rt->from;
1422 	struct in6_addr *src_key = NULL;
1423 	struct rt6_exception *rt6_ex;
1424 	int err;
1425 
1426 	if (!from ||
1427 	    !(rt->rt6i_flags & RTF_CACHE))
1428 		return -EINVAL;
1429 
1430 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1431 		return -ENOENT;
1432 
1433 	spin_lock_bh(&rt6_exception_lock);
1434 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1435 				    lockdep_is_held(&rt6_exception_lock));
1436 #ifdef CONFIG_IPV6_SUBTREES
1437 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1438 	 * and exception table is indexed by a hash of
1439 	 * both rt6i_dst and rt6i_src.
1440 	 * Otherwise, the exception table is indexed by
1441 	 * a hash of only rt6i_dst.
1442 	 */
1443 	if (from->rt6i_src.plen)
1444 		src_key = &rt->rt6i_src.addr;
1445 #endif
1446 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1447 					       &rt->rt6i_dst.addr,
1448 					       src_key);
1449 	if (rt6_ex) {
1450 		rt6_remove_exception(bucket, rt6_ex);
1451 		err = 0;
1452 	} else {
1453 		err = -ENOENT;
1454 	}
1455 
1456 	spin_unlock_bh(&rt6_exception_lock);
1457 	return err;
1458 }
1459 
1460 /* Find rt6_ex which contains the passed in rt cache and
1461  * refresh its stamp
1462  */
1463 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1464 {
1465 	struct rt6_exception_bucket *bucket;
1466 	struct rt6_info *from = rt->from;
1467 	struct in6_addr *src_key = NULL;
1468 	struct rt6_exception *rt6_ex;
1469 
1470 	if (!from ||
1471 	    !(rt->rt6i_flags & RTF_CACHE))
1472 		return;
1473 
1474 	rcu_read_lock();
1475 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1476 
1477 #ifdef CONFIG_IPV6_SUBTREES
1478 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1479 	 * and exception table is indexed by a hash of
1480 	 * both rt6i_dst and rt6i_src.
1481 	 * Otherwise, the exception table is indexed by
1482 	 * a hash of only rt6i_dst.
1483 	 */
1484 	if (from->rt6i_src.plen)
1485 		src_key = &rt->rt6i_src.addr;
1486 #endif
1487 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1488 					  &rt->rt6i_dst.addr,
1489 					  src_key);
1490 	if (rt6_ex)
1491 		rt6_ex->stamp = jiffies;
1492 
1493 	rcu_read_unlock();
1494 }
1495 
1496 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1497 {
1498 	struct rt6_exception_bucket *bucket;
1499 	struct rt6_exception *rt6_ex;
1500 	int i;
1501 
1502 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1503 					lockdep_is_held(&rt6_exception_lock));
1504 
1505 	if (bucket) {
1506 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1507 			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1508 				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1509 			}
1510 			bucket++;
1511 		}
1512 	}
1513 }
1514 
1515 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1516 					 struct rt6_info *rt, int mtu)
1517 {
1518 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1519 	 * lowest MTU in the path: always allow updating the route PMTU to
1520 	 * reflect PMTU decreases.
1521 	 *
1522 	 * If the new MTU is higher, and the route PMTU is equal to the local
1523 	 * MTU, this means the old MTU is the lowest in the path, so allow
1524 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1525 	 * handle this.
1526 	 */
1527 
1528 	if (dst_mtu(&rt->dst) >= mtu)
1529 		return true;
1530 
1531 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1532 		return true;
1533 
1534 	return false;
1535 }
1536 
1537 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1538 				       struct rt6_info *rt, int mtu)
1539 {
1540 	struct rt6_exception_bucket *bucket;
1541 	struct rt6_exception *rt6_ex;
1542 	int i;
1543 
1544 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1545 					lockdep_is_held(&rt6_exception_lock));
1546 
1547 	if (!bucket)
1548 		return;
1549 
1550 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1551 		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1552 			struct rt6_info *entry = rt6_ex->rt6i;
1553 
1554 			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1555 			 * route), the metrics of its rt->dst.from have already
1556 			 * been updated.
1557 			 */
1558 			if (entry->rt6i_pmtu &&
1559 			    rt6_mtu_change_route_allowed(idev, entry, mtu))
1560 				entry->rt6i_pmtu = mtu;
1561 		}
1562 		bucket++;
1563 	}
1564 }
1565 
1566 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1567 
1568 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1569 					struct in6_addr *gateway)
1570 {
1571 	struct rt6_exception_bucket *bucket;
1572 	struct rt6_exception *rt6_ex;
1573 	struct hlist_node *tmp;
1574 	int i;
1575 
1576 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1577 		return;
1578 
1579 	spin_lock_bh(&rt6_exception_lock);
1580 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1581 				     lockdep_is_held(&rt6_exception_lock));
1582 
1583 	if (bucket) {
1584 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1585 			hlist_for_each_entry_safe(rt6_ex, tmp,
1586 						  &bucket->chain, hlist) {
1587 				struct rt6_info *entry = rt6_ex->rt6i;
1588 
1589 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1590 				    RTF_CACHE_GATEWAY &&
1591 				    ipv6_addr_equal(gateway,
1592 						    &entry->rt6i_gateway)) {
1593 					rt6_remove_exception(bucket, rt6_ex);
1594 				}
1595 			}
1596 			bucket++;
1597 		}
1598 	}
1599 
1600 	spin_unlock_bh(&rt6_exception_lock);
1601 }
1602 
1603 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1604 				      struct rt6_exception *rt6_ex,
1605 				      struct fib6_gc_args *gc_args,
1606 				      unsigned long now)
1607 {
1608 	struct rt6_info *rt = rt6_ex->rt6i;
1609 
1610 	/* We are pruning and obsoleting aged-out and non-gateway exceptions
1611 	 * even if others still hold references to them, so that on the next
1612 	 * dst_check() such references can be dropped.
1613 	 * RTF_EXPIRES exceptions - e.g. PMTU-generated ones - are pruned once
1614 	 * expired, independently of their age, as per RFC 8201 section 4.
1615 	 */
1616 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1617 		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1618 			RT6_TRACE("aging clone %p\n", rt);
1619 			rt6_remove_exception(bucket, rt6_ex);
1620 			return;
1621 		}
1622 	} else if (time_after(jiffies, rt->dst.expires)) {
1623 		RT6_TRACE("purging expired route %p\n", rt);
1624 		rt6_remove_exception(bucket, rt6_ex);
1625 		return;
1626 	}
1627 
1628 	if (rt->rt6i_flags & RTF_GATEWAY) {
1629 		struct neighbour *neigh;
1630 		__u8 neigh_flags = 0;
1631 
1632 		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1633 		if (neigh)
1634 			neigh_flags = neigh->flags;
1635 
1636 		if (!(neigh_flags & NTF_ROUTER)) {
1637 			RT6_TRACE("purging route %p via non-router but gateway\n",
1638 				  rt);
1639 			rt6_remove_exception(bucket, rt6_ex);
1640 			return;
1641 		}
1642 	}
1643 
1644 	gc_args->more++;
1645 }
1646 
1647 void rt6_age_exceptions(struct rt6_info *rt,
1648 			struct fib6_gc_args *gc_args,
1649 			unsigned long now)
1650 {
1651 	struct rt6_exception_bucket *bucket;
1652 	struct rt6_exception *rt6_ex;
1653 	struct hlist_node *tmp;
1654 	int i;
1655 
1656 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1657 		return;
1658 
1659 	rcu_read_lock_bh();
1660 	spin_lock(&rt6_exception_lock);
1661 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1662 				    lockdep_is_held(&rt6_exception_lock));
1663 
1664 	if (bucket) {
1665 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1666 			hlist_for_each_entry_safe(rt6_ex, tmp,
1667 						  &bucket->chain, hlist) {
1668 				rt6_age_examine_exception(bucket, rt6_ex,
1669 							  gc_args, now);
1670 			}
1671 			bucket++;
1672 		}
1673 	}
1674 	spin_unlock(&rt6_exception_lock);
1675 	rcu_read_unlock_bh();
1676 }
1677 
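/* Core policy-routing lookup: walk the table under RCU, retrying without
 * the reachability requirement if nothing reachable matched, then return a
 * cached exception, an uncached RTF_CACHE clone or a per-cpu copy of the
 * winning fib6 entry.
 */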
1678 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1679 			       int oif, struct flowi6 *fl6, int flags)
1680 {
1681 	struct fib6_node *fn, *saved_fn;
1682 	struct rt6_info *rt, *rt_cache;
1683 	int strict = 0;
1684 
1685 	strict |= flags & RT6_LOOKUP_F_IFACE;
1686 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1687 	if (net->ipv6.devconf_all->forwarding == 0)
1688 		strict |= RT6_LOOKUP_F_REACHABLE;
1689 
1690 	rcu_read_lock();
1691 
1692 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1693 	saved_fn = fn;
1694 
1695 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1696 		oif = 0;
1697 
1698 redo_rt6_select:
1699 	rt = rt6_select(net, fn, oif, strict);
1700 	if (rt->rt6i_nsiblings)
1701 		rt = rt6_multipath_select(rt, fl6, oif, strict);
1702 	if (rt == net->ipv6.ip6_null_entry) {
1703 		fn = fib6_backtrack(fn, &fl6->saddr);
1704 		if (fn)
1705 			goto redo_rt6_select;
1706 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1707 			/* also consider unreachable route */
1708 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1709 			fn = saved_fn;
1710 			goto redo_rt6_select;
1711 		}
1712 	}
1713 
1714 	/* Search through exception table */
1715 	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1716 	if (rt_cache)
1717 		rt = rt_cache;
1718 
1719 	if (rt == net->ipv6.ip6_null_entry) {
1720 		rcu_read_unlock();
1721 		dst_hold(&rt->dst);
1722 		trace_fib6_table_lookup(net, rt, table, fl6);
1723 		return rt;
1724 	} else if (rt->rt6i_flags & RTF_CACHE) {
1725 		if (ip6_hold_safe(net, &rt, true)) {
1726 			dst_use_noref(&rt->dst, jiffies);
1727 			rt6_dst_from_metrics_check(rt);
1728 		}
1729 		rcu_read_unlock();
1730 		trace_fib6_table_lookup(net, rt, table, fl6);
1731 		return rt;
1732 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1733 			    !(rt->rt6i_flags & RTF_GATEWAY))) {
1734 		/* Create an RTF_CACHE clone which will not be
1735 		 * owned by the fib6 tree.  It is for the special case where
1736 		 * the daddr in the skb during the neighbor look-up is different
1737 		 * from the fl6->daddr used to look up the route here.
1738 		 */
1739 
1740 		struct rt6_info *uncached_rt;
1741 
1742 		if (ip6_hold_safe(net, &rt, true)) {
1743 			dst_use_noref(&rt->dst, jiffies);
1744 		} else {
1745 			rcu_read_unlock();
1746 			uncached_rt = rt;
1747 			goto uncached_rt_out;
1748 		}
1749 		rcu_read_unlock();
1750 
1751 		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1752 		dst_release(&rt->dst);
1753 
1754 		if (uncached_rt) {
1755 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1756 			 * No need for another dst_hold()
1757 			 */
1758 			rt6_uncached_list_add(uncached_rt);
1759 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1760 		} else {
1761 			uncached_rt = net->ipv6.ip6_null_entry;
1762 			dst_hold(&uncached_rt->dst);
1763 		}
1764 
1765 uncached_rt_out:
1766 		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1767 		return uncached_rt;
1768 
1769 	} else {
1770 		/* Get a percpu copy */
1771 
1772 		struct rt6_info *pcpu_rt;
1773 
1774 		dst_use_noref(&rt->dst, jiffies);
1775 		local_bh_disable();
1776 		pcpu_rt = rt6_get_pcpu_route(rt);
1777 
1778 		if (!pcpu_rt) {
1779 			/* atomic_inc_not_zero() is needed when using rcu */
1780 			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1781 				/* No dst_hold() on rt is needed because grabbing
1782 				 * rt->rt6i_ref makes sure rt can't be released.
1783 				 */
1784 				pcpu_rt = rt6_make_pcpu_route(rt);
1785 				rt6_release(rt);
1786 			} else {
1787 				/* rt is already removed from tree */
1788 				pcpu_rt = net->ipv6.ip6_null_entry;
1789 				dst_hold(&pcpu_rt->dst);
1790 			}
1791 		}
1792 		local_bh_enable();
1793 		rcu_read_unlock();
1794 		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1795 		return pcpu_rt;
1796 	}
1797 }
1798 EXPORT_SYMBOL_GPL(ip6_pol_route);
1799 
1800 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1801 					    struct flowi6 *fl6, int flags)
1802 {
1803 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1804 }
1805 
1806 struct dst_entry *ip6_route_input_lookup(struct net *net,
1807 					 struct net_device *dev,
1808 					 struct flowi6 *fl6, int flags)
1809 {
1810 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1811 		flags |= RT6_LOOKUP_F_IFACE;
1812 
1813 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1814 }
1815 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1816 
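/* Extract the L3 keys used for multipath hashing.  For ICMPv6 errors the
 * embedded (inner) header is hashed instead, so the error travels the same
 * path as the flow that triggered it.
 */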
1817 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1818 				  struct flow_keys *keys)
1819 {
1820 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1821 	const struct ipv6hdr *key_iph = outer_iph;
1822 	const struct ipv6hdr *inner_iph;
1823 	const struct icmp6hdr *icmph;
1824 	struct ipv6hdr _inner_iph;
1825 
1826 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1827 		goto out;
1828 
1829 	icmph = icmp6_hdr(skb);
1830 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1831 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1832 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1833 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1834 		goto out;
1835 
1836 	inner_iph = skb_header_pointer(skb,
1837 				       skb_transport_offset(skb) + sizeof(*icmph),
1838 				       sizeof(_inner_iph), &_inner_iph);
1839 	if (!inner_iph)
1840 		goto out;
1841 
1842 	key_iph = inner_iph;
1843 out:
1844 	memset(keys, 0, sizeof(*keys));
1845 	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1846 	keys->addrs.v6addrs.src = key_iph->saddr;
1847 	keys->addrs.v6addrs.dst = key_iph->daddr;
1848 	keys->tags.flow_label = ip6_flowinfo(key_iph);
1849 	keys->basic.ip_proto = key_iph->nexthdr;
1850 }
1851 
1852 /* If skb is set it will be used and fl6 can be NULL. */
1853 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1854 {
1855 	struct flow_keys hash_keys;
1856 
1857 	if (skb) {
1858 		ip6_multipath_l3_keys(skb, &hash_keys);
1859 		return flow_hash_from_keys(&hash_keys) >> 1;
1860 	}
1861 
1862 	return get_hash_from_flowi6(fl6) >> 1;
1863 }
1864 
1865 void ip6_route_input(struct sk_buff *skb)
1866 {
1867 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1868 	struct net *net = dev_net(skb->dev);
1869 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1870 	struct ip_tunnel_info *tun_info;
1871 	struct flowi6 fl6 = {
1872 		.flowi6_iif = skb->dev->ifindex,
1873 		.daddr = iph->daddr,
1874 		.saddr = iph->saddr,
1875 		.flowlabel = ip6_flowinfo(iph),
1876 		.flowi6_mark = skb->mark,
1877 		.flowi6_proto = iph->nexthdr,
1878 	};
1879 
1880 	tun_info = skb_tunnel_info(skb);
1881 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1882 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1883 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1884 		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1885 	skb_dst_drop(skb);
1886 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1887 }
1888 
1889 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1890 					     struct flowi6 *fl6, int flags)
1891 {
1892 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1893 }
1894 
1895 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1896 					 struct flowi6 *fl6, int flags)
1897 {
1898 	bool any_src;
1899 
1900 	if (rt6_need_strict(&fl6->daddr)) {
1901 		struct dst_entry *dst;
1902 
1903 		dst = l3mdev_link_scope_lookup(net, fl6);
1904 		if (dst)
1905 			return dst;
1906 	}
1907 
1908 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1909 
1910 	any_src = ipv6_addr_any(&fl6->saddr);
1911 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1912 	    (fl6->flowi6_oif && any_src))
1913 		flags |= RT6_LOOKUP_F_IFACE;
1914 
1915 	if (!any_src)
1916 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1917 	else if (sk)
1918 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1919 
1920 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1921 }
1922 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1923 
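/* Clone an existing dst into a blackhole entry: the copy keeps the
 * original's metrics, gateway and routing keys, but discards every
 * packet sent through it. Callers (e.g. the xfrm code) use this to
 * keep a valid dst attached while traffic must be dropped.
 */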
1924 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1925 {
1926 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1927 	struct net_device *loopback_dev = net->loopback_dev;
1928 	struct dst_entry *new = NULL;
1929 
1930 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1931 		       DST_OBSOLETE_DEAD, 0);
1932 	if (rt) {
1933 		rt6_info_init(rt);
1934 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1935 
1936 		new = &rt->dst;
1937 		new->__use = 1;
1938 		new->input = dst_discard;
1939 		new->output = dst_discard_out;
1940 
1941 		dst_copy_metrics(new, &ort->dst);
1942 
1943 		rt->rt6i_idev = in6_dev_get(loopback_dev);
1944 		rt->rt6i_gateway = ort->rt6i_gateway;
1945 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1946 		rt->rt6i_metric = 0;
1947 
1948 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1949 #ifdef CONFIG_IPV6_SUBTREES
1950 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1951 #endif
1952 	}
1953 
1954 	dst_release(dst_orig);
1955 	return new ? new : ERR_PTR(-ENOMEM);
1956 }
1957 
1958 /*
1959  *	Destination cache support functions
1960  */
1961 
1962 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1963 {
1964 	if (rt->from &&
1965 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
1966 		dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
1967 }
1968 
1969 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1970 {
1971 	u32 rt_cookie = 0;
1972 
1973 	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1974 		return NULL;
1975 
1976 	if (rt6_check_expired(rt))
1977 		return NULL;
1978 
1979 	return &rt->dst;
1980 }
1981 
1982 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1983 {
1984 	if (!__rt6_check_expired(rt) &&
1985 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1986 	    rt6_check(rt->from, cookie))
1987 		return &rt->dst;
1988 	else
1989 		return NULL;
1990 }
1991 
1992 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1993 {
1994 	struct rt6_info *rt;
1995 
1996 	rt = (struct rt6_info *) dst;
1997 
1998 	/* All IPv6 dsts are created with ->obsolete set to
1999 	 * DST_OBSOLETE_FORCE_CHK, which forces validation calls down
2000 	 * into this function in all cases.
2001 	 */
2002 
2003 	rt6_dst_from_metrics_check(rt);
2004 
2005 	if (rt->rt6i_flags & RTF_PCPU ||
2006 	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2007 		return rt6_dst_from_check(rt, cookie);
2008 	else
2009 		return rt6_check(rt, cookie);
2010 }
2011 
2012 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2013 {
2014 	struct rt6_info *rt = (struct rt6_info *) dst;
2015 
2016 	if (rt) {
2017 		if (rt->rt6i_flags & RTF_CACHE) {
2018 			if (rt6_check_expired(rt)) {
2019 				ip6_del_rt(rt);
2020 				dst = NULL;
2021 			}
2022 		} else {
2023 			dst_release(dst);
2024 			dst = NULL;
2025 		}
2026 	}
2027 	return dst;
2028 }
2029 
2030 static void ip6_link_failure(struct sk_buff *skb)
2031 {
2032 	struct rt6_info *rt;
2033 
2034 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2035 
2036 	rt = (struct rt6_info *) skb_dst(skb);
2037 	if (rt) {
2038 		if (rt->rt6i_flags & RTF_CACHE) {
2039 			if (dst_hold_safe(&rt->dst))
2040 				ip6_del_rt(rt);
2041 		} else {
2042 			struct fib6_node *fn;
2043 
2044 			rcu_read_lock();
2045 			fn = rcu_dereference(rt->rt6i_node);
2046 			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2047 				fn->fn_sernum = -1;
2048 			rcu_read_unlock();
2049 		}
2050 	}
2051 }
2052 
2053 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2054 {
2055 	struct net *net = dev_net(rt->dst.dev);
2056 
2057 	rt->rt6i_flags |= RTF_MODIFIED;
2058 	rt->rt6i_pmtu = mtu;
2059 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2060 }
2061 
2062 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2063 {
2064 	return !(rt->rt6i_flags & RTF_CACHE) &&
2065 		(rt->rt6i_flags & RTF_PCPU ||
2066 		 rcu_access_pointer(rt->rt6i_node));
2067 }
2068 
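/* Core PMTU update. Routes that are already cache entries (or are
 * neither pcpu copies nor linked into the tree) are updated in place
 * via rt6_do_update_pmtu(). For tree routes and pcpu copies a new
 * cache route is allocated and inserted into the exception table, so
 * the parent route is left untouched.
 */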
2069 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2070 				 const struct ipv6hdr *iph, u32 mtu)
2071 {
2072 	const struct in6_addr *daddr, *saddr;
2073 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2074 
2075 	if (rt6->rt6i_flags & RTF_LOCAL)
2076 		return;
2077 
2078 	if (dst_metric_locked(dst, RTAX_MTU))
2079 		return;
2080 
2081 	if (iph) {
2082 		daddr = &iph->daddr;
2083 		saddr = &iph->saddr;
2084 	} else if (sk) {
2085 		daddr = &sk->sk_v6_daddr;
2086 		saddr = &inet6_sk(sk)->saddr;
2087 	} else {
2088 		daddr = NULL;
2089 		saddr = NULL;
2090 	}
2091 	dst_confirm_neigh(dst, daddr);
2092 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2093 	if (mtu >= dst_mtu(dst))
2094 		return;
2095 
2096 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2097 		rt6_do_update_pmtu(rt6, mtu);
2098 		/* update rt6_ex->stamp for cache */
2099 		if (rt6->rt6i_flags & RTF_CACHE)
2100 			rt6_update_exception_stamp_rt(rt6);
2101 	} else if (daddr) {
2102 		struct rt6_info *nrt6;
2103 
2104 		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2105 		if (nrt6) {
2106 			rt6_do_update_pmtu(nrt6, mtu);
2107 			if (rt6_insert_exception(nrt6, rt6))
2108 				dst_release_immediate(&nrt6->dst);
2109 		}
2110 	}
2111 }
2112 
2113 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2114 			       struct sk_buff *skb, u32 mtu)
2115 {
2116 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2117 }
2118 
2119 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2120 		     int oif, u32 mark, kuid_t uid)
2121 {
2122 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2123 	struct dst_entry *dst;
2124 	struct flowi6 fl6;
2125 
2126 	memset(&fl6, 0, sizeof(fl6));
2127 	fl6.flowi6_oif = oif;
2128 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2129 	fl6.daddr = iph->daddr;
2130 	fl6.saddr = iph->saddr;
2131 	fl6.flowlabel = ip6_flowinfo(iph);
2132 	fl6.flowi6_uid = uid;
2133 
2134 	dst = ip6_route_output(net, NULL, &fl6);
2135 	if (!dst->error)
2136 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2137 	dst_release(dst);
2138 }
2139 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2140 
2141 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2142 {
2143 	struct dst_entry *dst;
2144 
2145 	ip6_update_pmtu(skb, sock_net(sk), mtu,
2146 			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2147 
2148 	dst = __sk_dst_get(sk);
2149 	if (!dst || !dst->obsolete ||
2150 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2151 		return;
2152 
2153 	bh_lock_sock(sk);
2154 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2155 		ip6_datagram_dst_update(sk, false);
2156 	bh_unlock_sock(sk);
2157 }
2158 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2159 
2160 /* Handle redirects */
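/* struct ip6rd_flowi embeds the flowi6 so the redirect gateway can be
 * passed through fib6_rule_lookup(), whose callback only receives a
 * struct flowi6 pointer; __ip6_route_redirect() casts it back.
 */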
2161 struct ip6rd_flowi {
2162 	struct flowi6 fl6;
2163 	struct in6_addr gateway;
2164 };
2165 
2166 static struct rt6_info *__ip6_route_redirect(struct net *net,
2167 					     struct fib6_table *table,
2168 					     struct flowi6 *fl6,
2169 					     int flags)
2170 {
2171 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2172 	struct rt6_info *rt, *rt_cache;
2173 	struct fib6_node *fn;
2174 
2175 	/* Get the "current" route for this destination and
2176 	 * check if the redirect has come from the appropriate router.
2177 	 *
2178 	 * RFC 4861 specifies that redirects should only be
2179 	 * accepted if they come from the nexthop to the target.
2180 	 * Due to the way the routes are chosen, this notion
2181 	 * is a bit fuzzy and one might need to check all possible
2182 	 * routes.
2183 	 */
2184 
2185 	rcu_read_lock();
2186 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2187 restart:
2188 	for_each_fib6_node_rt_rcu(fn) {
2189 		if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2190 			continue;
2191 		if (rt6_check_expired(rt))
2192 			continue;
2193 		if (rt->dst.error)
2194 			break;
2195 		if (!(rt->rt6i_flags & RTF_GATEWAY))
2196 			continue;
2197 		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2198 			continue;
2199 		/* rt_cache's gateway might be different from its 'parent'
2200 		 * in the case of an IP redirect.
2201 		 * So we keep searching in the exception table if the gateway
2202 		 * is different.
2203 		 */
2204 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2205 			rt_cache = rt6_find_cached_rt(rt,
2206 						      &fl6->daddr,
2207 						      &fl6->saddr);
2208 			if (rt_cache &&
2209 			    ipv6_addr_equal(&rdfl->gateway,
2210 					    &rt_cache->rt6i_gateway)) {
2211 				rt = rt_cache;
2212 				break;
2213 			}
2214 			continue;
2215 		}
2216 		break;
2217 	}
2218 
2219 	if (!rt)
2220 		rt = net->ipv6.ip6_null_entry;
2221 	else if (rt->dst.error) {
2222 		rt = net->ipv6.ip6_null_entry;
2223 		goto out;
2224 	}
2225 
2226 	if (rt == net->ipv6.ip6_null_entry) {
2227 		fn = fib6_backtrack(fn, &fl6->saddr);
2228 		if (fn)
2229 			goto restart;
2230 	}
2231 
2232 out:
2233 	ip6_hold_safe(net, &rt, true);
2234 
2235 	rcu_read_unlock();
2236 
2237 	trace_fib6_table_lookup(net, rt, table, fl6);
2238 	return rt;
2239 }
2240 
2241 static struct dst_entry *ip6_route_redirect(struct net *net,
2242 					const struct flowi6 *fl6,
2243 					const struct in6_addr *gateway)
2244 {
2245 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2246 	struct ip6rd_flowi rdfl;
2247 
2248 	rdfl.fl6 = *fl6;
2249 	rdfl.gateway = *gateway;
2250 
2251 	return fib6_rule_lookup(net, &rdfl.fl6,
2252 				flags, __ip6_route_redirect);
2253 }
2254 
2255 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2256 		  kuid_t uid)
2257 {
2258 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2259 	struct dst_entry *dst;
2260 	struct flowi6 fl6;
2261 
2262 	memset(&fl6, 0, sizeof(fl6));
2263 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2264 	fl6.flowi6_oif = oif;
2265 	fl6.flowi6_mark = mark;
2266 	fl6.daddr = iph->daddr;
2267 	fl6.saddr = iph->saddr;
2268 	fl6.flowlabel = ip6_flowinfo(iph);
2269 	fl6.flowi6_uid = uid;
2270 
2271 	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2272 	rt6_do_redirect(dst, NULL, skb);
2273 	dst_release(dst);
2274 }
2275 EXPORT_SYMBOL_GPL(ip6_redirect);
2276 
2277 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2278 			    u32 mark)
2279 {
2280 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2281 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2282 	struct dst_entry *dst;
2283 	struct flowi6 fl6;
2284 
2285 	memset(&fl6, 0, sizeof(fl6));
2286 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2287 	fl6.flowi6_oif = oif;
2288 	fl6.flowi6_mark = mark;
2289 	fl6.daddr = msg->dest;
2290 	fl6.saddr = iph->daddr;
2291 	fl6.flowi6_uid = sock_net_uid(net, NULL);
2292 
2293 	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2294 	rt6_do_redirect(dst, NULL, skb);
2295 	dst_release(dst);
2296 }
2297 
2298 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2299 {
2300 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2301 		     sk->sk_uid);
2302 }
2303 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2304 
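/* Advertised MSS for TCP over this dst: path MTU minus the fixed
 * IPv6 (40 byte) and TCP (20 byte) header sizes, clamped from below
 * by the ip6_rt_min_advmss sysctl and capped at IPV6_MAXPLEN (see the
 * comment in the function body). E.g. a 1500 byte MTU yields an
 * advmss of 1440.
 */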
2305 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2306 {
2307 	struct net_device *dev = dst->dev;
2308 	unsigned int mtu = dst_mtu(dst);
2309 	struct net *net = dev_net(dev);
2310 
2311 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2312 
2313 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2314 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2315 
2316 	/*
2317 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2318 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2319 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2320 	 * rely only on pmtu discovery"
2321 	 */
2322 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2323 		mtu = IPV6_MAXPLEN;
2324 	return mtu;
2325 }
2326 
2327 static unsigned int ip6_mtu(const struct dst_entry *dst)
2328 {
2329 	const struct rt6_info *rt = (const struct rt6_info *)dst;
2330 	unsigned int mtu = rt->rt6i_pmtu;
2331 	struct inet6_dev *idev;
2332 
2333 	if (mtu)
2334 		goto out;
2335 
2336 	mtu = dst_metric_raw(dst, RTAX_MTU);
2337 	if (mtu)
2338 		goto out;
2339 
2340 	mtu = IPV6_MIN_MTU;
2341 
2342 	rcu_read_lock();
2343 	idev = __in6_dev_get(dst->dev);
2344 	if (idev)
2345 		mtu = idev->cnf.mtu6;
2346 	rcu_read_unlock();
2347 
2348 out:
2349 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2350 
2351 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2352 }
2353 
2354 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2355 				  struct flowi6 *fl6)
2356 {
2357 	struct dst_entry *dst;
2358 	struct rt6_info *rt;
2359 	struct inet6_dev *idev = in6_dev_get(dev);
2360 	struct net *net = dev_net(dev);
2361 
2362 	if (unlikely(!idev))
2363 		return ERR_PTR(-ENODEV);
2364 
2365 	rt = ip6_dst_alloc(net, dev, 0);
2366 	if (unlikely(!rt)) {
2367 		in6_dev_put(idev);
2368 		dst = ERR_PTR(-ENOMEM);
2369 		goto out;
2370 	}
2371 
2372 	rt->dst.flags |= DST_HOST;
2373 	rt->dst.input = ip6_input;
2374 	rt->dst.output  = ip6_output;
2375 	rt->rt6i_gateway  = fl6->daddr;
2376 	rt->rt6i_dst.addr = fl6->daddr;
2377 	rt->rt6i_dst.plen = 128;
2378 	rt->rt6i_idev     = idev;
2379 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2380 
2381 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2382 	 * properly release the net_device.
2383 	 */
2384 	rt6_uncached_list_add(rt);
2385 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2386 
2387 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2388 
2389 out:
2390 	return dst;
2391 }
2392 
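/* The dst_ops garbage-collection callback. The pass is skipped while
 * we are within ip6_rt_gc_min_interval of the last run and below
 * ip6_rt_max_size. Each consecutive pass under pressure grows
 * ip6_rt_gc_expire (making GC more aggressive), and every pass decays
 * it again by 1/2^ip6_rt_gc_elasticity.
 */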
2393 static int ip6_dst_gc(struct dst_ops *ops)
2394 {
2395 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2396 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2397 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2398 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2399 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2400 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2401 	int entries;
2402 
2403 	entries = dst_entries_get_fast(ops);
2404 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2405 	    entries <= rt_max_size)
2406 		goto out;
2407 
2408 	net->ipv6.ip6_rt_gc_expire++;
2409 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2410 	entries = dst_entries_get_slow(ops);
2411 	if (entries < ops->gc_thresh)
2412 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2413 out:
2414 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2415 	return entries > rt_max_size;
2416 }
2417 
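/* Convert the RTAX_* netlink attributes from a route request into the
 * metrics array + validity bitmap in struct mx6_config. RTAX_CC_ALGO
 * carries a congestion-control name that is translated to its key,
 * and RTAX_HOPLIMIT is clamped to 255.
 */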
2418 static int ip6_convert_metrics(struct mx6_config *mxc,
2419 			       const struct fib6_config *cfg)
2420 {
2421 	struct net *net = cfg->fc_nlinfo.nl_net;
2422 	bool ecn_ca = false;
2423 	struct nlattr *nla;
2424 	int remaining;
2425 	u32 *mp;
2426 
2427 	if (!cfg->fc_mx)
2428 		return 0;
2429 
2430 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2431 	if (unlikely(!mp))
2432 		return -ENOMEM;
2433 
2434 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2435 		int type = nla_type(nla);
2436 		u32 val;
2437 
2438 		if (!type)
2439 			continue;
2440 		if (unlikely(type > RTAX_MAX))
2441 			goto err;
2442 
2443 		if (type == RTAX_CC_ALGO) {
2444 			char tmp[TCP_CA_NAME_MAX];
2445 
2446 			nla_strlcpy(tmp, nla, sizeof(tmp));
2447 			val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2448 			if (val == TCP_CA_UNSPEC)
2449 				goto err;
2450 		} else {
2451 			val = nla_get_u32(nla);
2452 		}
2453 		if (type == RTAX_HOPLIMIT && val > 255)
2454 			val = 255;
2455 		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2456 			goto err;
2457 
2458 		mp[type - 1] = val;
2459 		__set_bit(type - 1, mxc->mx_valid);
2460 	}
2461 
2462 	if (ecn_ca) {
2463 		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2464 		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2465 	}
2466 
2467 	mxc->mx = mp;
2468 	return 0;
2469  err:
2470 	kfree(mp);
2471 	return -EINVAL;
2472 }
2473 
2474 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2475 					    struct fib6_config *cfg,
2476 					    const struct in6_addr *gw_addr,
2477 					    u32 tbid, int flags)
2478 {
2479 	struct flowi6 fl6 = {
2480 		.flowi6_oif = cfg->fc_ifindex,
2481 		.daddr = *gw_addr,
2482 		.saddr = cfg->fc_prefsrc,
2483 	};
2484 	struct fib6_table *table;
2485 	struct rt6_info *rt;
2486 
2487 	table = fib6_get_table(net, tbid);
2488 	if (!table)
2489 		return NULL;
2490 
2491 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2492 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2493 
2494 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2495 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2496 
2497 	/* if table lookup failed, fall back to full lookup */
2498 	if (rt == net->ipv6.ip6_null_entry) {
2499 		ip6_rt_put(rt);
2500 		rt = NULL;
2501 	}
2502 
2503 	return rt;
2504 }
2505 
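/* Validate the gateway of an RTNH_F_ONLINK route: the nexthop must
 * not resolve to a local, anycast or reject route, and must be
 * reachable through the device given in the request.
 */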
2506 static int ip6_route_check_nh_onlink(struct net *net,
2507 				     struct fib6_config *cfg,
2508 				     struct net_device *dev,
2509 				     struct netlink_ext_ack *extack)
2510 {
2511 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2512 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2513 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2514 	struct rt6_info *grt;
2515 	int err;
2516 
2517 	err = 0;
2518 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2519 	if (grt) {
2520 		if (!grt->dst.error &&
2521 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2522 			NL_SET_ERR_MSG(extack,
2523 				       "Nexthop has invalid gateway or device mismatch");
2524 			err = -EINVAL;
2525 		}
2526 
2527 		ip6_rt_put(grt);
2528 	}
2529 
2530 	return err;
2531 }
2532 
2533 static int ip6_route_check_nh(struct net *net,
2534 			      struct fib6_config *cfg,
2535 			      struct net_device **_dev,
2536 			      struct inet6_dev **idev)
2537 {
2538 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2539 	struct net_device *dev = _dev ? *_dev : NULL;
2540 	struct rt6_info *grt = NULL;
2541 	int err = -EHOSTUNREACH;
2542 
2543 	if (cfg->fc_table) {
2544 		int flags = RT6_LOOKUP_F_IFACE;
2545 
2546 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2547 					  cfg->fc_table, flags);
2548 		if (grt) {
2549 			if (grt->rt6i_flags & RTF_GATEWAY ||
2550 			    (dev && dev != grt->dst.dev)) {
2551 				ip6_rt_put(grt);
2552 				grt = NULL;
2553 			}
2554 		}
2555 	}
2556 
2557 	if (!grt)
2558 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
2559 
2560 	if (!grt)
2561 		goto out;
2562 
2563 	if (dev) {
2564 		if (dev != grt->dst.dev) {
2565 			ip6_rt_put(grt);
2566 			goto out;
2567 		}
2568 	} else {
2569 		*_dev = dev = grt->dst.dev;
2570 		*idev = grt->rt6i_idev;
2571 		dev_hold(dev);
2572 		in6_dev_hold(grt->rt6i_idev);
2573 	}
2574 
2575 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2576 		err = 0;
2577 
2578 	ip6_rt_put(grt);
2579 
2580 out:
2581 	return err;
2582 }
2583 
2584 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2585 					      struct netlink_ext_ack *extack)
2586 {
2587 	struct net *net = cfg->fc_nlinfo.nl_net;
2588 	struct rt6_info *rt = NULL;
2589 	struct net_device *dev = NULL;
2590 	struct inet6_dev *idev = NULL;
2591 	struct fib6_table *table;
2592 	int addr_type;
2593 	int err = -EINVAL;
2594 
2595 	/* RTF_PCPU is an internal flag; cannot be set by userspace */
2596 	if (cfg->fc_flags & RTF_PCPU) {
2597 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2598 		goto out;
2599 	}
2600 
2601 	/* RTF_CACHE is an internal flag; cannot be set by userspace */
2602 	if (cfg->fc_flags & RTF_CACHE) {
2603 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2604 		goto out;
2605 	}
2606 
2607 	if (cfg->fc_dst_len > 128) {
2608 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
2609 		goto out;
2610 	}
2611 	if (cfg->fc_src_len > 128) {
2612 		NL_SET_ERR_MSG(extack, "Invalid source address length");
2613 		goto out;
2614 	}
2615 #ifndef CONFIG_IPV6_SUBTREES
2616 	if (cfg->fc_src_len) {
2617 		NL_SET_ERR_MSG(extack,
2618 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
2619 		goto out;
2620 	}
2621 #endif
2622 	if (cfg->fc_ifindex) {
2623 		err = -ENODEV;
2624 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2625 		if (!dev)
2626 			goto out;
2627 		idev = in6_dev_get(dev);
2628 		if (!idev)
2629 			goto out;
2630 	}
2631 
2632 	if (cfg->fc_metric == 0)
2633 		cfg->fc_metric = IP6_RT_PRIO_USER;
2634 
2635 	if (cfg->fc_flags & RTNH_F_ONLINK) {
2636 		if (!dev) {
2637 			NL_SET_ERR_MSG(extack,
2638 				       "Nexthop device required for onlink");
2639 			err = -ENODEV;
2640 			goto out;
2641 		}
2642 
2643 		if (!(dev->flags & IFF_UP)) {
2644 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2645 			err = -ENETDOWN;
2646 			goto out;
2647 		}
2648 	}
2649 
2650 	err = -ENOBUFS;
2651 	if (cfg->fc_nlinfo.nlh &&
2652 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2653 		table = fib6_get_table(net, cfg->fc_table);
2654 		if (!table) {
2655 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2656 			table = fib6_new_table(net, cfg->fc_table);
2657 		}
2658 	} else {
2659 		table = fib6_new_table(net, cfg->fc_table);
2660 	}
2661 
2662 	if (!table)
2663 		goto out;
2664 
2665 	rt = ip6_dst_alloc(net, NULL,
2666 			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2667 
2668 	if (!rt) {
2669 		err = -ENOMEM;
2670 		goto out;
2671 	}
2672 
2673 	if (cfg->fc_flags & RTF_EXPIRES)
2674 		rt6_set_expires(rt, jiffies +
2675 				clock_t_to_jiffies(cfg->fc_expires));
2676 	else
2677 		rt6_clean_expires(rt);
2678 
2679 	if (cfg->fc_protocol == RTPROT_UNSPEC)
2680 		cfg->fc_protocol = RTPROT_BOOT;
2681 	rt->rt6i_protocol = cfg->fc_protocol;
2682 
2683 	addr_type = ipv6_addr_type(&cfg->fc_dst);
2684 
2685 	if (addr_type & IPV6_ADDR_MULTICAST)
2686 		rt->dst.input = ip6_mc_input;
2687 	else if (cfg->fc_flags & RTF_LOCAL)
2688 		rt->dst.input = ip6_input;
2689 	else
2690 		rt->dst.input = ip6_forward;
2691 
2692 	rt->dst.output = ip6_output;
2693 
2694 	if (cfg->fc_encap) {
2695 		struct lwtunnel_state *lwtstate;
2696 
2697 		err = lwtunnel_build_state(cfg->fc_encap_type,
2698 					   cfg->fc_encap, AF_INET6, cfg,
2699 					   &lwtstate, extack);
2700 		if (err)
2701 			goto out;
2702 		rt->dst.lwtstate = lwtstate_get(lwtstate);
2703 		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2704 			rt->dst.lwtstate->orig_output = rt->dst.output;
2705 			rt->dst.output = lwtunnel_output;
2706 		}
2707 		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2708 			rt->dst.lwtstate->orig_input = rt->dst.input;
2709 			rt->dst.input = lwtunnel_input;
2710 		}
2711 	}
2712 
2713 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2714 	rt->rt6i_dst.plen = cfg->fc_dst_len;
2715 	if (rt->rt6i_dst.plen == 128)
2716 		rt->dst.flags |= DST_HOST;
2717 
2718 #ifdef CONFIG_IPV6_SUBTREES
2719 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2720 	rt->rt6i_src.plen = cfg->fc_src_len;
2721 #endif
2722 
2723 	rt->rt6i_metric = cfg->fc_metric;
2724 	rt->rt6i_nh_weight = 1;
2725 
2726 	/* We cannot add true routes via loopback here,
2727 	   they would result in kernel looping; promote them to reject routes
2728 	 */
2729 	if ((cfg->fc_flags & RTF_REJECT) ||
2730 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2731 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2732 	     !(cfg->fc_flags & RTF_LOCAL))) {
2733 		/* hold loopback dev/idev if we haven't done so. */
2734 		if (dev != net->loopback_dev) {
2735 			if (dev) {
2736 				dev_put(dev);
2737 				in6_dev_put(idev);
2738 			}
2739 			dev = net->loopback_dev;
2740 			dev_hold(dev);
2741 			idev = in6_dev_get(dev);
2742 			if (!idev) {
2743 				err = -ENODEV;
2744 				goto out;
2745 			}
2746 		}
2747 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2748 		switch (cfg->fc_type) {
2749 		case RTN_BLACKHOLE:
2750 			rt->dst.error = -EINVAL;
2751 			rt->dst.output = dst_discard_out;
2752 			rt->dst.input = dst_discard;
2753 			break;
2754 		case RTN_PROHIBIT:
2755 			rt->dst.error = -EACCES;
2756 			rt->dst.output = ip6_pkt_prohibit_out;
2757 			rt->dst.input = ip6_pkt_prohibit;
2758 			break;
2759 		case RTN_THROW:
2760 		case RTN_UNREACHABLE:
2761 		default:
2762 			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2763 					: (cfg->fc_type == RTN_UNREACHABLE)
2764 					? -EHOSTUNREACH : -ENETUNREACH;
2765 			rt->dst.output = ip6_pkt_discard_out;
2766 			rt->dst.input = ip6_pkt_discard;
2767 			break;
2768 		}
2769 		goto install_route;
2770 	}
2771 
2772 	if (cfg->fc_flags & RTF_GATEWAY) {
2773 		const struct in6_addr *gw_addr;
2774 		int gwa_type;
2775 
2776 		gw_addr = &cfg->fc_gateway;
2777 		gwa_type = ipv6_addr_type(gw_addr);
2778 
2779 		/* If gw_addr is local, we will fail to detect this if the
2780 		 * address is still TENTATIVE (DAD in progress): rt6_lookup()
2781 		 * will return the already-added prefix route via the interface
2782 		 * that prefix route was assigned to, which might be non-loopback.
2783 		 */
2784 		err = -EINVAL;
2785 		if (ipv6_chk_addr_and_flags(net, gw_addr,
2786 					    gwa_type & IPV6_ADDR_LINKLOCAL ?
2787 					    dev : NULL, 0, 0)) {
2788 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2789 			goto out;
2790 		}
2791 		rt->rt6i_gateway = *gw_addr;
2792 
2793 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2794 			/* IPv6 strictly inhibits using non-link-local
2795 			   addresses as a nexthop address.
2796 			   Otherwise, the router will not be able to send
2797 			   redirects. It is very good, but in some (rare!)
2798 			   circumstances (SIT, PtP, NBMA NOARP links) it is
2799 			   handy to allow some exceptions. --ANK
2800 			   We allow IPv4-mapped nexthops to support
2801 			   RFC 4798-style addressing.
2802 			 */
2803 			if (!(gwa_type & (IPV6_ADDR_UNICAST |
2804 					  IPV6_ADDR_MAPPED))) {
2805 				NL_SET_ERR_MSG(extack,
2806 					       "Invalid gateway address");
2807 				goto out;
2808 			}
2809 
2810 			if (cfg->fc_flags & RTNH_F_ONLINK) {
2811 				err = ip6_route_check_nh_onlink(net, cfg, dev,
2812 								extack);
2813 			} else {
2814 				err = ip6_route_check_nh(net, cfg, &dev, &idev);
2815 			}
2816 			if (err)
2817 				goto out;
2818 		}
2819 		err = -EINVAL;
2820 		if (!dev) {
2821 			NL_SET_ERR_MSG(extack, "Egress device not specified");
2822 			goto out;
2823 		} else if (dev->flags & IFF_LOOPBACK) {
2824 			NL_SET_ERR_MSG(extack,
2825 				       "Egress device can not be loopback device for this route");
2826 			goto out;
2827 		}
2828 	}
2829 
2830 	err = -ENODEV;
2831 	if (!dev)
2832 		goto out;
2833 
2834 	if (!(dev->flags & IFF_UP)) {
2835 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2836 		err = -ENETDOWN;
2837 		goto out;
2838 	}
2839 
2840 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2841 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2842 			NL_SET_ERR_MSG(extack, "Invalid source address");
2843 			err = -EINVAL;
2844 			goto out;
2845 		}
2846 		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2847 		rt->rt6i_prefsrc.plen = 128;
2848 	} else
2849 		rt->rt6i_prefsrc.plen = 0;
2850 
2851 	rt->rt6i_flags = cfg->fc_flags;
2852 
2853 install_route:
2854 	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2855 	    !netif_carrier_ok(dev))
2856 		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
2857 	rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
2858 	rt->dst.dev = dev;
2859 	rt->rt6i_idev = idev;
2860 	rt->rt6i_table = table;
2861 
2862 	cfg->fc_nlinfo.nl_net = dev_net(dev);
2863 
2864 	return rt;
2865 out:
2866 	if (dev)
2867 		dev_put(dev);
2868 	if (idev)
2869 		in6_dev_put(idev);
2870 	if (rt)
2871 		dst_release_immediate(&rt->dst);
2872 
2873 	return ERR_PTR(err);
2874 }
2875 
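/* Add a route described by @cfg: build the rt6_info, convert any
 * netlink metrics, and insert it into the FIB. A minimal sketch of a
 * caller (illustrative values only; 'prefix', 'gw', 'dev' and 'net'
 * are hypothetical locals):
 *
 *	struct fib6_config cfg = {
 *		.fc_table	= RT6_TABLE_MAIN,
 *		.fc_metric	= IP6_RT_PRIO_USER,
 *		.fc_ifindex	= dev->ifindex,
 *		.fc_dst		= prefix,
 *		.fc_dst_len	= 64,
 *		.fc_gateway	= gw,
 *		.fc_flags	= RTF_UP | RTF_GATEWAY,
 *		.fc_nlinfo.nl_net = net,
 *	};
 *	int err = ip6_route_add(&cfg, NULL);
 */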
2876 int ip6_route_add(struct fib6_config *cfg,
2877 		  struct netlink_ext_ack *extack)
2878 {
2879 	struct mx6_config mxc = { .mx = NULL, };
2880 	struct rt6_info *rt;
2881 	int err;
2882 
2883 	rt = ip6_route_info_create(cfg, extack);
2884 	if (IS_ERR(rt)) {
2885 		err = PTR_ERR(rt);
2886 		rt = NULL;
2887 		goto out;
2888 	}
2889 
2890 	err = ip6_convert_metrics(&mxc, cfg);
2891 	if (err)
2892 		goto out;
2893 
2894 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2895 
2896 	kfree(mxc.mx);
2897 
2898 	return err;
2899 out:
2900 	if (rt)
2901 		dst_release_immediate(&rt->dst);
2902 
2903 	return err;
2904 }
2905 
2906 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2907 {
2908 	int err;
2909 	struct fib6_table *table;
2910 	struct net *net = dev_net(rt->dst.dev);
2911 
2912 	if (rt == net->ipv6.ip6_null_entry) {
2913 		err = -ENOENT;
2914 		goto out;
2915 	}
2916 
2917 	table = rt->rt6i_table;
2918 	spin_lock_bh(&table->tb6_lock);
2919 	err = fib6_del(rt, info);
2920 	spin_unlock_bh(&table->tb6_lock);
2921 
2922 out:
2923 	ip6_rt_put(rt);
2924 	return err;
2925 }
2926 
2927 int ip6_del_rt(struct rt6_info *rt)
2928 {
2929 	struct nl_info info = {
2930 		.nl_net = dev_net(rt->dst.dev),
2931 	};
2932 	return __ip6_del_rt(rt, &info);
2933 }
2934 
2935 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2936 {
2937 	struct nl_info *info = &cfg->fc_nlinfo;
2938 	struct net *net = info->nl_net;
2939 	struct sk_buff *skb = NULL;
2940 	struct fib6_table *table;
2941 	int err = -ENOENT;
2942 
2943 	if (rt == net->ipv6.ip6_null_entry)
2944 		goto out_put;
2945 	table = rt->rt6i_table;
2946 	spin_lock_bh(&table->tb6_lock);
2947 
2948 	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2949 		struct rt6_info *sibling, *next_sibling;
2950 
2951 		/* prefer to send a single notification with all hops */
2952 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2953 		if (skb) {
2954 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2955 
2956 			if (rt6_fill_node(net, skb, rt,
2957 					  NULL, NULL, 0, RTM_DELROUTE,
2958 					  info->portid, seq, 0) < 0) {
2959 				kfree_skb(skb);
2960 				skb = NULL;
2961 			} else
2962 				info->skip_notify = 1;
2963 		}
2964 
2965 		list_for_each_entry_safe(sibling, next_sibling,
2966 					 &rt->rt6i_siblings,
2967 					 rt6i_siblings) {
2968 			err = fib6_del(sibling, info);
2969 			if (err)
2970 				goto out_unlock;
2971 		}
2972 	}
2973 
2974 	err = fib6_del(rt, info);
2975 out_unlock:
2976 	spin_unlock_bh(&table->tb6_lock);
2977 out_put:
2978 	ip6_rt_put(rt);
2979 
2980 	if (skb) {
2981 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2982 			    info->nlh, gfp_any());
2983 	}
2984 	return err;
2985 }
2986 
2987 static int ip6_route_del(struct fib6_config *cfg,
2988 			 struct netlink_ext_ack *extack)
2989 {
2990 	struct rt6_info *rt, *rt_cache;
2991 	struct fib6_table *table;
2992 	struct fib6_node *fn;
2993 	int err = -ESRCH;
2994 
2995 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2996 	if (!table) {
2997 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
2998 		return err;
2999 	}
3000 
3001 	rcu_read_lock();
3002 
3003 	fn = fib6_locate(&table->tb6_root,
3004 			 &cfg->fc_dst, cfg->fc_dst_len,
3005 			 &cfg->fc_src, cfg->fc_src_len,
3006 			 !(cfg->fc_flags & RTF_CACHE));
3007 
3008 	if (fn) {
3009 		for_each_fib6_node_rt_rcu(fn) {
3010 			if (cfg->fc_flags & RTF_CACHE) {
3011 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3012 							      &cfg->fc_src);
3013 				if (!rt_cache)
3014 					continue;
3015 				rt = rt_cache;
3016 			}
3017 			if (cfg->fc_ifindex &&
3018 			    (!rt->dst.dev ||
3019 			     rt->dst.dev->ifindex != cfg->fc_ifindex))
3020 				continue;
3021 			if (cfg->fc_flags & RTF_GATEWAY &&
3022 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3023 				continue;
3024 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
3025 				continue;
3026 			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
3027 				continue;
3028 			if (!dst_hold_safe(&rt->dst))
3029 				break;
3030 			rcu_read_unlock();
3031 
3032 			/* if gateway was specified only delete the one hop */
3033 			/* if a gateway was specified, only delete the one hop */
3034 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3035 
3036 			return __ip6_del_rt_siblings(rt, cfg);
3037 		}
3038 	}
3039 	rcu_read_unlock();
3040 
3041 	return err;
3042 }
3043 
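/* Process a received ICMPv6 redirect: validate it per RFC 4861
 * (sane length, non-multicast destination, link-local unicast target
 * unless on-link, acceptable ND options), update the neighbour cache
 * for the new first hop, then clone the route with the new gateway
 * and insert the clone into the exception table.
 */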
3044 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3045 {
3046 	struct netevent_redirect netevent;
3047 	struct rt6_info *rt, *nrt = NULL;
3048 	struct ndisc_options ndopts;
3049 	struct inet6_dev *in6_dev;
3050 	struct neighbour *neigh;
3051 	struct rd_msg *msg;
3052 	int optlen, on_link;
3053 	u8 *lladdr;
3054 
3055 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3056 	optlen -= sizeof(*msg);
3057 
3058 	if (optlen < 0) {
3059 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3060 		return;
3061 	}
3062 
3063 	msg = (struct rd_msg *)icmp6_hdr(skb);
3064 
3065 	if (ipv6_addr_is_multicast(&msg->dest)) {
3066 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3067 		return;
3068 	}
3069 
3070 	on_link = 0;
3071 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3072 		on_link = 1;
3073 	} else if (ipv6_addr_type(&msg->target) !=
3074 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3075 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3076 		return;
3077 	}
3078 
3079 	in6_dev = __in6_dev_get(skb->dev);
3080 	if (!in6_dev)
3081 		return;
3082 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3083 		return;
3084 
3085 	/* RFC2461 8.1:
3086 	 *	The IP source address of the Redirect MUST be the same as the current
3087 	 *	first-hop router for the specified ICMP Destination Address.
3088 	 */
3089 
3090 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3091 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3092 		return;
3093 	}
3094 
3095 	lladdr = NULL;
3096 	if (ndopts.nd_opts_tgt_lladdr) {
3097 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3098 					     skb->dev);
3099 		if (!lladdr) {
3100 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3101 			return;
3102 		}
3103 	}
3104 
3105 	rt = (struct rt6_info *) dst;
3106 	if (rt->rt6i_flags & RTF_REJECT) {
3107 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3108 		return;
3109 	}
3110 
3111 	/* Redirect received -> path was valid.
3112 	 * Look, redirects are sent only in response to data packets,
3113 	 * so this nexthop is apparently reachable. --ANK
3114 	 */
3115 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3116 
3117 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3118 	if (!neigh)
3119 		return;
3120 
3121 	/*
3122 	 *	We have finally decided to accept it.
3123 	 */
3124 
3125 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3126 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3127 		     NEIGH_UPDATE_F_OVERRIDE|
3128 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3129 				     NEIGH_UPDATE_F_ISROUTER)),
3130 		     NDISC_REDIRECT, &ndopts);
3131 
3132 	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3133 	if (!nrt)
3134 		goto out;
3135 
3136 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3137 	if (on_link)
3138 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3139 
3140 	nrt->rt6i_protocol = RTPROT_REDIRECT;
3141 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3142 
3143 	/* No need to remove rt from the exception table if rt is
3144 	 * a cached route because rt6_insert_exception() will
3145 	 * take care of it
3146 	 */
3147 	if (rt6_insert_exception(nrt, rt)) {
3148 		dst_release_immediate(&nrt->dst);
3149 		goto out;
3150 	}
3151 
3152 	netevent.old = &rt->dst;
3153 	netevent.new = &nrt->dst;
3154 	netevent.daddr = &msg->dest;
3155 	netevent.neigh = neigh;
3156 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3157 
3158 out:
3159 	neigh_release(neigh);
3160 }
3161 
3162 /*
3163  *	Misc support functions
3164  */
3165 
3166 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3167 {
3168 	BUG_ON(from->from);
3169 
3170 	rt->rt6i_flags &= ~RTF_EXPIRES;
3171 	dst_hold(&from->dst);
3172 	rt->from = from;
3173 	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3174 }
3175 
3176 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3177 {
3178 	rt->dst.input = ort->dst.input;
3179 	rt->dst.output = ort->dst.output;
3180 	rt->rt6i_dst = ort->rt6i_dst;
3181 	rt->dst.error = ort->dst.error;
3182 	rt->rt6i_idev = ort->rt6i_idev;
3183 	if (rt->rt6i_idev)
3184 		in6_dev_hold(rt->rt6i_idev);
3185 	rt->dst.lastuse = jiffies;
3186 	rt->rt6i_gateway = ort->rt6i_gateway;
3187 	rt->rt6i_flags = ort->rt6i_flags;
3188 	rt6_set_from(rt, ort);
3189 	rt->rt6i_metric = ort->rt6i_metric;
3190 #ifdef CONFIG_IPV6_SUBTREES
3191 	rt->rt6i_src = ort->rt6i_src;
3192 #endif
3193 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3194 	rt->rt6i_table = ort->rt6i_table;
3195 	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3196 }
3197 
3198 #ifdef CONFIG_IPV6_ROUTE_INFO
3199 static struct rt6_info *rt6_get_route_info(struct net *net,
3200 					   const struct in6_addr *prefix, int prefixlen,
3201 					   const struct in6_addr *gwaddr,
3202 					   struct net_device *dev)
3203 {
3204 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3205 	int ifindex = dev->ifindex;
3206 	struct fib6_node *fn;
3207 	struct rt6_info *rt = NULL;
3208 	struct fib6_table *table;
3209 
3210 	table = fib6_get_table(net, tb_id);
3211 	if (!table)
3212 		return NULL;
3213 
3214 	rcu_read_lock();
3215 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3216 	if (!fn)
3217 		goto out;
3218 
3219 	for_each_fib6_node_rt_rcu(fn) {
3220 		if (rt->dst.dev->ifindex != ifindex)
3221 			continue;
3222 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3223 			continue;
3224 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3225 			continue;
3226 		ip6_hold_safe(NULL, &rt, false);
3227 		break;
3228 	}
3229 out:
3230 	rcu_read_unlock();
3231 	return rt;
3232 }
3233 
3234 static struct rt6_info *rt6_add_route_info(struct net *net,
3235 					   const struct in6_addr *prefix, int prefixlen,
3236 					   const struct in6_addr *gwaddr,
3237 					   struct net_device *dev,
3238 					   unsigned int pref)
3239 {
3240 	struct fib6_config cfg = {
3241 		.fc_metric	= IP6_RT_PRIO_USER,
3242 		.fc_ifindex	= dev->ifindex,
3243 		.fc_dst_len	= prefixlen,
3244 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3245 				  RTF_UP | RTF_PREF(pref),
3246 		.fc_protocol = RTPROT_RA,
3247 		.fc_nlinfo.portid = 0,
3248 		.fc_nlinfo.nlh = NULL,
3249 		.fc_nlinfo.nl_net = net,
3250 	};
3251 
3252 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3253 	cfg.fc_dst = *prefix;
3254 	cfg.fc_gateway = *gwaddr;
3255 
3256 	/* We should treat it as a default route if prefix length is 0. */
3257 	if (!prefixlen)
3258 		cfg.fc_flags |= RTF_DEFAULT;
3259 
3260 	ip6_route_add(&cfg, NULL);
3261 
3262 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3263 }
3264 #endif
3265 
3266 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3267 {
3268 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3269 	struct rt6_info *rt;
3270 	struct fib6_table *table;
3271 
3272 	table = fib6_get_table(dev_net(dev), tb_id);
3273 	if (!table)
3274 		return NULL;
3275 
3276 	rcu_read_lock();
3277 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3278 		if (dev == rt->dst.dev &&
3279 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3280 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
3281 			break;
3282 	}
3283 	if (rt)
3284 		ip6_hold_safe(NULL, &rt, false);
3285 	rcu_read_unlock();
3286 	return rt;
3287 }
3288 
3289 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3290 				     struct net_device *dev,
3291 				     unsigned int pref)
3292 {
3293 	struct fib6_config cfg = {
3294 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3295 		.fc_metric	= IP6_RT_PRIO_USER,
3296 		.fc_ifindex	= dev->ifindex,
3297 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3298 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3299 		.fc_protocol = RTPROT_RA,
3300 		.fc_nlinfo.portid = 0,
3301 		.fc_nlinfo.nlh = NULL,
3302 		.fc_nlinfo.nl_net = dev_net(dev),
3303 	};
3304 
3305 	cfg.fc_gateway = *gwaddr;
3306 
3307 	if (!ip6_route_add(&cfg, NULL)) {
3308 		struct fib6_table *table;
3309 
3310 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3311 		if (table)
3312 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3313 	}
3314 
3315 	return rt6_get_dflt_router(gwaddr, dev);
3316 }
3317 
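/* Walk one table and delete RA-learned default routes. The walk has
 * to restart from the root after each deletion because the RCU read
 * lock is dropped to call ip6_del_rt(), after which the current walk
 * position may no longer be valid.
 */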
3318 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3319 {
3320 	struct rt6_info *rt;
3321 
3322 restart:
3323 	rcu_read_lock();
3324 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3325 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3326 		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3327 			if (dst_hold_safe(&rt->dst)) {
3328 				rcu_read_unlock();
3329 				ip6_del_rt(rt);
3330 			} else {
3331 				rcu_read_unlock();
3332 			}
3333 			goto restart;
3334 		}
3335 	}
3336 	rcu_read_unlock();
3337 
3338 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3339 }
3340 
3341 void rt6_purge_dflt_routers(struct net *net)
3342 {
3343 	struct fib6_table *table;
3344 	struct hlist_head *head;
3345 	unsigned int h;
3346 
3347 	rcu_read_lock();
3348 
3349 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3350 		head = &net->ipv6.fib_table_hash[h];
3351 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3352 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3353 				__rt6_purge_dflt_routers(table);
3354 		}
3355 	}
3356 
3357 	rcu_read_unlock();
3358 }
3359 
3360 static void rtmsg_to_fib6_config(struct net *net,
3361 				 struct in6_rtmsg *rtmsg,
3362 				 struct fib6_config *cfg)
3363 {
3364 	memset(cfg, 0, sizeof(*cfg));
3365 
3366 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3367 			 : RT6_TABLE_MAIN;
3368 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3369 	cfg->fc_metric = rtmsg->rtmsg_metric;
3370 	cfg->fc_expires = rtmsg->rtmsg_info;
3371 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3372 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
3373 	cfg->fc_flags = rtmsg->rtmsg_flags;
3374 
3375 	cfg->fc_nlinfo.nl_net = net;
3376 
3377 	cfg->fc_dst = rtmsg->rtmsg_dst;
3378 	cfg->fc_src = rtmsg->rtmsg_src;
3379 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
3380 }
3381 
3382 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3383 {
3384 	struct fib6_config cfg;
3385 	struct in6_rtmsg rtmsg;
3386 	int err;
3387 
3388 	switch (cmd) {
3389 	case SIOCADDRT:		/* Add a route */
3390 	case SIOCDELRT:		/* Delete a route */
3391 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3392 			return -EPERM;
3393 		err = copy_from_user(&rtmsg, arg,
3394 				     sizeof(struct in6_rtmsg));
3395 		if (err)
3396 			return -EFAULT;
3397 
3398 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3399 
3400 		rtnl_lock();
3401 		switch (cmd) {
3402 		case SIOCADDRT:
3403 			err = ip6_route_add(&cfg, NULL);
3404 			break;
3405 		case SIOCDELRT:
3406 			err = ip6_route_del(&cfg, NULL);
3407 			break;
3408 		default:
3409 			err = -EINVAL;
3410 		}
3411 		rtnl_unlock();
3412 
3413 		return err;
3414 	}
3415 
3416 	return -EINVAL;
3417 }
3418 
3419 /*
3420  *	Drop the packet on the floor
3421  */
3422 
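/* Common helper for the reject dst input/output handlers below: bump
 * the matching no-route (or, for an unspecified destination on input,
 * InAddrErrors) MIB counter, send an ICMPv6 destination unreachable
 * with the given code, and free the skb.
 */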
3423 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3424 {
3425 	int type;
3426 	struct dst_entry *dst = skb_dst(skb);
3427 	switch (ipstats_mib_noroutes) {
3428 	case IPSTATS_MIB_INNOROUTES:
3429 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3430 		if (type == IPV6_ADDR_ANY) {
3431 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3432 				      IPSTATS_MIB_INADDRERRORS);
3433 			break;
3434 		}
3435 		/* FALLTHROUGH */
3436 	case IPSTATS_MIB_OUTNOROUTES:
3437 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3438 			      ipstats_mib_noroutes);
3439 		break;
3440 	}
3441 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3442 	kfree_skb(skb);
3443 	return 0;
3444 }
3445 
3446 static int ip6_pkt_discard(struct sk_buff *skb)
3447 {
3448 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3449 }
3450 
3451 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3452 {
3453 	skb->dev = skb_dst(skb)->dev;
3454 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3455 }
3456 
3457 static int ip6_pkt_prohibit(struct sk_buff *skb)
3458 {
3459 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3460 }
3461 
3462 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3463 {
3464 	skb->dev = skb_dst(skb)->dev;
3465 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3466 }
3467 
3468 /*
3469  *	Allocate a dst for local (unicast / anycast) address.
3470  */
3471 
3472 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3473 				    const struct in6_addr *addr,
3474 				    bool anycast)
3475 {
3476 	u32 tb_id;
3477 	struct net *net = dev_net(idev->dev);
3478 	struct net_device *dev = idev->dev;
3479 	struct rt6_info *rt;
3480 
3481 	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3482 	if (!rt)
3483 		return ERR_PTR(-ENOMEM);
3484 
3485 	in6_dev_hold(idev);
3486 
3487 	rt->dst.flags |= DST_HOST;
3488 	rt->dst.input = ip6_input;
3489 	rt->dst.output = ip6_output;
3490 	rt->rt6i_idev = idev;
3491 
3492 	rt->rt6i_protocol = RTPROT_KERNEL;
3493 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3494 	if (anycast)
3495 		rt->rt6i_flags |= RTF_ANYCAST;
3496 	else
3497 		rt->rt6i_flags |= RTF_LOCAL;
3498 
3499 	rt->rt6i_gateway  = *addr;
3500 	rt->rt6i_dst.addr = *addr;
3501 	rt->rt6i_dst.plen = 128;
3502 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3503 	rt->rt6i_table = fib6_get_table(net, tb_id);
3504 
3505 	return rt;
3506 }
3507 
3508 /* Remove a deleted IP from prefsrc entries. */
3509 struct arg_dev_net_ip {
3510 	struct net_device *dev;
3511 	struct net *net;
3512 	struct in6_addr *addr;
3513 };
3514 
3515 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3516 {
3517 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3518 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3519 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3520 
3521 	if (((void *)rt->dst.dev == dev || !dev) &&
3522 	    rt != net->ipv6.ip6_null_entry &&
3523 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3524 		spin_lock_bh(&rt6_exception_lock);
3525 		/* remove prefsrc entry */
3526 		rt->rt6i_prefsrc.plen = 0;
3527 		/* need to update cache as well */
3528 		rt6_exceptions_remove_prefsrc(rt);
3529 		spin_unlock_bh(&rt6_exception_lock);
3530 	}
3531 	return 0;
3532 }
3533 
3534 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3535 {
3536 	struct net *net = dev_net(ifp->idev->dev);
3537 	struct arg_dev_net_ip adni = {
3538 		.dev = ifp->idev->dev,
3539 		.net = net,
3540 		.addr = &ifp->addr,
3541 	};
3542 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3543 }
3544 
3545 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3546 
3547 /* Remove routers and update dst entries when a gateway turns into a host. */
3548 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3549 {
3550 	struct in6_addr *gateway = (struct in6_addr *)arg;
3551 
3552 	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3553 	    ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3554 		return -1;
3555 	}
3556 
3557 	/* Further clean up cached routes in the exception table.
3558 	 * This is needed because a cached route may have a different
3559 	 * gateway than its 'parent' in the case of an IP redirect.
3560 	 */
3561 	rt6_exceptions_clean_tohost(rt, gateway);
3562 
3563 	return 0;
3564 }
3565 
3566 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3567 {
3568 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3569 }
3570 
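/* Argument block shared by the fib6_clean_all() walkers below: the
 * union carries nexthop flags for rt6_sync_up() and the netdev
 * notifier event for rt6_sync_down_dev().
 */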
3571 struct arg_netdev_event {
3572 	const struct net_device *dev;
3573 	union {
3574 		unsigned int nh_flags;
3575 		unsigned long event;
3576 	};
3577 };
3578 
3579 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3580 {
3581 	struct rt6_info *iter;
3582 	struct fib6_node *fn;
3583 
3584 	fn = rcu_dereference_protected(rt->rt6i_node,
3585 			lockdep_is_held(&rt->rt6i_table->tb6_lock));
3586 	iter = rcu_dereference_protected(fn->leaf,
3587 			lockdep_is_held(&rt->rt6i_table->tb6_lock));
3588 	while (iter) {
3589 		if (iter->rt6i_metric == rt->rt6i_metric &&
3590 		    rt6_qualify_for_ecmp(iter))
3591 			return iter;
3592 		iter = rcu_dereference_protected(iter->rt6_next,
3593 				lockdep_is_held(&rt->rt6i_table->tb6_lock));
3594 	}
3595 
3596 	return NULL;
3597 }
3598 
3599 static bool rt6_is_dead(const struct rt6_info *rt)
3600 {
3601 	if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3602 	    (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3603 	     rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3604 		return true;
3605 
3606 	return false;
3607 }
3608 
3609 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3610 {
3611 	struct rt6_info *iter;
3612 	int total = 0;
3613 
3614 	if (!rt6_is_dead(rt))
3615 		total += rt->rt6i_nh_weight;
3616 
3617 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3618 		if (!rt6_is_dead(iter))
3619 			total += iter->rt6i_nh_weight;
3620 	}
3621 
3622 	return total;
3623 }
3624 
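/* Weighted hash-threshold multipath: each live nexthop is assigned an
 * upper bound equal to its cumulative weight share of the 31-bit hash
 * space, and dead nexthops get -1 so they never match. E.g. with
 * sibling weights 1 and 3, the bounds become round(1/4 * 2^31) - 1
 * and 2^31 - 1; during lookup, a 31-bit flow hash at or below a
 * nexthop's bound selects that nexthop.
 */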
3625 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3626 {
3627 	int upper_bound = -1;
3628 
3629 	if (!rt6_is_dead(rt)) {
3630 		*weight += rt->rt6i_nh_weight;
3631 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3632 						    total) - 1;
3633 	}
3634 	atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3635 }
3636 
3637 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3638 {
3639 	struct rt6_info *iter;
3640 	int weight = 0;
3641 
3642 	rt6_upper_bound_set(rt, &weight, total);
3643 
3644 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3645 		rt6_upper_bound_set(iter, &weight, total);
3646 }
3647 
3648 void rt6_multipath_rebalance(struct rt6_info *rt)
3649 {
3650 	struct rt6_info *first;
3651 	int total;
3652 
3653 	/* If the entire multipath route was marked for flushing,
3654 	 * there is no need to rebalance upon the removal of each
3655 	 * sibling route.
3656 	 */
3657 	if (!rt->rt6i_nsiblings || rt->should_flush)
3658 		return;
3659 
3660 	/* During lookup routes are evaluated in order, so we need to
3661 	 * make sure upper bounds are assigned from the first sibling
3662 	 * onwards.
3663 	 */
3664 	first = rt6_multipath_first_sibling(rt);
3665 	if (WARN_ON_ONCE(!first))
3666 		return;
3667 
3668 	total = rt6_multipath_total_weight(first);
3669 	rt6_multipath_upper_bound_set(first, total);
3670 }
3671 
3672 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3673 {
3674 	const struct arg_netdev_event *arg = p_arg;
3675 	const struct net *net = dev_net(arg->dev);
3676 
3677 	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3678 		rt->rt6i_nh_flags &= ~arg->nh_flags;
3679 		fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3680 		rt6_multipath_rebalance(rt);
3681 	}
3682 
3683 	return 0;
3684 }
3685 
3686 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3687 {
3688 	struct arg_netdev_event arg = {
3689 		.dev = dev,
3690 		{
3691 			.nh_flags = nh_flags,
3692 		},
3693 	};
3694 
3695 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3696 		arg.nh_flags |= RTNH_F_LINKDOWN;
3697 
3698 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3699 }
3700 
3701 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3702 				   const struct net_device *dev)
3703 {
3704 	struct rt6_info *iter;
3705 
3706 	if (rt->dst.dev == dev)
3707 		return true;
3708 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3709 		if (iter->dst.dev == dev)
3710 			return true;
3711 
3712 	return false;
3713 }
3714 
3715 static void rt6_multipath_flush(struct rt6_info *rt)
3716 {
3717 	struct rt6_info *iter;
3718 
3719 	rt->should_flush = 1;
3720 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3721 		iter->should_flush = 1;
3722 }
3723 
3724 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3725 					     const struct net_device *down_dev)
3726 {
3727 	struct rt6_info *iter;
3728 	unsigned int dead = 0;
3729 
3730 	if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3731 		dead++;
3732 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3733 		if (iter->dst.dev == down_dev ||
3734 		    iter->rt6i_nh_flags & RTNH_F_DEAD)
3735 			dead++;
3736 
3737 	return dead;
3738 }
3739 
3740 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3741 				       const struct net_device *dev,
3742 				       unsigned int nh_flags)
3743 {
3744 	struct rt6_info *iter;
3745 
3746 	if (rt->dst.dev == dev)
3747 		rt->rt6i_nh_flags |= nh_flags;
3748 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3749 		if (iter->dst.dev == dev)
3750 			iter->rt6i_nh_flags |= nh_flags;
3751 }
3752 
3753 /* called with write lock held for table with rt */
3754 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3755 {
3756 	const struct arg_netdev_event *arg = p_arg;
3757 	const struct net_device *dev = arg->dev;
3758 	const struct net *net = dev_net(dev);
3759 
3760 	if (rt == net->ipv6.ip6_null_entry)
3761 		return 0;
3762 
3763 	switch (arg->event) {
3764 	case NETDEV_UNREGISTER:
3765 		return rt->dst.dev == dev ? -1 : 0;
3766 	case NETDEV_DOWN:
3767 		if (rt->should_flush)
3768 			return -1;
3769 		if (!rt->rt6i_nsiblings)
3770 			return rt->dst.dev == dev ? -1 : 0;
3771 		if (rt6_multipath_uses_dev(rt, dev)) {
3772 			unsigned int count;
3773 
3774 			count = rt6_multipath_dead_count(rt, dev);
3775 			if (rt->rt6i_nsiblings + 1 == count) {
3776 				rt6_multipath_flush(rt);
3777 				return -1;
3778 			}
3779 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3780 						   RTNH_F_LINKDOWN);
3781 			fib6_update_sernum(rt);
3782 			rt6_multipath_rebalance(rt);
3783 		}
3784 		return -2;
3785 	case NETDEV_CHANGE:
3786 		if (rt->dst.dev != dev ||
3787 		    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3788 			break;
3789 		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3790 		rt6_multipath_rebalance(rt);
3791 		break;
3792 	}
3793 
3794 	return 0;
3795 }
3796 
3797 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3798 {
3799 	struct arg_netdev_event arg = {
3800 		.dev = dev,
3801 		{
3802 			.event = event,
3803 		},
3804 	};
3805 
3806 	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3807 }
3808 
3809 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3810 {
3811 	rt6_sync_down_dev(dev, event);
3812 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
3813 	neigh_ifdown(&nd_tbl, dev);
3814 }
3815 
3816 struct rt6_mtu_change_arg {
3817 	struct net_device *dev;
3818 	unsigned int mtu;
3819 };
3820 
3821 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3822 {
3823 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3824 	struct inet6_dev *idev;
3825 
3826 	/* In IPv6, PMTU discovery is not optional,
3827 	   so the RTAX_MTU lock cannot disable it.
3828 	   We still use this lock to block changes
3829 	   caused by addrconf/ndisc.
3830 	*/
3831 
3832 	idev = __in6_dev_get(arg->dev);
3833 	if (!idev)
3834 		return 0;
3835 
3836 	/* For an administrative MTU increase, there is no way to discover
3837 	   an IPv6 PMTU increase, so the PMTU must be updated here.
3838 	   Since RFC 1981 doesn't cover administrative MTU increases,
3839 	   updating the PMTU on an increase is a MUST (e.g. jumbo frames).
3840 	 */
3841 	if (rt->dst.dev == arg->dev &&
3842 	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3843 		spin_lock_bh(&rt6_exception_lock);
3844 		if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
3845 		    rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
3846 			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3847 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
3848 		spin_unlock_bh(&rt6_exception_lock);
3849 	}
3850 	return 0;
3851 }
3852 
3853 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3854 {
3855 	struct rt6_mtu_change_arg arg = {
3856 		.dev = dev,
3857 		.mtu = mtu,
3858 	};
3859 
3860 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3861 }
3862 
3863 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3864 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3865 	[RTA_OIF]               = { .type = NLA_U32 },
3866 	[RTA_IIF]		= { .type = NLA_U32 },
3867 	[RTA_PRIORITY]          = { .type = NLA_U32 },
3868 	[RTA_METRICS]           = { .type = NLA_NESTED },
3869 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
3870 	[RTA_PREF]              = { .type = NLA_U8 },
3871 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
3872 	[RTA_ENCAP]		= { .type = NLA_NESTED },
3873 	[RTA_EXPIRES]		= { .type = NLA_U32 },
3874 	[RTA_UID]		= { .type = NLA_U32 },
3875 	[RTA_MARK]		= { .type = NLA_U32 },
3876 };
3877 
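/* Translate an RTM_NEWROUTE/RTM_DELROUTE request into a struct
 * fib6_config: the rtmsg header supplies table, prefix lengths,
 * protocol and type, while the RTA_* attributes fill in addresses,
 * device, metrics and encapsulation.
 */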
3878 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3879 			      struct fib6_config *cfg,
3880 			      struct netlink_ext_ack *extack)
3881 {
3882 	struct rtmsg *rtm;
3883 	struct nlattr *tb[RTA_MAX+1];
3884 	unsigned int pref;
3885 	int err;
3886 
3887 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3888 			  NULL);
3889 	if (err < 0)
3890 		goto errout;
3891 
3892 	err = -EINVAL;
3893 	rtm = nlmsg_data(nlh);
3894 	memset(cfg, 0, sizeof(*cfg));
3895 
3896 	cfg->fc_table = rtm->rtm_table;
3897 	cfg->fc_dst_len = rtm->rtm_dst_len;
3898 	cfg->fc_src_len = rtm->rtm_src_len;
3899 	cfg->fc_flags = RTF_UP;
3900 	cfg->fc_protocol = rtm->rtm_protocol;
3901 	cfg->fc_type = rtm->rtm_type;
3902 
3903 	if (rtm->rtm_type == RTN_UNREACHABLE ||
3904 	    rtm->rtm_type == RTN_BLACKHOLE ||
3905 	    rtm->rtm_type == RTN_PROHIBIT ||
3906 	    rtm->rtm_type == RTN_THROW)
3907 		cfg->fc_flags |= RTF_REJECT;
3908 
3909 	if (rtm->rtm_type == RTN_LOCAL)
3910 		cfg->fc_flags |= RTF_LOCAL;
3911 
3912 	if (rtm->rtm_flags & RTM_F_CLONED)
3913 		cfg->fc_flags |= RTF_CACHE;
3914 
3915 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
3916 
3917 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3918 	cfg->fc_nlinfo.nlh = nlh;
3919 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3920 
3921 	if (tb[RTA_GATEWAY]) {
3922 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3923 		cfg->fc_flags |= RTF_GATEWAY;
3924 	}
3925 
3926 	if (tb[RTA_DST]) {
3927 		int plen = (rtm->rtm_dst_len + 7) >> 3;
3928 
3929 		if (nla_len(tb[RTA_DST]) < plen)
3930 			goto errout;
3931 
3932 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3933 	}
3934 
3935 	if (tb[RTA_SRC]) {
3936 		int plen = (rtm->rtm_src_len + 7) >> 3;
3937 
3938 		if (nla_len(tb[RTA_SRC]) < plen)
3939 			goto errout;
3940 
3941 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3942 	}
3943 
3944 	if (tb[RTA_PREFSRC])
3945 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3946 
3947 	if (tb[RTA_OIF])
3948 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3949 
3950 	if (tb[RTA_PRIORITY])
3951 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3952 
3953 	if (tb[RTA_METRICS]) {
3954 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3955 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3956 	}
3957 
3958 	if (tb[RTA_TABLE])
3959 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3960 
3961 	if (tb[RTA_MULTIPATH]) {
3962 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3963 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3964 
3965 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3966 						     cfg->fc_mp_len, extack);
3967 		if (err < 0)
3968 			goto errout;
3969 	}
3970 
3971 	if (tb[RTA_PREF]) {
3972 		pref = nla_get_u8(tb[RTA_PREF]);
3973 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
3974 		    pref != ICMPV6_ROUTER_PREF_HIGH)
3975 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
3976 		cfg->fc_flags |= RTF_PREF(pref);
3977 	}
3978 
3979 	if (tb[RTA_ENCAP])
3980 		cfg->fc_encap = tb[RTA_ENCAP];
3981 
3982 	if (tb[RTA_ENCAP_TYPE]) {
3983 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3984 
3985 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3986 		if (err < 0)
3987 			goto errout;
3988 	}
3989 
3990 	if (tb[RTA_EXPIRES]) {
3991 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3992 
3993 		if (addrconf_finite_timeout(timeout)) {
3994 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3995 			cfg->fc_flags |= RTF_EXPIRES;
3996 		}
3997 	}
3998 
3999 	err = 0;
4000 errout:
4001 	return err;
4002 }
4003 
4004 struct rt6_nh {
4005 	struct rt6_info *rt6_info;
4006 	struct fib6_config r_cfg;
4007 	struct mx6_config mxc;
4008 	struct list_head next;
4009 };
4010 
4011 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4012 {
4013 	struct rt6_nh *nh;
4014 
4015 	list_for_each_entry(nh, rt6_nh_list, next) {
		pr_warn("multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4017 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4018 		        nh->r_cfg.fc_ifindex);
4019 	}
4020 }
4021 
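/* Queue @rt on @rt6_nh_list for a later bulk insert.  Duplicate
 * nexthops are rejected with -EEXIST, and the netlink metrics are
 * converted up front so the insert phase cannot fail on them.
 */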
4022 static int ip6_route_info_append(struct list_head *rt6_nh_list,
4023 				 struct rt6_info *rt, struct fib6_config *r_cfg)
4024 {
4025 	struct rt6_nh *nh;
4026 	int err = -EEXIST;
4027 
4028 	list_for_each_entry(nh, rt6_nh_list, next) {
4029 		/* check if rt6_info already exists */
4030 		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4031 			return err;
4032 	}
4033 
4034 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4035 	if (!nh)
4036 		return -ENOMEM;
4037 	nh->rt6_info = rt;
4038 	err = ip6_convert_metrics(&nh->mxc, r_cfg);
4039 	if (err) {
4040 		kfree(nh);
4041 		return err;
4042 	}
4043 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4044 	list_add_tail(&nh->next, rt6_nh_list);
4045 
4046 	return 0;
4047 }
4048 
4049 static void ip6_route_mpath_notify(struct rt6_info *rt,
4050 				   struct rt6_info *rt_last,
4051 				   struct nl_info *info,
4052 				   __u16 nlflags)
4053 {
4054 	/* if this is an APPEND route, then rt points to the first route
4055 	 * inserted and rt_last points to last route inserted. Userspace
4056 	 * wants a consistent dump of the route which starts at the first
4057 	 * nexthop. Since sibling routes are always added at the end of
4058 	 * the list, find the first sibling of the last route appended
4059 	 */
4060 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4061 		rt = list_first_entry(&rt_last->rt6i_siblings,
4062 				      struct rt6_info,
4063 				      rt6i_siblings);
4064 	}
4065 
4066 	if (rt)
4067 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4068 }
4069 
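/* Install a multipath route, i.e. an RTM_NEWROUTE request whose
 * nexthops arrive in an RTA_MULTIPATH attribute, as generated for
 * example by
 *
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 \
 *		nexthop via fe80::2 dev eth1
 *
 * (addresses and devices are purely illustrative).  This works in two
 * phases: each rtnexthop entry is first parsed into its own rt6_info
 * and queued on rt6_nh_list, then the queued routes are inserted one
 * by one.  If an insert fails, the routes added so far are deleted
 * again, and notifications are batched so userspace sees a single
 * RTM_NEWROUTE message for the whole route.
 */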
4070 static int ip6_route_multipath_add(struct fib6_config *cfg,
4071 				   struct netlink_ext_ack *extack)
4072 {
4073 	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4074 	struct nl_info *info = &cfg->fc_nlinfo;
4075 	struct fib6_config r_cfg;
4076 	struct rtnexthop *rtnh;
4077 	struct rt6_info *rt;
4078 	struct rt6_nh *err_nh;
4079 	struct rt6_nh *nh, *nh_safe;
4080 	__u16 nlflags;
4081 	int remaining;
4082 	int attrlen;
4083 	int err = 1;
4084 	int nhn = 0;
4085 	int replace = (cfg->fc_nlinfo.nlh &&
4086 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4087 	LIST_HEAD(rt6_nh_list);
4088 
4089 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4090 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4091 		nlflags |= NLM_F_APPEND;
4092 
4093 	remaining = cfg->fc_mp_len;
4094 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4095 
	/* Parse each rtnexthop entry in the RTA_MULTIPATH attribute and
	 * build a list (rt6_nh_list) with one rt6_info per nexthop.
	 */
4099 	while (rtnh_ok(rtnh, remaining)) {
4100 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4101 		if (rtnh->rtnh_ifindex)
4102 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4103 
4104 		attrlen = rtnh_attrlen(rtnh);
4105 		if (attrlen > 0) {
4106 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4107 
4108 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4109 			if (nla) {
4110 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4111 				r_cfg.fc_flags |= RTF_GATEWAY;
4112 			}
4113 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4114 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4115 			if (nla)
4116 				r_cfg.fc_encap_type = nla_get_u16(nla);
4117 		}
4118 
4119 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4120 		rt = ip6_route_info_create(&r_cfg, extack);
4121 		if (IS_ERR(rt)) {
4122 			err = PTR_ERR(rt);
4123 			rt = NULL;
4124 			goto cleanup;
4125 		}
4126 
4127 		rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4128 
4129 		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
4130 		if (err) {
4131 			dst_release_immediate(&rt->dst);
4132 			goto cleanup;
4133 		}
4134 
4135 		rtnh = rtnh_next(rtnh, &remaining);
4136 	}
4137 
4138 	/* for add and replace send one notification with all nexthops.
4139 	 * Skip the notification in fib6_add_rt2node and send one with
4140 	 * the full route when done
4141 	 */
4142 	info->skip_notify = 1;
4143 
4144 	err_nh = NULL;
4145 	list_for_each_entry(nh, &rt6_nh_list, next) {
4146 		rt_last = nh->rt6_info;
4147 		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
4148 		/* save reference to first route for notification */
4149 		if (!rt_notif && !err)
4150 			rt_notif = nh->rt6_info;
4151 
4152 		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		/* nh->rt6_info is used or freed at this point, reset to NULL */
4154 		if (err) {
4155 			if (replace && nhn)
4156 				ip6_print_replace_route_err(&rt6_nh_list);
4157 			err_nh = nh;
4158 			goto add_errout;
4159 		}
4160 
		/* Because each nexthop is inserted as an individual route, we
		 * drop these flags after the first nexthop: on a collision,
		 * adding the first nexthop has already failed because
		 * fib6_add_rt2node() rejected it; on a replace, the old
		 * nexthops have been replaced by the first new one, and the
		 * remaining nexthops must be appended to it.
		 */
4168 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4169 						     NLM_F_REPLACE);
4170 		nhn++;
4171 	}
4172 
4173 	/* success ... tell user about new route */
4174 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4175 	goto cleanup;
4176 
4177 add_errout:
4178 	/* send notification for routes that were added so that
4179 	 * the delete notifications sent by ip6_route_del are
4180 	 * coherent
4181 	 */
4182 	if (rt_notif)
4183 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4184 
4185 	/* Delete routes that were already added */
4186 	list_for_each_entry(nh, &rt6_nh_list, next) {
4187 		if (err_nh == nh)
4188 			break;
4189 		ip6_route_del(&nh->r_cfg, extack);
4190 	}
4191 
4192 cleanup:
4193 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4194 		if (nh->rt6_info)
4195 			dst_release_immediate(&nh->rt6_info->dst);
4196 		kfree(nh->mxc.mx);
4197 		list_del(&nh->next);
4198 		kfree(nh);
4199 	}
4200 
4201 	return err;
4202 }
4203 
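/* Delete a multipath route nexthop by nexthop.  Unlike the add path
 * there is nothing to roll back, so every entry is attempted and the
 * last error, if any, is returned.
 */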
4204 static int ip6_route_multipath_del(struct fib6_config *cfg,
4205 				   struct netlink_ext_ack *extack)
4206 {
4207 	struct fib6_config r_cfg;
4208 	struct rtnexthop *rtnh;
4209 	int remaining;
4210 	int attrlen;
4211 	int err = 1, last_err = 0;
4212 
4213 	remaining = cfg->fc_mp_len;
4214 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4215 
	/* Parse each rtnexthop entry in the RTA_MULTIPATH attribute */
4217 	while (rtnh_ok(rtnh, remaining)) {
4218 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4219 		if (rtnh->rtnh_ifindex)
4220 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4221 
4222 		attrlen = rtnh_attrlen(rtnh);
4223 		if (attrlen > 0) {
4224 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4225 
4226 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4227 			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4229 				r_cfg.fc_flags |= RTF_GATEWAY;
4230 			}
4231 		}
4232 		err = ip6_route_del(&r_cfg, extack);
4233 		if (err)
4234 			last_err = err;
4235 
4236 		rtnh = rtnh_next(rtnh, &remaining);
4237 	}
4238 
4239 	return last_err;
4240 }
4241 
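/* Netlink doit handler for RTM_DELROUTE. */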
4242 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4243 			      struct netlink_ext_ack *extack)
4244 {
4245 	struct fib6_config cfg;
4246 	int err;
4247 
4248 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4249 	if (err < 0)
4250 		return err;
4251 
	if (cfg.fc_mp) {
		return ip6_route_multipath_del(&cfg, extack);
	} else {
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
4258 }
4259 
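/* Netlink doit handler for RTM_NEWROUTE. */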
4260 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4261 			      struct netlink_ext_ack *extack)
4262 {
4263 	struct fib6_config cfg;
4264 	int err;
4265 
4266 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4267 	if (err < 0)
4268 		return err;
4269 
4270 	if (cfg.fc_mp)
4271 		return ip6_route_multipath_add(&cfg, extack);
4272 	else
4273 		return ip6_route_add(&cfg, extack);
4274 }
4275 
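/* Worst-case message size for a notification about @rt.  This must
 * stay in sync with what rt6_fill_node() can emit: inet6_rt_notify()
 * treats -EMSGSIZE from rt6_fill_node() as a bug in this estimate.
 */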
4276 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4277 {
4278 	int nexthop_len = 0;
4279 
4280 	if (rt->rt6i_nsiblings) {
4281 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4282 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4283 			    + nla_total_size(16) /* RTA_GATEWAY */
4284 			    + lwtunnel_get_encap_size(rt->dst.lwtstate);
4285 
4286 		nexthop_len *= rt->rt6i_nsiblings;
4287 	}
4288 
4289 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4290 	       + nla_total_size(16) /* RTA_SRC */
4291 	       + nla_total_size(16) /* RTA_DST */
4292 	       + nla_total_size(16) /* RTA_GATEWAY */
4293 	       + nla_total_size(16) /* RTA_PREFSRC */
4294 	       + nla_total_size(4) /* RTA_TABLE */
4295 	       + nla_total_size(4) /* RTA_IIF */
4296 	       + nla_total_size(4) /* RTA_OIF */
4297 	       + nla_total_size(4) /* RTA_PRIORITY */
4298 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4299 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4300 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4301 	       + nla_total_size(1) /* RTA_PREF */
4302 	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
4303 	       + nexthop_len;
4304 }
4305 
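/* Emit the attributes describing a single nexthop (gateway, output
 * interface, lwtunnel encap) and accumulate its RTNH_F_* flags.
 * @skip_oif is set for multipath encoding, where the ifindex already
 * lives in the enclosing rtnexthop struct.
 */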
4306 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4307 			    unsigned int *flags, bool skip_oif)
4308 {
4309 	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4310 		*flags |= RTNH_F_DEAD;
4311 
4312 	if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4313 		*flags |= RTNH_F_LINKDOWN;
4314 		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4315 			*flags |= RTNH_F_DEAD;
4316 	}
4317 
4318 	if (rt->rt6i_flags & RTF_GATEWAY) {
4319 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4320 			goto nla_put_failure;
4321 	}
4322 
4323 	*flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
4324 	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4325 		*flags |= RTNH_F_OFFLOAD;
4326 
	/* not needed for multipath encoding because it has a rtnexthop struct */
4328 	if (!skip_oif && rt->dst.dev &&
4329 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4330 		goto nla_put_failure;
4331 
4332 	if (rt->dst.lwtstate &&
4333 	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4334 		goto nla_put_failure;
4335 
4336 	return 0;
4337 
4338 nla_put_failure:
4339 	return -EMSGSIZE;
4340 }
4341 
4342 /* add multipath next hop */
4343 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4344 {
4345 	struct rtnexthop *rtnh;
4346 	unsigned int flags = 0;
4347 
4348 	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4349 	if (!rtnh)
4350 		goto nla_put_failure;
4351 
4352 	rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
4353 	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4354 
4355 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4356 		goto nla_put_failure;
4357 
4358 	rtnh->rtnh_flags = flags;
4359 
4360 	/* length of rtnetlink header + attributes */
4361 	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4362 
4363 	return 0;
4364 
4365 nla_put_failure:
4366 	return -EMSGSIZE;
4367 }
4368 
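/* Build one route message of the given @type for @rt into @skb: the
 * rtmsg header, the address and metric attributes, and either a
 * single nexthop or an RTA_MULTIPATH nest when siblings exist.
 */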
4369 static int rt6_fill_node(struct net *net,
4370 			 struct sk_buff *skb, struct rt6_info *rt,
4371 			 struct in6_addr *dst, struct in6_addr *src,
4372 			 int iif, int type, u32 portid, u32 seq,
4373 			 unsigned int flags)
4374 {
4375 	u32 metrics[RTAX_MAX];
4376 	struct rtmsg *rtm;
4377 	struct nlmsghdr *nlh;
4378 	long expires;
4379 	u32 table;
4380 
4381 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4382 	if (!nlh)
4383 		return -EMSGSIZE;
4384 
4385 	rtm = nlmsg_data(nlh);
4386 	rtm->rtm_family = AF_INET6;
4387 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
4388 	rtm->rtm_src_len = rt->rt6i_src.plen;
4389 	rtm->rtm_tos = 0;
4390 	if (rt->rt6i_table)
4391 		table = rt->rt6i_table->tb6_id;
4392 	else
4393 		table = RT6_TABLE_UNSPEC;
4394 	rtm->rtm_table = table;
4395 	if (nla_put_u32(skb, RTA_TABLE, table))
4396 		goto nla_put_failure;
4397 	if (rt->rt6i_flags & RTF_REJECT) {
4398 		switch (rt->dst.error) {
4399 		case -EINVAL:
4400 			rtm->rtm_type = RTN_BLACKHOLE;
4401 			break;
4402 		case -EACCES:
4403 			rtm->rtm_type = RTN_PROHIBIT;
4404 			break;
4405 		case -EAGAIN:
4406 			rtm->rtm_type = RTN_THROW;
4407 			break;
4408 		default:
4409 			rtm->rtm_type = RTN_UNREACHABLE;
4410 			break;
4411 		}
	} else if (rt->rt6i_flags & RTF_LOCAL)
4414 		rtm->rtm_type = RTN_LOCAL;
4415 	else if (rt->rt6i_flags & RTF_ANYCAST)
4416 		rtm->rtm_type = RTN_ANYCAST;
4417 	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4418 		rtm->rtm_type = RTN_LOCAL;
4419 	else
4420 		rtm->rtm_type = RTN_UNICAST;
4421 	rtm->rtm_flags = 0;
4422 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4423 	rtm->rtm_protocol = rt->rt6i_protocol;
4424 
4425 	if (rt->rt6i_flags & RTF_CACHE)
4426 		rtm->rtm_flags |= RTM_F_CLONED;
4427 
4428 	if (dst) {
4429 		if (nla_put_in6_addr(skb, RTA_DST, dst))
4430 			goto nla_put_failure;
4431 		rtm->rtm_dst_len = 128;
4432 	} else if (rtm->rtm_dst_len)
4433 		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4434 			goto nla_put_failure;
4435 #ifdef CONFIG_IPV6_SUBTREES
4436 	if (src) {
4437 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4438 			goto nla_put_failure;
4439 		rtm->rtm_src_len = 128;
4440 	} else if (rtm->rtm_src_len &&
4441 		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4442 		goto nla_put_failure;
4443 #endif
4444 	if (iif) {
4445 #ifdef CONFIG_IPV6_MROUTE
4446 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4447 			int err = ip6mr_get_route(net, skb, rtm, portid);
4448 
4449 			if (err == 0)
4450 				return 0;
4451 			if (err < 0)
4452 				goto nla_put_failure;
4453 		} else
4454 #endif
4455 			if (nla_put_u32(skb, RTA_IIF, iif))
4456 				goto nla_put_failure;
4457 	} else if (dst) {
4458 		struct in6_addr saddr_buf;
4459 		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4460 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4461 			goto nla_put_failure;
4462 	}
4463 
4464 	if (rt->rt6i_prefsrc.plen) {
4465 		struct in6_addr saddr_buf;
4466 		saddr_buf = rt->rt6i_prefsrc.addr;
4467 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4468 			goto nla_put_failure;
4469 	}
4470 
4471 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4472 	if (rt->rt6i_pmtu)
4473 		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4474 	if (rtnetlink_put_metrics(skb, metrics) < 0)
4475 		goto nla_put_failure;
4476 
4477 	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4478 		goto nla_put_failure;
4479 
4480 	/* For multipath routes, walk the siblings list and add
4481 	 * each as a nexthop within RTA_MULTIPATH.
4482 	 */
4483 	if (rt->rt6i_nsiblings) {
4484 		struct rt6_info *sibling, *next_sibling;
4485 		struct nlattr *mp;
4486 
4487 		mp = nla_nest_start(skb, RTA_MULTIPATH);
4488 		if (!mp)
4489 			goto nla_put_failure;
4490 
4491 		if (rt6_add_nexthop(skb, rt) < 0)
4492 			goto nla_put_failure;
4493 
4494 		list_for_each_entry_safe(sibling, next_sibling,
4495 					 &rt->rt6i_siblings, rt6i_siblings) {
4496 			if (rt6_add_nexthop(skb, sibling) < 0)
4497 				goto nla_put_failure;
4498 		}
4499 
4500 		nla_nest_end(skb, mp);
4501 	} else {
4502 		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4503 			goto nla_put_failure;
4504 	}
4505 
4506 	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4507 
4508 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4509 		goto nla_put_failure;
4510 
4511 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4512 		goto nla_put_failure;
4513 
4515 	nlmsg_end(skb, nlh);
4516 	return 0;
4517 
4518 nla_put_failure:
4519 	nlmsg_cancel(skb, nlh);
4520 	return -EMSGSIZE;
4521 }
4522 
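/* fib6 walker callback for route dumps: skip the null entry and,
 * when userspace asked for prefix routes only (RTM_F_PREFIX),
 * anything that is not a prefix route; emit everything else via
 * rt6_fill_node().
 */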
4523 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4524 {
4525 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4526 	struct net *net = arg->net;
4527 
4528 	if (rt == net->ipv6.ip6_null_entry)
4529 		return 0;
4530 
4531 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4532 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4533 
4534 		/* user wants prefix routes only */
4535 		if (rtm->rtm_flags & RTM_F_PREFIX &&
4536 		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4537 			/* success since this is not a prefix route */
4538 			return 1;
4539 		}
4540 	}
4541 
4542 	return rt6_fill_node(net,
4543 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4544 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4545 		     NLM_F_MULTI);
4546 }
4547 
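/* Netlink doit handler for RTM_GETROUTE: build a flow from the
 * request attributes, perform an input- or output-side lookup, and
 * return the result.  With RTM_F_FIB_MATCH the matching FIB entry is
 * returned instead of the cached clone.
 */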
4548 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4549 			      struct netlink_ext_ack *extack)
4550 {
4551 	struct net *net = sock_net(in_skb->sk);
4552 	struct nlattr *tb[RTA_MAX+1];
4553 	int err, iif = 0, oif = 0;
4554 	struct dst_entry *dst;
4555 	struct rt6_info *rt;
4556 	struct sk_buff *skb;
4557 	struct rtmsg *rtm;
4558 	struct flowi6 fl6;
4559 	bool fibmatch;
4560 
4561 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4562 			  extack);
4563 	if (err < 0)
4564 		goto errout;
4565 
4566 	err = -EINVAL;
4567 	memset(&fl6, 0, sizeof(fl6));
4568 	rtm = nlmsg_data(nlh);
4569 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4570 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4571 
4572 	if (tb[RTA_SRC]) {
4573 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4574 			goto errout;
4575 
4576 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4577 	}
4578 
4579 	if (tb[RTA_DST]) {
4580 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4581 			goto errout;
4582 
4583 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4584 	}
4585 
4586 	if (tb[RTA_IIF])
4587 		iif = nla_get_u32(tb[RTA_IIF]);
4588 
4589 	if (tb[RTA_OIF])
4590 		oif = nla_get_u32(tb[RTA_OIF]);
4591 
4592 	if (tb[RTA_MARK])
4593 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4594 
4595 	if (tb[RTA_UID])
4596 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4597 					   nla_get_u32(tb[RTA_UID]));
4598 	else
4599 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4600 
4601 	if (iif) {
4602 		struct net_device *dev;
4603 		int flags = 0;
4604 
4605 		rcu_read_lock();
4606 
4607 		dev = dev_get_by_index_rcu(net, iif);
4608 		if (!dev) {
4609 			rcu_read_unlock();
4610 			err = -ENODEV;
4611 			goto errout;
4612 		}
4613 
4614 		fl6.flowi6_iif = iif;
4615 
4616 		if (!ipv6_addr_any(&fl6.saddr))
4617 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4618 
4619 		dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4620 
4621 		rcu_read_unlock();
4622 	} else {
4623 		fl6.flowi6_oif = oif;
4624 
4625 		dst = ip6_route_output(net, NULL, &fl6);
4626 	}
4627 
4629 	rt = container_of(dst, struct rt6_info, dst);
4630 	if (rt->dst.error) {
4631 		err = rt->dst.error;
4632 		ip6_rt_put(rt);
4633 		goto errout;
4634 	}
4635 
4636 	if (rt == net->ipv6.ip6_null_entry) {
4637 		err = rt->dst.error;
4638 		ip6_rt_put(rt);
4639 		goto errout;
4640 	}
4641 
4642 	if (fibmatch && rt->from) {
4643 		struct rt6_info *ort = rt->from;
4644 
4645 		dst_hold(&ort->dst);
4646 		ip6_rt_put(rt);
4647 		rt = ort;
4648 	}
4649 
4650 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4651 	if (!skb) {
4652 		ip6_rt_put(rt);
4653 		err = -ENOBUFS;
4654 		goto errout;
4655 	}
4656 
4657 	skb_dst_set(skb, &rt->dst);
4658 	if (fibmatch)
4659 		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4660 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4661 				    nlh->nlmsg_seq, 0);
4662 	else
4663 		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4664 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4665 				    nlh->nlmsg_seq, 0);
4666 	if (err < 0) {
4667 		kfree_skb(skb);
4668 		goto errout;
4669 	}
4670 
4671 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4672 errout:
4673 	return err;
4674 }
4675 
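/* Notify RTNLGRP_IPV6_ROUTE listeners about a change to @rt.  The skb
 * is sized with rt6_nlmsg_size(), so -EMSGSIZE from rt6_fill_node()
 * indicates a bug in that estimate rather than a transient failure.
 */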
4676 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4677 		     unsigned int nlm_flags)
4678 {
4679 	struct sk_buff *skb;
4680 	struct net *net = info->nl_net;
4681 	u32 seq;
4682 	int err;
4683 
4684 	err = -ENOBUFS;
4685 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4686 
4687 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4688 	if (!skb)
4689 		goto errout;
4690 
4691 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4692 				event, info->portid, seq, nlm_flags);
4693 	if (err < 0) {
4694 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4695 		WARN_ON(err == -EMSGSIZE);
4696 		kfree_skb(skb);
4697 		goto errout;
4698 	}
4699 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4700 		    info->nlh, gfp_any());
4701 	return;
4702 errout:
4703 	if (err < 0)
4704 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4705 }
4706 
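/* Device notifier: the special null/prohibit/blackhole entries hold
 * device and inet6_dev references on loopback.  Take them when the
 * loopback device registers and drop them (only once) when it
 * unregisters.
 */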
4707 static int ip6_route_dev_notify(struct notifier_block *this,
4708 				unsigned long event, void *ptr)
4709 {
4710 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4711 	struct net *net = dev_net(dev);
4712 
4713 	if (!(dev->flags & IFF_LOOPBACK))
4714 		return NOTIFY_OK;
4715 
4716 	if (event == NETDEV_REGISTER) {
4717 		net->ipv6.ip6_null_entry->dst.dev = dev;
4718 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4719 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4720 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4721 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4722 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4723 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4724 #endif
	} else if (event == NETDEV_UNREGISTER &&
4726 		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER can be fired multiple times by
		 * netdev_wait_allrefs().  Make sure we only do this once.
4729 		 */
4730 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4731 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4732 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4733 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4734 #endif
4735 	}
4736 
4737 	return NOTIFY_OK;
4738 }
4739 
4740 /*
4741  *	/proc
4742  */
4743 
4744 #ifdef CONFIG_PROC_FS
4745 
4746 static const struct file_operations ipv6_route_proc_fops = {
4747 	.open		= ipv6_route_open,
4748 	.read		= seq_read,
4749 	.llseek		= seq_lseek,
4750 	.release	= seq_release_net,
4751 };
4752 
4753 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4754 {
	struct net *net = (struct net *)seq->private;

4756 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4757 		   net->ipv6.rt6_stats->fib_nodes,
4758 		   net->ipv6.rt6_stats->fib_route_nodes,
4759 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4760 		   net->ipv6.rt6_stats->fib_rt_entries,
4761 		   net->ipv6.rt6_stats->fib_rt_cache,
4762 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4763 		   net->ipv6.rt6_stats->fib_discarded_routes);
4764 
4765 	return 0;
4766 }
4767 
4768 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4769 {
4770 	return single_open_net(inode, file, rt6_stats_seq_show);
4771 }
4772 
4773 static const struct file_operations rt6_stats_seq_fops = {
4774 	.open	 = rt6_stats_seq_open,
4775 	.read	 = seq_read,
4776 	.llseek	 = seq_lseek,
4777 	.release = single_release_net,
4778 };
4779 #endif	/* CONFIG_PROC_FS */
4780 
4781 #ifdef CONFIG_SYSCTL
4782 
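/* Handler for net.ipv6.route.flush.  The sysctl is write-only; a
 * write triggers an immediate garbage-collection pass using the
 * flush_delay value that was configured before this write.
 */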
static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
				     void __user *buffer, size_t *lenp,
				     loff_t *ppos)
4786 {
	struct net *net;
	int delay;
	int ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
4797 }
4798 
4799 struct ctl_table ipv6_route_table_template[] = {
4800 	{
4801 		.procname	=	"flush",
4802 		.data		=	&init_net.ipv6.sysctl.flush_delay,
4803 		.maxlen		=	sizeof(int),
4804 		.mode		=	0200,
4805 		.proc_handler	=	ipv6_sysctl_rtcache_flush
4806 	},
4807 	{
4808 		.procname	=	"gc_thresh",
4809 		.data		=	&ip6_dst_ops_template.gc_thresh,
4810 		.maxlen		=	sizeof(int),
4811 		.mode		=	0644,
4812 		.proc_handler	=	proc_dointvec,
4813 	},
4814 	{
4815 		.procname	=	"max_size",
4816 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
4817 		.maxlen		=	sizeof(int),
4818 		.mode		=	0644,
4819 		.proc_handler	=	proc_dointvec,
4820 	},
4821 	{
4822 		.procname	=	"gc_min_interval",
4823 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4824 		.maxlen		=	sizeof(int),
4825 		.mode		=	0644,
4826 		.proc_handler	=	proc_dointvec_jiffies,
4827 	},
4828 	{
4829 		.procname	=	"gc_timeout",
4830 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4831 		.maxlen		=	sizeof(int),
4832 		.mode		=	0644,
4833 		.proc_handler	=	proc_dointvec_jiffies,
4834 	},
4835 	{
4836 		.procname	=	"gc_interval",
4837 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
4838 		.maxlen		=	sizeof(int),
4839 		.mode		=	0644,
4840 		.proc_handler	=	proc_dointvec_jiffies,
4841 	},
4842 	{
4843 		.procname	=	"gc_elasticity",
4844 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4845 		.maxlen		=	sizeof(int),
4846 		.mode		=	0644,
4847 		.proc_handler	=	proc_dointvec,
4848 	},
4849 	{
4850 		.procname	=	"mtu_expires",
4851 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4852 		.maxlen		=	sizeof(int),
4853 		.mode		=	0644,
4854 		.proc_handler	=	proc_dointvec_jiffies,
4855 	},
4856 	{
4857 		.procname	=	"min_adv_mss",
4858 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
4859 		.maxlen		=	sizeof(int),
4860 		.mode		=	0644,
4861 		.proc_handler	=	proc_dointvec,
4862 	},
4863 	{
4864 		.procname	=	"gc_min_interval_ms",
4865 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4866 		.maxlen		=	sizeof(int),
4867 		.mode		=	0644,
4868 		.proc_handler	=	proc_dointvec_ms_jiffies,
4869 	},
4870 	{ }
4871 };
4872 
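/* Clone the sysctl template for a new namespace and point each entry
 * at that namespace's data.  The indices below must match the order
 * of ipv6_route_table_template[] above.
 */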
4873 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4874 {
4875 	struct ctl_table *table;
4876 
4877 	table = kmemdup(ipv6_route_table_template,
4878 			sizeof(ipv6_route_table_template),
4879 			GFP_KERNEL);
4880 
4881 	if (table) {
4882 		table[0].data = &net->ipv6.sysctl.flush_delay;
4883 		table[0].extra1 = net;
4884 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4885 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4886 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4887 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4888 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4889 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4890 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4891 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4892 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4893 
4894 		/* Don't export sysctls to unprivileged users */
4895 		if (net->user_ns != &init_user_ns)
4896 			table[0].procname = NULL;
4897 	}
4898 
4899 	return table;
4900 }
4901 #endif
4902 
4903 static int __net_init ip6_route_net_init(struct net *net)
4904 {
4905 	int ret = -ENOMEM;
4906 
4907 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4908 	       sizeof(net->ipv6.ip6_dst_ops));
4909 
4910 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4911 		goto out_ip6_dst_ops;
4912 
4913 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4914 					   sizeof(*net->ipv6.ip6_null_entry),
4915 					   GFP_KERNEL);
4916 	if (!net->ipv6.ip6_null_entry)
4917 		goto out_ip6_dst_entries;
4918 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4919 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4920 			 ip6_template_metrics, true);
4921 
4922 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4923 	net->ipv6.fib6_has_custom_rules = false;
4924 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4925 					       sizeof(*net->ipv6.ip6_prohibit_entry),
4926 					       GFP_KERNEL);
4927 	if (!net->ipv6.ip6_prohibit_entry)
4928 		goto out_ip6_null_entry;
4929 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4930 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4931 			 ip6_template_metrics, true);
4932 
4933 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4934 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
4935 					       GFP_KERNEL);
4936 	if (!net->ipv6.ip6_blk_hole_entry)
4937 		goto out_ip6_prohibit_entry;
4938 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4939 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4940 			 ip6_template_metrics, true);
4941 #endif
4942 
4943 	net->ipv6.sysctl.flush_delay = 0;
4944 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
4945 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4946 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4947 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4948 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4949 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4950 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4951 
4952 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
4953 
4954 	ret = 0;
4955 out:
4956 	return ret;
4957 
4958 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4959 out_ip6_prohibit_entry:
4960 	kfree(net->ipv6.ip6_prohibit_entry);
4961 out_ip6_null_entry:
4962 	kfree(net->ipv6.ip6_null_entry);
4963 #endif
4964 out_ip6_dst_entries:
4965 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4966 out_ip6_dst_ops:
4967 	goto out;
4968 }
4969 
4970 static void __net_exit ip6_route_net_exit(struct net *net)
4971 {
4972 	kfree(net->ipv6.ip6_null_entry);
4973 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4974 	kfree(net->ipv6.ip6_prohibit_entry);
4975 	kfree(net->ipv6.ip6_blk_hole_entry);
4976 #endif
4977 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4978 }
4979 
4980 static int __net_init ip6_route_net_init_late(struct net *net)
4981 {
4982 #ifdef CONFIG_PROC_FS
4983 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4984 	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4985 #endif
4986 	return 0;
4987 }
4988 
4989 static void __net_exit ip6_route_net_exit_late(struct net *net)
4990 {
4991 #ifdef CONFIG_PROC_FS
4992 	remove_proc_entry("ipv6_route", net->proc_net);
4993 	remove_proc_entry("rt6_stats", net->proc_net);
4994 #endif
4995 }
4996 
4997 static struct pernet_operations ip6_route_net_ops = {
4998 	.init = ip6_route_net_init,
4999 	.exit = ip6_route_net_exit,
5000 };
5001 
5002 static int __net_init ipv6_inetpeer_init(struct net *net)
5003 {
5004 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5005 
5006 	if (!bp)
5007 		return -ENOMEM;
5008 	inet_peer_base_init(bp);
5009 	net->ipv6.peers = bp;
5010 	return 0;
5011 }
5012 
5013 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5014 {
5015 	struct inet_peer_base *bp = net->ipv6.peers;
5016 
5017 	net->ipv6.peers = NULL;
5018 	inetpeer_invalidate_tree(bp);
5019 	kfree(bp);
5020 }
5021 
5022 static struct pernet_operations ipv6_inetpeer_ops = {
5023 	.init	=	ipv6_inetpeer_init,
5024 	.exit	=	ipv6_inetpeer_exit,
5025 };
5026 
5027 static struct pernet_operations ip6_route_net_late_ops = {
5028 	.init = ip6_route_net_init_late,
5029 	.exit = ip6_route_net_exit_late,
5030 };
5031 
5032 static struct notifier_block ip6_route_dev_notifier = {
5033 	.notifier_call = ip6_route_dev_notify,
5034 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5035 };
5036 
5037 void __init ip6_route_init_special_entries(void)
5038 {
	/* The loopback device is registered before this code runs, so the
	 * special rt6_info entries never took a reference on it; take the
	 * references manually for init_net.
	 */
5042 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5043 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5045 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5046 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5047 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5048 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
5050 }
5051 
5052 int __init ip6_route_init(void)
5053 {
5054 	int ret;
5055 	int cpu;
5056 
5057 	ret = -ENOMEM;
5058 	ip6_dst_ops_template.kmem_cachep =
5059 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5060 				  SLAB_HWCACHE_ALIGN, NULL);
5061 	if (!ip6_dst_ops_template.kmem_cachep)
5062 		goto out;
5063 
5064 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5065 	if (ret)
5066 		goto out_kmem_cache;
5067 
5068 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5069 	if (ret)
5070 		goto out_dst_entries;
5071 
5072 	ret = register_pernet_subsys(&ip6_route_net_ops);
5073 	if (ret)
5074 		goto out_register_inetpeer;
5075 
5076 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5077 
5078 	ret = fib6_init();
5079 	if (ret)
5080 		goto out_register_subsys;
5081 
5082 	ret = xfrm6_init();
5083 	if (ret)
5084 		goto out_fib6_init;
5085 
5086 	ret = fib6_rules_init();
5087 	if (ret)
5088 		goto xfrm6_init;
5089 
5090 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5091 	if (ret)
5092 		goto fib6_rules_init;
5093 
5094 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5095 				   inet6_rtm_newroute, NULL, 0);
5096 	if (ret < 0)
5097 		goto out_register_late_subsys;
5098 
5099 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5100 				   inet6_rtm_delroute, NULL, 0);
5101 	if (ret < 0)
5102 		goto out_register_late_subsys;
5103 
5104 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5105 				   inet6_rtm_getroute, NULL,
5106 				   RTNL_FLAG_DOIT_UNLOCKED);
5107 	if (ret < 0)
5108 		goto out_register_late_subsys;
5109 
5110 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5111 	if (ret)
5112 		goto out_register_late_subsys;
5113 
5114 	for_each_possible_cpu(cpu) {
5115 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5116 
5117 		INIT_LIST_HEAD(&ul->head);
5118 		spin_lock_init(&ul->lock);
5119 	}
5120 
5121 out:
5122 	return ret;
5123 
5124 out_register_late_subsys:
5125 	rtnl_unregister_all(PF_INET6);
5126 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5127 fib6_rules_init:
5128 	fib6_rules_cleanup();
5129 xfrm6_init:
5130 	xfrm6_fini();
5131 out_fib6_init:
5132 	fib6_gc_cleanup();
5133 out_register_subsys:
5134 	unregister_pernet_subsys(&ip6_route_net_ops);
5135 out_register_inetpeer:
5136 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5137 out_dst_entries:
5138 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5139 out_kmem_cache:
5140 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5141 	goto out;
5142 }
5143 
5144 void ip6_route_cleanup(void)
5145 {
5146 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5147 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5148 	fib6_rules_cleanup();
5149 	xfrm6_fini();
5150 	fib6_gc_cleanup();
5151 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5152 	unregister_pernet_subsys(&ip6_route_net_ops);
5153 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5154 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5155 }
5156