xref: /linux/net/ipv6/route.c (revision 064223c1231ce508efaded6576ffdb07de9307b5)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67 
68 #include <linux/uaccess.h>
69 
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73 
/* Neighbour-reachability verdicts used when scoring candidate routes
 * (see rt6_check_neigh()/rt6_score_route()).  Negative values are
 * failures; FAIL_DO_RR additionally triggers round-robin rotation of
 * the default router list.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

/* Per-cpu list of rt6_info entries that are not linked into the FIB
 * tree; populated by rt6_uncached_list_add() and walked when a device
 * goes away (rt6_uncached_list_flush_dev()).
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
128 
/* Link @rt onto this cpu's uncached list and remember which list it
 * landed on, so rt6_uncached_list_del() can find the right lock later
 * even if it runs on a different cpu.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
139 
/* Unlink @rt from the uncached list it was added to, if any, and drop
 * the per-netns uncached-route counter under the same lock.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	/* empty list_head means the route was never added */
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
152 
/* Retarget every uncached route that still references @dev at the
 * netns loopback device so @dev can be unregistered.  Both the idev
 * and the dst device reference are moved, each with proper refcount
 * hand-over under the per-cpu list lock.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* loopback never goes away; nothing to migrate */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
184 
185 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
186 					     struct sk_buff *skb,
187 					     const void *daddr)
188 {
189 	if (!ipv6_addr_any(p))
190 		return (const void *) p;
191 	else if (skb)
192 		return &ipv6_hdr(skb)->daddr;
193 	return daddr;
194 }
195 
196 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
197 				   struct net_device *dev,
198 				   struct sk_buff *skb,
199 				   const void *daddr)
200 {
201 	struct neighbour *n;
202 
203 	daddr = choose_neigh_daddr(gw, skb, daddr);
204 	n = __ipv6_neigh_lookup(dev, daddr);
205 	if (n)
206 		return n;
207 	return neigh_create(&nd_tbl, daddr, dev);
208 }
209 
210 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
211 					      struct sk_buff *skb,
212 					      const void *daddr)
213 {
214 	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
215 
216 	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
217 }
218 
/* dst_ops->confirm_neigh hook: mark the neighbour entry for the
 * route's next hop as recently confirmed.  Skipped for devices that do
 * no neighbour discovery and for multicast destinations.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
233 
/* dst_ops for regular IPv6 routes; copied into each netns at init.
 * These callbacks are the glue between the generic dst cache and the
 * IPv6 routing code in this file.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
252 
253 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
254 {
255 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
256 
257 	return mtu ? : dst->dev->mtu;
258 }
259 
/* Blackholed dsts deliberately ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

/* Blackholed dsts deliberately ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
269 
/* dst_ops for blackholed route copies (see the *_blackhole stubs above):
 * no gc, no pmtu/redirect handling.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};

/* Default metrics template for the per-netns special routes. */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

/* Template for the per-netns fib6_null_entry: a permanent REJECT
 * entry used as the "no route" sentinel in the FIB tree.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

/* Template for the per-netns ip6_null_entry dst: drops packets with
 * ENETUNREACH on both input and output.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
306 
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Template for the "prohibit" sentinel dst used by policy routing:
 * rejects packets with EACCES.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

/* Template for the "blackhole" sentinel dst used by policy routing:
 * silently discards packets (error -EINVAL).
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
334 
/* Zero the rt6_info-specific tail of a freshly allocated entry.  The
 * embedded dst_entry at the front was already initialized by
 * dst_alloc(), hence the "dst + 1" start offset.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
342 
/* allocate dst with ip6_dst_ops */

/* Allocate and zero-initialize an rt6_info for @net/@dev with the
 * given DST_* @flags; bumps the netns allocation counter on success.
 * Returns NULL on allocation failure.
 */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
358 
/* dst_ops->destroy hook: release everything an rt6_info holds — its
 * metrics, uncached-list membership, idev reference and the reference
 * on the fib6_info it was cloned from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* detach rt->from under RCU before dropping its reference */
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
380 
/* dst_ops->ifdown hook: when the route's device goes down, repoint the
 * cached idev at the netns loopback device so the original idev can be
 * released.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
397 
398 static bool __rt6_check_expired(const struct rt6_info *rt)
399 {
400 	if (rt->rt6i_flags & RTF_EXPIRES)
401 		return time_after(jiffies, rt->dst.expires);
402 	else
403 		return false;
404 }
405 
/* Decide whether @rt is stale: either its own RTF_EXPIRES timer fired,
 * or the fib6_info it was cloned from expired / the dst was marked
 * obsolete.  NOTE(review): rcu_dereference() here assumes the caller
 * holds rcu_read_lock — confirm at call sites.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
421 
/* Hash-threshold ECMP selection: pick @match or one of its siblings
 * whose nh_upper_bound region covers the flow hash.  Falls back to
 * @match when the covering sibling scores as unusable.
 */
static struct fib6_info *rt6_multipath_select(const struct net *net,
					      struct fib6_info *match,
					     struct flowi6 *fl6, int oif,
					     const struct sk_buff *skb,
					     int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* covering sibling is unusable: keep the original match */
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
454 
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

/* Walk the leaf chain starting at @rt and return the first entry that
 * is not RTNH_F_DEAD and matches the output interface @oif (or, when
 * no @oif is given, whose device owns @saddr).  Returns the per-netns
 * fib6_null_entry when a strict interface match was required but not
 * found, or when the fallback @rt itself is dead.
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct fib6_info *sprt;

	/* fast path: nothing to constrain the match on */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
492 
#ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-probe context: target address and device, with a device
 * reference held until the work runs.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

/* Workqueue handler for rt6_probe(): send a neighbour solicitation to
 * the solicited-node multicast address of the probe target, then drop
 * the device reference taken when the work was queued.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
511 
/* Queue a deferred neighbour-solicitation probe towards @rt's gateway
 * when the neighbour entry is missing or no longer NUD_VALID, rate
 * limited by the per-device rtr_probe_interval.
 */
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* NOTE(review): idev is dereferenced below without a NULL
		 * check — assumes the device still has inet6 state; confirm.
		 */
		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);		/* released in rt6_probe_deferred() */
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
/* Probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif
571 
572 /*
573  * Default Router Selection (RFC 2461 6.3.6)
574  */
575 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
576 {
577 	const struct net_device *dev = rt->fib6_nh.nh_dev;
578 
579 	if (!oif || dev->ifindex == oif)
580 		return 2;
581 	return 0;
582 }
583 
/* Classify the reachability of @rt's next hop for router selection.
 * Routes without a gateway always succeed.  With ROUTER_PREF enabled,
 * only NUD_FAILED neighbours fail (as a probe request); without it, a
 * missing neighbour asks for round-robin fallback.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
615 
/* Combine device match, router preference and (under F_REACHABLE)
 * neighbour state into a comparable score for @rt; negative values are
 * RT6_NUD_* failure codes.
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* fold the RA router-preference bits in above the device bits */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
633 
/* called with rcu_read_lock held */

/* True when @f6i's device is configured to ignore routes whose link is
 * down (per-device ignore_routes_with_linkdown sysctl).
 */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		/* NOTE(review): idev is dereferenced without a NULL check —
		 * assumes the device still has inet6 state; confirm.
		 */
		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}
648 
/* Compare @rt against the current best candidate @match, updating
 * *@mpri (best score so far) and *@do_rr (whether round-robin rotation
 * is requested).  Dead, link-down-ignored and expired routes are
 * skipped.  Returns the better of @rt and @match.
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
687 
/* Scan the leaf chain for the best route with @metric, starting at the
 * round-robin head @rr_head and wrapping around via @leaf.  Routes of a
 * different metric end each pass; if nothing matched at @metric, the
 * remaining (higher-metric) routes in @cont are tried as a fallback.
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* first pass: rr_head to the end of the metric group */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* second pass: leaf up to (but excluding) rr_head */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* fallback: routes beyond the primary metric group */
	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
726 
/* Select the best route under node @fn for (@oif, @strict), honouring
 * the node's round-robin pointer and rotating it when the scoring
 * requested so.  Returns fib6_null_entry when nothing usable matches.
 * Called under rcu_read_lock.
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
776 
777 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
778 {
779 	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
780 }
781 
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option received in a Router
 * Advertisement from @gwaddr on @dev: validate the option, then add,
 * refresh or delete the corresponding route depending on the
 * advertised lifetime.  Returns 0 on success or -EINVAL on a malformed
 * option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* a zero-length prefix advertises a default router */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
857 
/*
 *	Misc support functions
 */

/* called with rcu_lock held */

/* Return the device a dst clone of @rt should use; local/anycast
 * routes are redirected at the l3mdev master or loopback as explained
 * inline.
 */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
884 
/* Map each RTN_* route type to the dst error code reported for it;
 * forwarding types map to 0 (no error).
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

/* Translate a fib6_type into the dst error via the table above. */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
904 
905 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
906 {
907 	unsigned short flags = 0;
908 
909 	if (rt->dst_nocount)
910 		flags |= DST_NOCOUNT;
911 	if (rt->dst_nopolicy)
912 		flags |= DST_NOPOLICY;
913 	if (rt->dst_host)
914 		flags |= DST_HOST;
915 
916 	return flags;
917 }
918 
/* Configure the dst of a REJECT route copy: set the error code from
 * the fib6_type and pick the matching discard/prohibit handlers.
 */
static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}
940 
/* Initialize the dst portion of @rt from @ort: flags, error, input and
 * output handlers (local delivery, multicast, forwarding or reject)
 * and any lightweight tunnel state.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
968 
/* Bind clone @rt to its origin @from: take a reference on @from,
 * publish it via rt->from, and share @from's metrics (with an extra
 * metrics refcount unless they are the shared defaults).
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}
980 
/* Populate a freshly allocated rt6_info @rt from fib6_info @ort:
 * dst handlers, addresses, gateway, flags, idev/lwtstate references
 * and the back-pointer set up by rt6_set_from().
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}
998 
/* Walk back up the FIB tree from @fn looking for the next node that
 * carries route info, descending into source-address subtrees along
 * the way.  Returns NULL once the table root is reached.  Called under
 * rcu_read_lock.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1016 
1017 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1018 			  bool null_fallback)
1019 {
1020 	struct rt6_info *rt = *prt;
1021 
1022 	if (dst_hold_safe(&rt->dst))
1023 		return true;
1024 	if (null_fallback) {
1025 		rt = net->ipv6.ip6_null_entry;
1026 		dst_hold(&rt->dst);
1027 	} else {
1028 		rt = NULL;
1029 	}
1030 	*prt = rt;
1031 	return false;
1032 }
1033 
/* called with rcu_lock held */

/* Clone fib6_info @rt into a standalone rt6_info dst, propagating the
 * DST_* flags derived from @rt.  Returns NULL on allocation failure.
 */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);

	return nrt;
}
1047 
/* Policy-routing lookup for one table: find the best fib6_info for
 * @fl6 (with device match, multipath selection and backtracking), then
 * return a held rt6_info — a cached exception entry if one exists,
 * otherwise a fresh clone, falling back to ip6_null_entry on failure.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = rt6_multipath_select(net, f6i, fl6,
						   fl6->flowi6_oif, skb, flags);
	}
	/* nothing at this node: climb the tree and retry */
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}
1102 
/* Public lookup entry point: run ip6_pol_route_lookup() through the
 * policy-routing rule machinery.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1109 
/* Convenience lookup by (@daddr, @saddr, @oif): builds the flow key,
 * runs the rule-based lookup and returns a held rt6_info, or NULL when
 * the resulting dst carries an error.
 */
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
1135 
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */
1141 
1142 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1143 			struct netlink_ext_ack *extack)
1144 {
1145 	int err;
1146 	struct fib6_table *table;
1147 
1148 	table = rt->fib6_table;
1149 	spin_lock_bh(&table->tb6_lock);
1150 	err = fib6_add(&table->tb6_root, rt, info, extack);
1151 	spin_unlock_bh(&table->tb6_lock);
1152 
1153 	return err;
1154 }
1155 
1156 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1157 {
1158 	struct nl_info info = {	.nl_net = net, };
1159 
1160 	return __ip6_ins_rt(rt, &info, NULL);
1161 }
1162 
/* Allocate a destination-specific (/128) RTF_CACHE clone of @ort for
 * @daddr (and, with subtrees, @saddr).  Such clones carry per-peer
 * state such as PMTU or redirect results.  Returns the clone with a
 * reference held, or NULL on allocation failure.
 *
 * NOTE(review): ip6_rt_get_dev_rcu() is called here without an explicit
 * rcu_read_lock(), unlike in ip6_rt_pcpu_alloc(); presumably all
 * callers already hold the RCU read lock — confirm.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	/* the clone is keyed to the full destination address */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* a daddr equal to a non-/128 prefix address is the
		 * subnet-router anycast address
		 */
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1199 
/* Allocate a per-cpu dst-layer copy (rt6_info) of fib6_info @rt.
 * Returns NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	/* RCU protects the nexthop device pointer across the allocation */
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1216 
/* Return this cpu's cached percpu route for @rt with a reference
 * taken, or NULL if none has been created yet (or the existing one
 * could not be safely held).
 * It should be called with rcu_read_lock() acquired.
 */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	/* take a reference unless the entry is already being released */
	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}
1230 
/* Create and install this cpu's percpu copy of @rt.  Returns the new
 * entry (or ip6_null_entry on allocation failure) with a reference
 * held.
 * NOTE(review): relies on callers running with BHs disabled after a
 * NULL rt6_get_pcpu_route() (see ip6_pol_route()), so no concurrent
 * installer can exist on this cpu — hence the BUG_ON(prev).
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		/* fall back to a held null entry on OOM */
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
1249 
/* exception hash table implementation
 *
 * A single global spinlock serializes all writers of every fib6_info's
 * rt6i_exception_bucket table; readers walk the chains under RCU.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1253 
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	/* drop the exception table's reference on the cached route */
	dst_release(&rt6_ex->rt6i->dst);
	/* RCU readers may still be walking the chain */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}
1273 
1274 /* Remove oldest rt6_ex in bucket and free the memory
1275  * Caller must hold rt6_exception_lock
1276  */
1277 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1278 {
1279 	struct rt6_exception *rt6_ex, *oldest = NULL;
1280 
1281 	if (!bucket)
1282 		return;
1283 
1284 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1285 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1286 			oldest = rt6_ex;
1287 	}
1288 	rt6_remove_exception(bucket, oldest);
1289 }
1290 
/* Hash (dst[, src]) into an exception bucket index in
 * [0, FIB6_EXCEPTION_BUCKET_SIZE).  The jhash seed is generated once,
 * lazily, on first use.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	/* src participates only when subtrees are compiled in */
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1306 
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 * Returns the matching entry or NULL.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* advance caller's pointer from the bucket array base to the
	 * bucket selected by the hash
	 */
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1339 
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 * Returns the matching entry or NULL.
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* advance caller's pointer from the bucket array base to the
	 * bucket selected by the hash
	 */
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1374 
1375 static unsigned int fib6_mtu(const struct fib6_info *rt)
1376 {
1377 	unsigned int mtu;
1378 
1379 	if (rt->fib6_pmtu) {
1380 		mtu = rt->fib6_pmtu;
1381 	} else {
1382 		struct net_device *dev = fib6_info_nh_dev(rt);
1383 		struct inet6_dev *idev;
1384 
1385 		rcu_read_lock();
1386 		idev = __in6_dev_get(dev);
1387 		mtu = idev->cnf.mtu6;
1388 		rcu_read_unlock();
1389 	}
1390 
1391 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1392 
1393 	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1394 }
1395 
/* Insert cache route @nrt as an exception hanging off its parent
 * fib6_info @ort, allocating the bucket table on first use and
 * replacing any existing entry for the same (daddr[, saddr]) key.
 * Returns 0 on success or a negative errno (-EINVAL if the parent's
 * exceptions were flushed or the mtu check fails, -ENOMEM on
 * allocation failure).
 * NOTE(review): on success the exception table keeps the reference on
 * @nrt (dropped later via rt6_remove_exception()); callers appear
 * responsible for @nrt on failure — confirm.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* the parent is going away; see rt6_flush_exceptions() */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any existing entry for the same key */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* keep the chain bounded: evict the LRU entry when too deep */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1480 
/* Remove and free every exception entry hanging off @rt, and mark the
 * fib6_info so rt6_insert_exception() cannot recreate the bucket table
 * afterwards (exception_bucket_flushed stays set).
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		/* rt6_remove_exception() decrements depth per entry */
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1507 
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 * Returns the cached route (no reference taken) or NULL if there is
 * none or the cached route has expired.
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1539 
/* Remove the passed in cached rt from the hash table that contains it.
 * Returns 0 on success, -EINVAL if @rt is not a cache route or has no
 * parent, -ENOENT if no matching exception entry exists.
 *
 * NOTE(review): rt->from is read below via rcu_dereference_protected()
 * annotated with lockdep_is_held(&rt6_exception_lock), but the lock is
 * only taken further down — the annotation looks inconsistent;
 * presumably callers hold rcu_read_lock() at this point instead.
 * Confirm and fix the annotation separately.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference_protected(rt->from,
					 lockdep_is_held(&rt6_exception_lock));
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1584 
1585 /* Find rt6_ex which contains the passed in rt cache and
1586  * refresh its stamp
1587  */
1588 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1589 {
1590 	struct rt6_exception_bucket *bucket;
1591 	struct fib6_info *from = rt->from;
1592 	struct in6_addr *src_key = NULL;
1593 	struct rt6_exception *rt6_ex;
1594 
1595 	if (!from ||
1596 	    !(rt->rt6i_flags & RTF_CACHE))
1597 		return;
1598 
1599 	rcu_read_lock();
1600 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1601 
1602 #ifdef CONFIG_IPV6_SUBTREES
1603 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1604 	 * and exception table is indexed by a hash of
1605 	 * both rt6i_dst and rt6i_src.
1606 	 * Otherwise, the exception table is indexed by
1607 	 * a hash of only rt6i_dst.
1608 	 */
1609 	if (from->fib6_src.plen)
1610 		src_key = &rt->rt6i_src.addr;
1611 #endif
1612 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1613 					  &rt->rt6i_dst.addr,
1614 					  src_key);
1615 	if (rt6_ex)
1616 		rt6_ex->stamp = jiffies;
1617 
1618 	rcu_read_unlock();
1619 }
1620 
/* Reset rt6i_prefsrc on every cached exception of @rt; used when the
 * parent route's preferred source address is removed (the exceptions
 * copied it from the parent in rt6_insert_exception()).
 * Caller must hold rt6_exception_lock (enforced by the
 * rcu_dereference_protected() annotation).
 */
static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				/* plen == 0 marks the prefsrc as unset */
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}
1639 
1640 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1641 					 struct rt6_info *rt, int mtu)
1642 {
1643 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1644 	 * lowest MTU in the path: always allow updating the route PMTU to
1645 	 * reflect PMTU decreases.
1646 	 *
1647 	 * If the new MTU is higher, and the route PMTU is equal to the local
1648 	 * MTU, this means the old MTU is the lowest in the path, so allow
1649 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1650 	 * handle this.
1651 	 */
1652 
1653 	if (dst_mtu(&rt->dst) >= mtu)
1654 		return true;
1655 
1656 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1657 		return true;
1658 
1659 	return false;
1660 }
1661 
/* Propagate a device MTU change to the cached exceptions of @rt, where
 * permitted by rt6_mtu_change_route_allowed().
 * Caller must hold rt6_exception_lock (enforced by the
 * rcu_dereference_protected() annotation).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1690 
1691 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1692 
/* Remove every RTF_CACHE|RTF_GATEWAY exception of @rt whose gateway
 * equals @gateway.
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* lock-free early exit when no exceptions exist */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1727 
/* Decide the fate of one exception entry during garbage collection:
 * remove it when aged out, expired, or when its gateway neighbour is
 * no longer known to be a router; otherwise bump gc_args->more so the
 * GC keeps running.
 * Caller must hold rt6_exception_lock and rcu_read_lock_bh() (see
 * rt6_age_exceptions(); the RCU lock covers the noref neighbour
 * lookup).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	/* entry survives this pass: tell the GC there is more to do */
	gc_args->more++;
}
1771 
/* Walk all exception buckets of @rt and garbage-collect entries via
 * rt6_age_examine_exception().  Takes rcu_read_lock_bh() for the noref
 * neighbour lookup and rt6_exception_lock for chain mutation.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* lock-free early exit when no exceptions exist */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1802 
/* Core policy-routing lookup shared by the input and output paths.
 *
 * Looks up @fl6 in @table (preferring reachable routers when
 * forwarding is disabled, with a fallback pass that also considers
 * unreachable routes), then returns, in order of preference: a cached
 * exception route, an uncached RTF_CACHE clone (for the KNOWN_NH
 * special case), or this cpu's percpu copy of the fib entry.  The
 * returned rt6_info always carries a reference.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	/* remember the starting node for the REACHABLE fallback pass */
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i->fib6_nsiblings)
		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* BHs stay disabled so rt6_make_pcpu_route() cannot race
		 * with another installer on this cpu
		 */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1903 
1904 static struct rt6_info *ip6_pol_route_input(struct net *net,
1905 					    struct fib6_table *table,
1906 					    struct flowi6 *fl6,
1907 					    const struct sk_buff *skb,
1908 					    int flags)
1909 {
1910 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1911 }
1912 
1913 struct dst_entry *ip6_route_input_lookup(struct net *net,
1914 					 struct net_device *dev,
1915 					 struct flowi6 *fl6,
1916 					 const struct sk_buff *skb,
1917 					 int flags)
1918 {
1919 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1920 		flags |= RT6_LOOKUP_F_IFACE;
1921 
1922 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1923 }
1924 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1925 
/* Extract the L3 multipath hash keys from @skb into @keys.
 * For ICMPv6 error messages the keys are taken from the embedded
 * (inner) offending header so the error follows the same path as the
 * original flow; pre-dissected @flkeys are then ignored because they
 * describe the outer header.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	/* only ICMPv6 *errors* embed the offending packet */
	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowinfo(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
1968 
1969 /* if skb is set it will be used and fl6 can be NULL */
1970 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1971 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1972 {
1973 	struct flow_keys hash_keys;
1974 	u32 mhash;
1975 
1976 	switch (ip6_multipath_hash_policy(net)) {
1977 	case 0:
1978 		memset(&hash_keys, 0, sizeof(hash_keys));
1979 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1980 		if (skb) {
1981 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1982 		} else {
1983 			hash_keys.addrs.v6addrs.src = fl6->saddr;
1984 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
1985 			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
1986 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
1987 		}
1988 		break;
1989 	case 1:
1990 		if (skb) {
1991 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1992 			struct flow_keys keys;
1993 
1994 			/* short-circuit if we already have L4 hash present */
1995 			if (skb->l4_hash)
1996 				return skb_get_hash_raw(skb) >> 1;
1997 
1998 			memset(&hash_keys, 0, sizeof(hash_keys));
1999 
2000                         if (!flkeys) {
2001 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2002 				flkeys = &keys;
2003 			}
2004 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2005 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2006 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2007 			hash_keys.ports.src = flkeys->ports.src;
2008 			hash_keys.ports.dst = flkeys->ports.dst;
2009 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2010 		} else {
2011 			memset(&hash_keys, 0, sizeof(hash_keys));
2012 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2013 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2014 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2015 			hash_keys.ports.src = fl6->fl6_sport;
2016 			hash_keys.ports.dst = fl6->fl6_dport;
2017 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2018 		}
2019 		break;
2020 	}
2021 	mhash = flow_hash_from_keys(&hash_keys);
2022 
2023 	return mhash >> 1;
2024 }
2025 
/* Route an incoming skb: build a flowi6 from its IPv6 header (plus any
 * collected tunnel metadata), compute the multipath hash for ICMPv6 so
 * errors follow the original flow, and attach the looked-up dst to the
 * skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* carry collect_md tunnel metadata into the flow key */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* precompute the ECMP hash so ICMPv6 errors pick the same
	 * sibling as the offending flow
	 */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2055 
2056 static struct rt6_info *ip6_pol_route_output(struct net *net,
2057 					     struct fib6_table *table,
2058 					     struct flowi6 *fl6,
2059 					     const struct sk_buff *skb,
2060 					     int flags)
2061 {
2062 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2063 }
2064 
2065 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2066 					 struct flowi6 *fl6, int flags)
2067 {
2068 	bool any_src;
2069 
2070 	if (rt6_need_strict(&fl6->daddr)) {
2071 		struct dst_entry *dst;
2072 
2073 		dst = l3mdev_link_scope_lookup(net, fl6);
2074 		if (dst)
2075 			return dst;
2076 	}
2077 
2078 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2079 
2080 	any_src = ipv6_addr_any(&fl6->saddr);
2081 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2082 	    (fl6->flowi6_oif && any_src))
2083 		flags |= RT6_LOOKUP_F_IFACE;
2084 
2085 	if (!any_src)
2086 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2087 	else if (sk)
2088 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2089 
2090 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2091 }
2092 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2093 
/* Replace @dst_orig with a blackhole dst that discards all traffic but
 * preserves the original route's metrics, gateway, flags and keys.
 * Consumes the caller's reference on @dst_orig.  Returns the new dst,
 * or ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* all traffic through this dst is silently dropped */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2126 
2127 /*
2128  *	Destination cache support functions
2129  */
2130 
2131 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2132 {
2133 	u32 rt_cookie = 0;
2134 
2135 	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2136 		return false;
2137 
2138 	if (fib6_check_expired(f6i))
2139 		return false;
2140 
2141 	return true;
2142 }
2143 
2144 static struct dst_entry *rt6_check(struct rt6_info *rt,
2145 				   struct fib6_info *from,
2146 				   u32 cookie)
2147 {
2148 	u32 rt_cookie = 0;
2149 
2150 	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2151 	    rt_cookie != cookie)
2152 		return NULL;
2153 
2154 	if (rt6_check_expired(rt))
2155 		return NULL;
2156 
2157 	return &rt->dst;
2158 }
2159 
2160 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2161 					    struct fib6_info *from,
2162 					    u32 cookie)
2163 {
2164 	if (!__rt6_check_expired(rt) &&
2165 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2166 	    fib6_check(from, cookie))
2167 		return &rt->dst;
2168 	else
2169 		return NULL;
2170 }
2171 
/* dst_ops->check hook: decide whether a cached dst is still usable.
 * Returns the dst itself when valid, NULL to force the caller to
 * perform a fresh route lookup.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	/* pcpu routes and routes on the uncached list are validated
	 * against their originating fib6_info; all others take the
	 * plain cookie/expiry check.
	 */
	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2199 
/* dst_ops->negative_advice hook: a user of this dst reported trouble.
 * Returns the dst to keep using, or NULL after dropping it so the
 * caller performs a new lookup.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* cached exception route: only unlink it once it
			 * has actually expired
			 */
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			/* non-cached route: drop the caller's reference and
			 * force a fresh lookup
			 */
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2219 
/* dst_ops->link_failure hook: the neighbour for skb's route is dead.
 * Notify the sender and invalidate the offending route.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			/* exception route: drop it from the exception table */
			if (dst_hold_safe(&rt->dst))
				rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			/* For a default route, poison the fib node's sernum
			 * so cached dsts fail validation and lookups can
			 * pick a different router.
			 */
			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2246 
2247 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2248 {
2249 	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2250 		struct fib6_info *from;
2251 
2252 		rcu_read_lock();
2253 		from = rcu_dereference(rt0->from);
2254 		if (from)
2255 			rt0->dst.expires = from->expires;
2256 		rcu_read_unlock();
2257 	}
2258 
2259 	dst_set_expires(&rt0->dst, timeout);
2260 	rt0->rt6i_flags |= RTF_EXPIRES;
2261 }
2262 
2263 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2264 {
2265 	struct net *net = dev_net(rt->dst.dev);
2266 
2267 	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2268 	rt->rt6i_flags |= RTF_MODIFIED;
2269 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2270 }
2271 
2272 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2273 {
2274 	bool from_set;
2275 
2276 	rcu_read_lock();
2277 	from_set = !!rcu_dereference(rt->from);
2278 	rcu_read_unlock();
2279 
2280 	return !(rt->rt6i_flags & RTF_CACHE) &&
2281 		(rt->rt6i_flags & RTF_PCPU || from_set);
2282 }
2283 
2284 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2285 				 const struct ipv6hdr *iph, u32 mtu)
2286 {
2287 	const struct in6_addr *daddr, *saddr;
2288 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2289 
2290 	if (rt6->rt6i_flags & RTF_LOCAL)
2291 		return;
2292 
2293 	if (dst_metric_locked(dst, RTAX_MTU))
2294 		return;
2295 
2296 	if (iph) {
2297 		daddr = &iph->daddr;
2298 		saddr = &iph->saddr;
2299 	} else if (sk) {
2300 		daddr = &sk->sk_v6_daddr;
2301 		saddr = &inet6_sk(sk)->saddr;
2302 	} else {
2303 		daddr = NULL;
2304 		saddr = NULL;
2305 	}
2306 	dst_confirm_neigh(dst, daddr);
2307 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2308 	if (mtu >= dst_mtu(dst))
2309 		return;
2310 
2311 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2312 		rt6_do_update_pmtu(rt6, mtu);
2313 		/* update rt6_ex->stamp for cache */
2314 		if (rt6->rt6i_flags & RTF_CACHE)
2315 			rt6_update_exception_stamp_rt(rt6);
2316 	} else if (daddr) {
2317 		struct fib6_info *from;
2318 		struct rt6_info *nrt6;
2319 
2320 		rcu_read_lock();
2321 		from = rcu_dereference(rt6->from);
2322 		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2323 		if (nrt6) {
2324 			rt6_do_update_pmtu(nrt6, mtu);
2325 			if (rt6_insert_exception(nrt6, from))
2326 				dst_release_immediate(&nrt6->dst);
2327 		}
2328 		rcu_read_unlock();
2329 	}
2330 }
2331 
2332 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2333 			       struct sk_buff *skb, u32 mtu)
2334 {
2335 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2336 }
2337 
2338 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2339 		     int oif, u32 mark, kuid_t uid)
2340 {
2341 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2342 	struct dst_entry *dst;
2343 	struct flowi6 fl6;
2344 
2345 	memset(&fl6, 0, sizeof(fl6));
2346 	fl6.flowi6_oif = oif;
2347 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2348 	fl6.daddr = iph->daddr;
2349 	fl6.saddr = iph->saddr;
2350 	fl6.flowlabel = ip6_flowinfo(iph);
2351 	fl6.flowi6_uid = uid;
2352 
2353 	dst = ip6_route_output(net, NULL, &fl6);
2354 	if (!dst->error)
2355 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2356 	dst_release(dst);
2357 }
2358 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2359 
2360 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2361 {
2362 	struct dst_entry *dst;
2363 
2364 	ip6_update_pmtu(skb, sock_net(sk), mtu,
2365 			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2366 
2367 	dst = __sk_dst_get(sk);
2368 	if (!dst || !dst->obsolete ||
2369 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2370 		return;
2371 
2372 	bh_lock_sock(sk);
2373 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2374 		ip6_datagram_dst_update(sk, false);
2375 	bh_unlock_sock(sk);
2376 }
2377 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2378 
/* Cache @dst on @sk.  The address arguments passed down to
 * ip6_dst_store() are the socket addresses the route was resolved for,
 * supplied only when they match the flow's addresses (the source
 * address only matters when source-routing subtrees are compiled in).
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2395 
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* must stay first: lookups cast flowi6 <-> ip6rd_flowi */
	struct in6_addr gateway;	/* source of the received Redirect message */
};
2401 
/* Table-lookup callback used when processing a received Redirect:
 * find the route whose nexthop matches the router that sent the
 * redirect (rdfl->gateway).  Returns a held rt6_info (possibly the
 * null entry) for rt6_do_redirect() to act on.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	/* rt is the cursor set by for_each_fib6_node_rt_rcu() */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* no match in this node: retry from a less specific prefix */
	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		dst_hold(&ret->dst);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, ret, table, fl6);
	return ret;
};
2481 
2482 static struct dst_entry *ip6_route_redirect(struct net *net,
2483 					    const struct flowi6 *fl6,
2484 					    const struct sk_buff *skb,
2485 					    const struct in6_addr *gateway)
2486 {
2487 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2488 	struct ip6rd_flowi rdfl;
2489 
2490 	rdfl.fl6 = *fl6;
2491 	rdfl.gateway = *gateway;
2492 
2493 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2494 				flags, __ip6_route_redirect);
2495 }
2496 
2497 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2498 		  kuid_t uid)
2499 {
2500 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2501 	struct dst_entry *dst;
2502 	struct flowi6 fl6;
2503 
2504 	memset(&fl6, 0, sizeof(fl6));
2505 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2506 	fl6.flowi6_oif = oif;
2507 	fl6.flowi6_mark = mark;
2508 	fl6.daddr = iph->daddr;
2509 	fl6.saddr = iph->saddr;
2510 	fl6.flowlabel = ip6_flowinfo(iph);
2511 	fl6.flowi6_uid = uid;
2512 
2513 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2514 	rt6_do_redirect(dst, NULL, skb);
2515 	dst_release(dst);
2516 }
2517 EXPORT_SYMBOL_GPL(ip6_redirect);
2518 
2519 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2520 			    u32 mark)
2521 {
2522 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2523 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2524 	struct dst_entry *dst;
2525 	struct flowi6 fl6;
2526 
2527 	memset(&fl6, 0, sizeof(fl6));
2528 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2529 	fl6.flowi6_oif = oif;
2530 	fl6.flowi6_mark = mark;
2531 	fl6.daddr = msg->dest;
2532 	fl6.saddr = iph->daddr;
2533 	fl6.flowi6_uid = sock_net_uid(net, NULL);
2534 
2535 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2536 	rt6_do_redirect(dst, NULL, skb);
2537 	dst_release(dst);
2538 }
2539 
2540 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2541 {
2542 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2543 		     sk->sk_uid);
2544 }
2545 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2546 
2547 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2548 {
2549 	struct net_device *dev = dst->dev;
2550 	unsigned int mtu = dst_mtu(dst);
2551 	struct net *net = dev_net(dev);
2552 
2553 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2554 
2555 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2556 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2557 
2558 	/*
2559 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2560 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2561 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2562 	 * rely only on pmtu discovery"
2563 	 */
2564 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2565 		mtu = IPV6_MAXPLEN;
2566 	return mtu;
2567 }
2568 
2569 static unsigned int ip6_mtu(const struct dst_entry *dst)
2570 {
2571 	struct inet6_dev *idev;
2572 	unsigned int mtu;
2573 
2574 	mtu = dst_metric_raw(dst, RTAX_MTU);
2575 	if (mtu)
2576 		goto out;
2577 
2578 	mtu = IPV6_MIN_MTU;
2579 
2580 	rcu_read_lock();
2581 	idev = __in6_dev_get(dst->dev);
2582 	if (idev)
2583 		mtu = idev->cnf.mtu6;
2584 	rcu_read_unlock();
2585 
2586 out:
2587 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2588 
2589 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2590 }
2591 
/* Build a one-off host route for transmitting an ICMPv6/ND packet to
 * fl6->daddr via @dev.  The route is not inserted into the FIB; it is
 * tracked on the uncached list so device teardown can release it.
 * Returns a dst (after xfrm policy lookup) or an ERR_PTR.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;	/* rt takes over the idev reference */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2630 
/* dst_ops->gc hook: garbage-collect routing entries when the dst cache
 * grows too large.  Returns nonzero when the cache is still over the
 * ip6_rt_max_size limit after collection.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	/* skip if we collected recently and are still under the limit */
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* ip6_rt_gc_expire grows with each pass, making successive GC
	 * rounds more aggressive; it is reset once we get back under
	 * the gc threshold
	 */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* exponential decay of the aggressiveness, rate set by sysctl */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2655 
2656 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2657 			       struct fib6_config *cfg)
2658 {
2659 	struct dst_metrics *p;
2660 
2661 	if (!cfg->fc_mx)
2662 		return 0;
2663 
2664 	p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2665 	if (unlikely(!p))
2666 		return -ENOMEM;
2667 
2668 	refcount_set(&p->refcnt, 1);
2669 	rt->fib6_metrics = p;
2670 
2671 	return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2672 }
2673 
2674 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2675 					    struct fib6_config *cfg,
2676 					    const struct in6_addr *gw_addr,
2677 					    u32 tbid, int flags)
2678 {
2679 	struct flowi6 fl6 = {
2680 		.flowi6_oif = cfg->fc_ifindex,
2681 		.daddr = *gw_addr,
2682 		.saddr = cfg->fc_prefsrc,
2683 	};
2684 	struct fib6_table *table;
2685 	struct rt6_info *rt;
2686 
2687 	table = fib6_get_table(net, tbid);
2688 	if (!table)
2689 		return NULL;
2690 
2691 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2692 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2693 
2694 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2695 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2696 
2697 	/* if table lookup failed, fall back to full lookup */
2698 	if (rt == net->ipv6.ip6_null_entry) {
2699 		ip6_rt_put(rt);
2700 		rt = NULL;
2701 	}
2702 
2703 	return rt;
2704 }
2705 
2706 static int ip6_route_check_nh_onlink(struct net *net,
2707 				     struct fib6_config *cfg,
2708 				     const struct net_device *dev,
2709 				     struct netlink_ext_ack *extack)
2710 {
2711 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2712 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2713 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2714 	struct rt6_info *grt;
2715 	int err;
2716 
2717 	err = 0;
2718 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2719 	if (grt) {
2720 		if (!grt->dst.error &&
2721 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2722 			NL_SET_ERR_MSG(extack,
2723 				       "Nexthop has invalid gateway or device mismatch");
2724 			err = -EINVAL;
2725 		}
2726 
2727 		ip6_rt_put(grt);
2728 	}
2729 
2730 	return err;
2731 }
2732 
/* Resolve and validate the egress device for a gateway nexthop by
 * looking up the gateway address itself.  When the caller supplied no
 * device, *_dev and *idev are filled in from the matched route (with
 * references held).  Returns 0 when the gateway is directly reachable,
 * -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		/* first try a lookup confined to the table the route is
		 * being added to; discard matches that go via another
		 * gateway or a different device
		 */
		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	/* fall back to a full (policy-rule) lookup */
	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		/* caller-supplied device must match the route's device */
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* adopt device and idev from the matched route */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	/* the gateway itself must not be reached through another gateway */
	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2783 
/* Validate the gateway address of a route being added and resolve or
 * verify the egress device.  *_dev/*idev may be filled in when the
 * caller did not specify a device (see ip6_route_check_nh).  Returns 0
 * on success or a negative errno with an extack message set.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* for link-local gateways the device must be considered in the
	 * local-address check below
	 */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2856 
/* Build a fib6_info from a route configuration (netlink/ioctl).
 * Returns the new, not-yet-inserted fib6_info with a reference held
 * (plus a reference on its nexthop device), or an ERR_PTR on failure.
 * The caller is responsible for inserting it and dropping the reference.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	/* resolve the nexthop device early; references are held until
	 * the success/error paths at the bottom
	 */
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* onlink nexthops skip gateway validation but still need an
	 * explicit, running device
	 */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
		goto out;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* lightweight tunnel encapsulation state, if requested */
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->fib6_nh.nh_dev = dev;
	rt->fib6_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	/* on success rt keeps the dev reference; only idev is dropped */
	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	fib6_info_release(rt);
	return ERR_PTR(err);
}
3079 
3080 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3081 		  struct netlink_ext_ack *extack)
3082 {
3083 	struct fib6_info *rt;
3084 	int err;
3085 
3086 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3087 	if (IS_ERR(rt))
3088 		return PTR_ERR(rt);
3089 
3090 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3091 	fib6_info_release(rt);
3092 
3093 	return err;
3094 }
3095 
3096 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3097 {
3098 	struct net *net = info->nl_net;
3099 	struct fib6_table *table;
3100 	int err;
3101 
3102 	if (rt == net->ipv6.fib6_null_entry) {
3103 		err = -ENOENT;
3104 		goto out;
3105 	}
3106 
3107 	table = rt->fib6_table;
3108 	spin_lock_bh(&table->tb6_lock);
3109 	err = fib6_del(rt, info);
3110 	spin_unlock_bh(&table->tb6_lock);
3111 
3112 out:
3113 	fib6_info_release(rt);
3114 	return err;
3115 }
3116 
3117 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3118 {
3119 	struct nl_info info = { .nl_net = net };
3120 
3121 	return __ip6_del_rt(rt, &info);
3122 }
3123 
/* Delete rt and, when fc_delete_all_nh is set, all of its multipath
 * siblings, emitting a single RTM_DELROUTE notification that covers
 * every hop.  Consumes the caller's reference on rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			/* on fill failure fall back to per-hop notifications
			 * (skip_notify stays clear)
			 */
			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* send the combined notification outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3175 
3176 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3177 {
3178 	int rc = -ESRCH;
3179 
3180 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3181 		goto out;
3182 
3183 	if (cfg->fc_flags & RTF_GATEWAY &&
3184 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3185 		goto out;
3186 	if (dst_hold_safe(&rt->dst))
3187 		rc = rt6_remove_exception_rt(rt);
3188 out:
3189 	return rc;
3190 }
3191 
3192 static int ip6_route_del(struct fib6_config *cfg,
3193 			 struct netlink_ext_ack *extack)
3194 {
3195 	struct rt6_info *rt_cache;
3196 	struct fib6_table *table;
3197 	struct fib6_info *rt;
3198 	struct fib6_node *fn;
3199 	int err = -ESRCH;
3200 
3201 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3202 	if (!table) {
3203 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3204 		return err;
3205 	}
3206 
3207 	rcu_read_lock();
3208 
3209 	fn = fib6_locate(&table->tb6_root,
3210 			 &cfg->fc_dst, cfg->fc_dst_len,
3211 			 &cfg->fc_src, cfg->fc_src_len,
3212 			 !(cfg->fc_flags & RTF_CACHE));
3213 
3214 	if (fn) {
3215 		for_each_fib6_node_rt_rcu(fn) {
3216 			if (cfg->fc_flags & RTF_CACHE) {
3217 				int rc;
3218 
3219 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3220 							      &cfg->fc_src);
3221 				if (rt_cache) {
3222 					rc = ip6_del_cached_rt(rt_cache, cfg);
3223 					if (rc != -ESRCH)
3224 						return rc;
3225 				}
3226 				continue;
3227 			}
3228 			if (cfg->fc_ifindex &&
3229 			    (!rt->fib6_nh.nh_dev ||
3230 			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3231 				continue;
3232 			if (cfg->fc_flags & RTF_GATEWAY &&
3233 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3234 				continue;
3235 			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3236 				continue;
3237 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3238 				continue;
3239 			fib6_info_hold(rt);
3240 			rcu_read_unlock();
3241 
3242 			/* if gateway was specified only delete the one hop */
3243 			if (cfg->fc_flags & RTF_GATEWAY)
3244 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3245 
3246 			return __ip6_del_rt_siblings(rt, cfg);
3247 		}
3248 	}
3249 	rcu_read_unlock();
3250 
3251 	return err;
3252 }
3253 
3254 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3255 {
3256 	struct netevent_redirect netevent;
3257 	struct rt6_info *rt, *nrt = NULL;
3258 	struct ndisc_options ndopts;
3259 	struct inet6_dev *in6_dev;
3260 	struct neighbour *neigh;
3261 	struct fib6_info *from;
3262 	struct rd_msg *msg;
3263 	int optlen, on_link;
3264 	u8 *lladdr;
3265 
3266 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3267 	optlen -= sizeof(*msg);
3268 
3269 	if (optlen < 0) {
3270 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3271 		return;
3272 	}
3273 
3274 	msg = (struct rd_msg *)icmp6_hdr(skb);
3275 
3276 	if (ipv6_addr_is_multicast(&msg->dest)) {
3277 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3278 		return;
3279 	}
3280 
3281 	on_link = 0;
3282 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3283 		on_link = 1;
3284 	} else if (ipv6_addr_type(&msg->target) !=
3285 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3286 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3287 		return;
3288 	}
3289 
3290 	in6_dev = __in6_dev_get(skb->dev);
3291 	if (!in6_dev)
3292 		return;
3293 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3294 		return;
3295 
3296 	/* RFC2461 8.1:
3297 	 *	The IP source address of the Redirect MUST be the same as the current
3298 	 *	first-hop router for the specified ICMP Destination Address.
3299 	 */
3300 
3301 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3302 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3303 		return;
3304 	}
3305 
3306 	lladdr = NULL;
3307 	if (ndopts.nd_opts_tgt_lladdr) {
3308 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3309 					     skb->dev);
3310 		if (!lladdr) {
3311 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3312 			return;
3313 		}
3314 	}
3315 
3316 	rt = (struct rt6_info *) dst;
3317 	if (rt->rt6i_flags & RTF_REJECT) {
3318 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3319 		return;
3320 	}
3321 
3322 	/* Redirect received -> path was valid.
3323 	 * Look, redirects are sent only in response to data packets,
3324 	 * so that this nexthop apparently is reachable. --ANK
3325 	 */
3326 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3327 
3328 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3329 	if (!neigh)
3330 		return;
3331 
3332 	/*
3333 	 *	We have finally decided to accept it.
3334 	 */
3335 
3336 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3337 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3338 		     NEIGH_UPDATE_F_OVERRIDE|
3339 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3340 				     NEIGH_UPDATE_F_ISROUTER)),
3341 		     NDISC_REDIRECT, &ndopts);
3342 
3343 	rcu_read_lock();
3344 	from = rcu_dereference(rt->from);
3345 	fib6_info_hold(from);
3346 	rcu_read_unlock();
3347 
3348 	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3349 	if (!nrt)
3350 		goto out;
3351 
3352 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3353 	if (on_link)
3354 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3355 
3356 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3357 
3358 	/* No need to remove rt from the exception table if rt is
3359 	 * a cached route because rt6_insert_exception() will
3360 	 * takes care of it
3361 	 */
3362 	if (rt6_insert_exception(nrt, from)) {
3363 		dst_release_immediate(&nrt->dst);
3364 		goto out;
3365 	}
3366 
3367 	netevent.old = &rt->dst;
3368 	netevent.new = &nrt->dst;
3369 	netevent.daddr = &msg->dest;
3370 	netevent.neigh = neigh;
3371 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3372 
3373 out:
3374 	fib6_info_release(from);
3375 	neigh_release(neigh);
3376 }
3377 
3378 #ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	/* Look up the RA route-information route for prefix/prefixlen that
	 * goes through gateway gwaddr on dev.  Returns the fib6_info with a
	 * reference held, or NULL if no such route is installed.
	 */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* locate the node for exactly this prefix (last arg: exact match) */
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* the iterator macro assigns successive routes to 'rt'; it ends
	 * NULL when the walk completes without a break
	 */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
		/* take the reference before leaving the RCU section */
		fib6_info_hold(rt);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3413 
3414 static struct fib6_info *rt6_add_route_info(struct net *net,
3415 					   const struct in6_addr *prefix, int prefixlen,
3416 					   const struct in6_addr *gwaddr,
3417 					   struct net_device *dev,
3418 					   unsigned int pref)
3419 {
3420 	struct fib6_config cfg = {
3421 		.fc_metric	= IP6_RT_PRIO_USER,
3422 		.fc_ifindex	= dev->ifindex,
3423 		.fc_dst_len	= prefixlen,
3424 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3425 				  RTF_UP | RTF_PREF(pref),
3426 		.fc_protocol = RTPROT_RA,
3427 		.fc_type = RTN_UNICAST,
3428 		.fc_nlinfo.portid = 0,
3429 		.fc_nlinfo.nlh = NULL,
3430 		.fc_nlinfo.nl_net = net,
3431 	};
3432 
3433 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3434 	cfg.fc_dst = *prefix;
3435 	cfg.fc_gateway = *gwaddr;
3436 
3437 	/* We should treat it as a default route if prefix length is 0. */
3438 	if (!prefixlen)
3439 		cfg.fc_flags |= RTF_DEFAULT;
3440 
3441 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3442 
3443 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3444 }
3445 #endif
3446 
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	/* Find the RA-learned default route via gateway 'addr' on 'dev'.
	 * Returns it with a reference held, or NULL if none exists.
	 */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* the iterator macro assigns successive routes to 'rt'; it is
	 * NULL when the walk completes without a break
	 */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	if (rt)
		fib6_info_hold(rt);
	rcu_read_unlock();
	return rt;
}
3471 
3472 struct fib6_info *rt6_add_dflt_router(struct net *net,
3473 				     const struct in6_addr *gwaddr,
3474 				     struct net_device *dev,
3475 				     unsigned int pref)
3476 {
3477 	struct fib6_config cfg = {
3478 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3479 		.fc_metric	= IP6_RT_PRIO_USER,
3480 		.fc_ifindex	= dev->ifindex,
3481 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3482 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3483 		.fc_protocol = RTPROT_RA,
3484 		.fc_type = RTN_UNICAST,
3485 		.fc_nlinfo.portid = 0,
3486 		.fc_nlinfo.nlh = NULL,
3487 		.fc_nlinfo.nl_net = net,
3488 	};
3489 
3490 	cfg.fc_gateway = *gwaddr;
3491 
3492 	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3493 		struct fib6_table *table;
3494 
3495 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3496 		if (table)
3497 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3498 	}
3499 
3500 	return rt6_get_dflt_router(net, gwaddr, dev);
3501 }
3502 
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	/* Delete all RA-learned routes from 'table', except on interfaces
	 * with accept_ra == 2 (accept RAs even when forwarding).
	 */
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	/* the iterator macro assigns successive routes to 'rt' */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2)) {
			/* hold a reference so the route outlives the RCU
			 * section, delete it, then rescan from the top:
			 * the tree may have changed while unlocked
			 */
			fib6_info_hold(rt);
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3526 
3527 void rt6_purge_dflt_routers(struct net *net)
3528 {
3529 	struct fib6_table *table;
3530 	struct hlist_head *head;
3531 	unsigned int h;
3532 
3533 	rcu_read_lock();
3534 
3535 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3536 		head = &net->ipv6.fib_table_hash[h];
3537 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3538 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3539 				__rt6_purge_dflt_routers(net, table);
3540 		}
3541 	}
3542 
3543 	rcu_read_unlock();
3544 }
3545 
3546 static void rtmsg_to_fib6_config(struct net *net,
3547 				 struct in6_rtmsg *rtmsg,
3548 				 struct fib6_config *cfg)
3549 {
3550 	memset(cfg, 0, sizeof(*cfg));
3551 
3552 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3553 			 : RT6_TABLE_MAIN;
3554 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3555 	cfg->fc_metric = rtmsg->rtmsg_metric;
3556 	cfg->fc_expires = rtmsg->rtmsg_info;
3557 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3558 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
3559 	cfg->fc_flags = rtmsg->rtmsg_flags;
3560 	cfg->fc_type = rtmsg->rtmsg_type;
3561 
3562 	cfg->fc_nlinfo.nl_net = net;
3563 
3564 	cfg->fc_dst = rtmsg->rtmsg_dst;
3565 	cfg->fc_src = rtmsg->rtmsg_src;
3566 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
3567 }
3568 
3569 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3570 {
3571 	struct fib6_config cfg;
3572 	struct in6_rtmsg rtmsg;
3573 	int err;
3574 
3575 	switch (cmd) {
3576 	case SIOCADDRT:		/* Add a route */
3577 	case SIOCDELRT:		/* Delete a route */
3578 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3579 			return -EPERM;
3580 		err = copy_from_user(&rtmsg, arg,
3581 				     sizeof(struct in6_rtmsg));
3582 		if (err)
3583 			return -EFAULT;
3584 
3585 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3586 
3587 		rtnl_lock();
3588 		switch (cmd) {
3589 		case SIOCADDRT:
3590 			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3591 			break;
3592 		case SIOCDELRT:
3593 			err = ip6_route_del(&cfg, NULL);
3594 			break;
3595 		default:
3596 			err = -EINVAL;
3597 		}
3598 		rtnl_unlock();
3599 
3600 		return err;
3601 	}
3602 
3603 	return -EINVAL;
3604 }
3605 
3606 /*
3607  *	Drop the packet on the floor
3608  */
3609 
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	/* Common sink for reject routes: bump the appropriate SNMP counter,
	 * send an ICMPv6 destination-unreachable with the given code, and
	 * free the packet.  Always returns 0.
	 */
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* unspecified destination counts as an address
			 * error, not a routing failure
			 */
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3633 
3634 static int ip6_pkt_discard(struct sk_buff *skb)
3635 {
3636 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3637 }
3638 
3639 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3640 {
3641 	skb->dev = skb_dst(skb)->dev;
3642 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3643 }
3644 
3645 static int ip6_pkt_prohibit(struct sk_buff *skb)
3646 {
3647 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3648 }
3649 
3650 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3651 {
3652 	skb->dev = skb_dst(skb)->dev;
3653 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3654 }
3655 
3656 /*
3657  *	Allocate a dst for local (unicast / anycast) address.
3658  */
3659 
struct fib6_info *addrconf_f6i_alloc(struct net *net,
				     struct inet6_dev *idev,
				     const struct in6_addr *addr,
				     bool anycast, gfp_t gfp_flags)
{
	/* Build (but do not insert) the /128 host route for a local or
	 * anycast address.  Returns the fib6_info or ERR_PTR(-ENOMEM).
	 */
	u32 tb_id;
	struct net_device *dev = idev->dev;
	struct fib6_info *f6i;

	f6i = fib6_info_alloc(gfp_flags);
	if (!f6i)
		return ERR_PTR(-ENOMEM);

	/* local routes are not accounted and always match a full host */
	f6i->dst_nocount = true;
	f6i->dst_host = true;
	f6i->fib6_protocol = RTPROT_KERNEL;
	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast) {
		f6i->fib6_type = RTN_ANYCAST;
		f6i->fib6_flags |= RTF_ANYCAST;
	} else {
		f6i->fib6_type = RTN_LOCAL;
		f6i->fib6_flags |= RTF_LOCAL;
	}

	/* the "gateway" of a local route is the address itself */
	f6i->fib6_nh.nh_gw = *addr;
	dev_hold(dev);
	f6i->fib6_nh.nh_dev = dev;
	f6i->fib6_dst.addr = *addr;
	f6i->fib6_dst.plen = 128;
	/* enslaved devices keep their local routes in the l3mdev table */
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	f6i->fib6_table = fib6_get_table(net, tb_id);

	return f6i;
}
3695 
3696 /* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* device the address was removed from (NULL = any) */
	struct net *net;	/* namespace being walked */
	struct in6_addr *addr;	/* the removed preferred-source address */
};
3702 
3703 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3704 {
3705 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3706 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3707 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3708 
3709 	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3710 	    rt != net->ipv6.fib6_null_entry &&
3711 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3712 		spin_lock_bh(&rt6_exception_lock);
3713 		/* remove prefsrc entry */
3714 		rt->fib6_prefsrc.plen = 0;
3715 		/* need to update cache as well */
3716 		rt6_exceptions_remove_prefsrc(rt);
3717 		spin_unlock_bh(&rt6_exception_lock);
3718 	}
3719 	return 0;
3720 }
3721 
3722 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3723 {
3724 	struct net *net = dev_net(ifp->idev->dev);
3725 	struct arg_dev_net_ip adni = {
3726 		.dev = ifp->idev->dev,
3727 		.net = net,
3728 		.addr = &ifp->addr,
3729 	};
3730 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3731 }
3732 
3733 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3734 
3735 /* Remove routers and update dst entries when gateway turn into host. */
3736 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3737 {
3738 	struct in6_addr *gateway = (struct in6_addr *)arg;
3739 
3740 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3741 	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3742 		return -1;
3743 	}
3744 
3745 	/* Further clean up cached routes in exception table.
3746 	 * This is needed because cached route may have a different
3747 	 * gateway than its 'parent' in the case of an ip redirect.
3748 	 */
3749 	rt6_exceptions_clean_tohost(rt, gateway);
3750 
3751 	return 0;
3752 }
3753 
3754 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3755 {
3756 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3757 }
3758 
/* argument block shared by the fib6_clean_all() device-event walkers */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event concerns */
	union {
		unsigned int nh_flags;	/* fib6_ifup: nexthop flags to clear */
		unsigned long event;	/* fib6_ifdown: NETDEV_* event code */
	};
};
3766 
3767 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3768 {
3769 	struct fib6_info *iter;
3770 	struct fib6_node *fn;
3771 
3772 	fn = rcu_dereference_protected(rt->fib6_node,
3773 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3774 	iter = rcu_dereference_protected(fn->leaf,
3775 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3776 	while (iter) {
3777 		if (iter->fib6_metric == rt->fib6_metric &&
3778 		    rt6_qualify_for_ecmp(iter))
3779 			return iter;
3780 		iter = rcu_dereference_protected(iter->rt6_next,
3781 				lockdep_is_held(&rt->fib6_table->tb6_lock));
3782 	}
3783 
3784 	return NULL;
3785 }
3786 
3787 static bool rt6_is_dead(const struct fib6_info *rt)
3788 {
3789 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3790 	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3791 	     fib6_ignore_linkdown(rt)))
3792 		return true;
3793 
3794 	return false;
3795 }
3796 
3797 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3798 {
3799 	struct fib6_info *iter;
3800 	int total = 0;
3801 
3802 	if (!rt6_is_dead(rt))
3803 		total += rt->fib6_nh.nh_weight;
3804 
3805 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3806 		if (!rt6_is_dead(iter))
3807 			total += iter->fib6_nh.nh_weight;
3808 	}
3809 
3810 	return total;
3811 }
3812 
3813 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3814 {
3815 	int upper_bound = -1;
3816 
3817 	if (!rt6_is_dead(rt)) {
3818 		*weight += rt->fib6_nh.nh_weight;
3819 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3820 						    total) - 1;
3821 	}
3822 	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3823 }
3824 
3825 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3826 {
3827 	struct fib6_info *iter;
3828 	int weight = 0;
3829 
3830 	rt6_upper_bound_set(rt, &weight, total);
3831 
3832 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3833 		rt6_upper_bound_set(iter, &weight, total);
3834 }
3835 
3836 void rt6_multipath_rebalance(struct fib6_info *rt)
3837 {
3838 	struct fib6_info *first;
3839 	int total;
3840 
3841 	/* In case the entire multipath route was marked for flushing,
3842 	 * then there is no need to rebalance upon the removal of every
3843 	 * sibling route.
3844 	 */
3845 	if (!rt->fib6_nsiblings || rt->should_flush)
3846 		return;
3847 
3848 	/* During lookup routes are evaluated in order, so we need to
3849 	 * make sure upper bounds are assigned from the first sibling
3850 	 * onwards.
3851 	 */
3852 	first = rt6_multipath_first_sibling(rt);
3853 	if (WARN_ON_ONCE(!first))
3854 		return;
3855 
3856 	total = rt6_multipath_total_weight(first);
3857 	rt6_multipath_upper_bound_set(first, total);
3858 }
3859 
3860 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3861 {
3862 	const struct arg_netdev_event *arg = p_arg;
3863 	struct net *net = dev_net(arg->dev);
3864 
3865 	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3866 		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3867 		fib6_update_sernum_upto_root(net, rt);
3868 		rt6_multipath_rebalance(rt);
3869 	}
3870 
3871 	return 0;
3872 }
3873 
3874 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3875 {
3876 	struct arg_netdev_event arg = {
3877 		.dev = dev,
3878 		{
3879 			.nh_flags = nh_flags,
3880 		},
3881 	};
3882 
3883 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3884 		arg.nh_flags |= RTNH_F_LINKDOWN;
3885 
3886 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3887 }
3888 
3889 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3890 				   const struct net_device *dev)
3891 {
3892 	struct fib6_info *iter;
3893 
3894 	if (rt->fib6_nh.nh_dev == dev)
3895 		return true;
3896 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3897 		if (iter->fib6_nh.nh_dev == dev)
3898 			return true;
3899 
3900 	return false;
3901 }
3902 
3903 static void rt6_multipath_flush(struct fib6_info *rt)
3904 {
3905 	struct fib6_info *iter;
3906 
3907 	rt->should_flush = 1;
3908 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3909 		iter->should_flush = 1;
3910 }
3911 
3912 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3913 					     const struct net_device *down_dev)
3914 {
3915 	struct fib6_info *iter;
3916 	unsigned int dead = 0;
3917 
3918 	if (rt->fib6_nh.nh_dev == down_dev ||
3919 	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3920 		dead++;
3921 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3922 		if (iter->fib6_nh.nh_dev == down_dev ||
3923 		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3924 			dead++;
3925 
3926 	return dead;
3927 }
3928 
3929 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3930 				       const struct net_device *dev,
3931 				       unsigned int nh_flags)
3932 {
3933 	struct fib6_info *iter;
3934 
3935 	if (rt->fib6_nh.nh_dev == dev)
3936 		rt->fib6_nh.nh_flags |= nh_flags;
3937 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3938 		if (iter->fib6_nh.nh_dev == dev)
3939 			iter->fib6_nh.nh_flags |= nh_flags;
3940 }
3941 
3942 /* called with write lock held for table with rt */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	/* fib6_clean_all() callback for device down/unregister events.
	 * Return value protocol: 0 keeps the route, -1 asks the walker to
	 * delete it, -2 signals the whole multipath group was handled here
	 * (NOTE(review): confirm -2 semantics against fib6_clean_node()).
	 */
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* device is going away entirely: delete its routes */
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		/* single-path route: delete if it uses the downed device */
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			/* all nexthops dead: flush the whole group */
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			/* otherwise just disable the affected nexthops */
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* carrier change: flag linkdown, except for local/anycast
		 * routes which have no real egress nexthop
		 */
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
3985 
3986 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3987 {
3988 	struct arg_netdev_event arg = {
3989 		.dev = dev,
3990 		{
3991 			.event = event,
3992 		},
3993 	};
3994 
3995 	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3996 }
3997 
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	/* Tear down IPv6 routing state for a device going down, in order:
	 * first the FIB entries, then uncached dsts still pointing at the
	 * device, and finally its neighbour-table entries.
	 */
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4004 
/* argument block for rt6_mtu_change_route() */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU value */
};
4009 
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	/* fib6_clean_all() callback: propagate a device MTU change to one
	 * route (and its cached exception routes).  Always returns 0.
	 */
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* shrink unconditionally; grow only routes that tracked
		 * the old device MTU (mtu == idev->cnf.mtu6)
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		/* cached exception routes carry their own pmtu copies */
		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4044 
4045 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4046 {
4047 	struct rt6_mtu_change_arg arg = {
4048 		.dev = dev,
4049 		.mtu = mtu,
4050 	};
4051 
4052 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4053 }
4054 
/* netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE requests */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
};
4069 
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
	 * fib6_config.  Returns 0 on success or a negative errno.
	 */
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	/* header fields first; attributes below may override some */
	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* reject-style route types all map onto RTF_REJECT */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* only rtm_dst_len worth of bytes are meaningful */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* an explicit RTA_TABLE overrides rtm_table from the header */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* unknown preference values fall back to medium */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* infinite timeouts leave RTF_EXPIRES unset */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4195 
/* per-nexthop bookkeeping while building/installing a multipath route */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route created for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop config, kept for error paths */
	struct list_head next;		/* link in the local rt6_nh_list */
};
4201 
4202 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4203 {
4204 	struct rt6_nh *nh;
4205 
4206 	list_for_each_entry(nh, rt6_nh_list, next) {
4207 		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4208 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4209 		        nh->r_cfg.fc_ifindex);
4210 	}
4211 }
4212 
4213 static int ip6_route_info_append(struct net *net,
4214 				 struct list_head *rt6_nh_list,
4215 				 struct fib6_info *rt,
4216 				 struct fib6_config *r_cfg)
4217 {
4218 	struct rt6_nh *nh;
4219 	int err = -EEXIST;
4220 
4221 	list_for_each_entry(nh, rt6_nh_list, next) {
4222 		/* check if fib6_info already exists */
4223 		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4224 			return err;
4225 	}
4226 
4227 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4228 	if (!nh)
4229 		return -ENOMEM;
4230 	nh->fib6_info = rt;
4231 	err = ip6_convert_metrics(net, rt, r_cfg);
4232 	if (err) {
4233 		kfree(nh);
4234 		return err;
4235 	}
4236 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4237 	list_add_tail(&nh->next, rt6_nh_list);
4238 
4239 	return 0;
4240 }
4241 
/* Send the single RTM_NEWROUTE notification covering a multipath add,
 * replace or append.  'rt' may be NULL when nothing was inserted.
 */
static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
4262 
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	/* Install a multipath route: build one fib6_info per RTA_MULTIPATH
	 * nexthop, insert them one by one, and send a single netlink
	 * notification for the whole route.  On partial failure, already-
	 * installed nexthops are removed again.
	 */
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* start from the route-level config, then apply the
		 * per-nexthop overrides (ifindex, gateway, encap)
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		/* rtnh_hops is weight - 1 on the wire */
		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->fib6_info;
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->fib6_info;

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* release any fib6_infos not consumed by insertion */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4398 
4399 static int ip6_route_multipath_del(struct fib6_config *cfg,
4400 				   struct netlink_ext_ack *extack)
4401 {
4402 	struct fib6_config r_cfg;
4403 	struct rtnexthop *rtnh;
4404 	int remaining;
4405 	int attrlen;
4406 	int err = 1, last_err = 0;
4407 
4408 	remaining = cfg->fc_mp_len;
4409 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4410 
4411 	/* Parse a Multipath Entry */
4412 	while (rtnh_ok(rtnh, remaining)) {
4413 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4414 		if (rtnh->rtnh_ifindex)
4415 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4416 
4417 		attrlen = rtnh_attrlen(rtnh);
4418 		if (attrlen > 0) {
4419 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4420 
4421 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4422 			if (nla) {
4423 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4424 				r_cfg.fc_flags |= RTF_GATEWAY;
4425 			}
4426 		}
4427 		err = ip6_route_del(&r_cfg, extack);
4428 		if (err)
4429 			last_err = err;
4430 
4431 		rtnh = rtnh_next(rtnh, &remaining);
4432 	}
4433 
4434 	return last_err;
4435 }
4436 
4437 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4438 			      struct netlink_ext_ack *extack)
4439 {
4440 	struct fib6_config cfg;
4441 	int err;
4442 
4443 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4444 	if (err < 0)
4445 		return err;
4446 
4447 	if (cfg.fc_mp)
4448 		return ip6_route_multipath_del(&cfg, extack);
4449 	else {
4450 		cfg.fc_delete_all_nh = 1;
4451 		return ip6_route_del(&cfg, extack);
4452 	}
4453 }
4454 
4455 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4456 			      struct netlink_ext_ack *extack)
4457 {
4458 	struct fib6_config cfg;
4459 	int err;
4460 
4461 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4462 	if (err < 0)
4463 		return err;
4464 
4465 	if (cfg.fc_mp)
4466 		return ip6_route_multipath_add(&cfg, extack);
4467 	else
4468 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
4469 }
4470 
4471 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4472 {
4473 	int nexthop_len = 0;
4474 
4475 	if (rt->fib6_nsiblings) {
4476 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4477 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4478 			    + nla_total_size(16) /* RTA_GATEWAY */
4479 			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4480 
4481 		nexthop_len *= rt->fib6_nsiblings;
4482 	}
4483 
4484 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4485 	       + nla_total_size(16) /* RTA_SRC */
4486 	       + nla_total_size(16) /* RTA_DST */
4487 	       + nla_total_size(16) /* RTA_GATEWAY */
4488 	       + nla_total_size(16) /* RTA_PREFSRC */
4489 	       + nla_total_size(4) /* RTA_TABLE */
4490 	       + nla_total_size(4) /* RTA_IIF */
4491 	       + nla_total_size(4) /* RTA_OIF */
4492 	       + nla_total_size(4) /* RTA_PRIORITY */
4493 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4494 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4495 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4496 	       + nla_total_size(1) /* RTA_PREF */
4497 	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4498 	       + nexthop_len;
4499 }
4500 
/* Emit the nexthop attributes (gateway, oif, lwtunnel encap) for @rt
 * into @skb and accumulate RTNH_F_* status bits into @*flags.
 *
 * @skip_oif: true when emitting an RTA_MULTIPATH entry, where the
 *	      interface index travels in the rtnexthop struct rather than
 *	      in an RTA_OIF attribute.
 *
 * Returns 0 on success, -EMSGSIZE if the skb ran out of tailroom.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		/* NOTE(review): RCU appears to protect derefs inside
		 * fib6_ignore_linkdown() — confirm against its definition.
		 */
		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4539 
/* Add one multipath nexthop: reserve a rtnexthop header in @skb, emit
 * the per-nexthop attributes after it, then backfill the header's
 * flags and total length.  Returns 0 or -EMSGSIZE.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	/* Header must be reserved before the attributes so rtnh_len can
	 * be computed from the final skb position below.
	 */
	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	/* rtnh_hops is weight - 1 on the wire (0 means weight 1) */
	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4567 
/* Build one RTM_NEWROUTE/RTM_DELROUTE netlink message describing @rt.
 *
 * @dst:  optional dst_entry for a resolved route (getroute path);
 *	  when set, metrics/expiry/error come from it instead of @rt.
 * @dest: optional destination address; when set, RTA_DST carries it and
 *	  rtm_dst_len is forced to /128 (host match).
 * @src:  optional source address (subtrees), analogous to @dest.
 * @iif:  input interface index for getroute replies; 0 otherwise.
 *
 * Returns 0 on success, -EMSGSIZE if @skb cannot hold the message (the
 * partial message is cancelled).
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires = 0;
	u32 *pmetrics;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->fib6_dst.plen;
	rtm->rtm_src_len = rt->fib6_src.plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt->fib6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* A caller-supplied destination overrides the FIB prefix and is
	 * always reported as a full /128 host route.
	 */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* Multicast getroute replies are filled in by ip6mr;
		 * 0 means the reply is complete, <0 is a hard error.
		 */
		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* Prefer live metrics from the dst when available */
	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	/* Report remaining lifetime relative to now (may go negative) */
	if (rt->fib6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4700 
4701 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4702 {
4703 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4704 	struct net *net = arg->net;
4705 
4706 	if (rt == net->ipv6.fib6_null_entry)
4707 		return 0;
4708 
4709 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4710 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4711 
4712 		/* user wants prefix routes only */
4713 		if (rtm->rtm_flags & RTM_F_PREFIX &&
4714 		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
4715 			/* success since this is not a prefix route */
4716 			return 1;
4717 		}
4718 	}
4719 
4720 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4721 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4722 			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4723 }
4724 
4725 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4726 			      struct netlink_ext_ack *extack)
4727 {
4728 	struct net *net = sock_net(in_skb->sk);
4729 	struct nlattr *tb[RTA_MAX+1];
4730 	int err, iif = 0, oif = 0;
4731 	struct fib6_info *from;
4732 	struct dst_entry *dst;
4733 	struct rt6_info *rt;
4734 	struct sk_buff *skb;
4735 	struct rtmsg *rtm;
4736 	struct flowi6 fl6;
4737 	bool fibmatch;
4738 
4739 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4740 			  extack);
4741 	if (err < 0)
4742 		goto errout;
4743 
4744 	err = -EINVAL;
4745 	memset(&fl6, 0, sizeof(fl6));
4746 	rtm = nlmsg_data(nlh);
4747 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4748 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4749 
4750 	if (tb[RTA_SRC]) {
4751 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4752 			goto errout;
4753 
4754 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4755 	}
4756 
4757 	if (tb[RTA_DST]) {
4758 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4759 			goto errout;
4760 
4761 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4762 	}
4763 
4764 	if (tb[RTA_IIF])
4765 		iif = nla_get_u32(tb[RTA_IIF]);
4766 
4767 	if (tb[RTA_OIF])
4768 		oif = nla_get_u32(tb[RTA_OIF]);
4769 
4770 	if (tb[RTA_MARK])
4771 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4772 
4773 	if (tb[RTA_UID])
4774 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4775 					   nla_get_u32(tb[RTA_UID]));
4776 	else
4777 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4778 
4779 	if (iif) {
4780 		struct net_device *dev;
4781 		int flags = 0;
4782 
4783 		rcu_read_lock();
4784 
4785 		dev = dev_get_by_index_rcu(net, iif);
4786 		if (!dev) {
4787 			rcu_read_unlock();
4788 			err = -ENODEV;
4789 			goto errout;
4790 		}
4791 
4792 		fl6.flowi6_iif = iif;
4793 
4794 		if (!ipv6_addr_any(&fl6.saddr))
4795 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4796 
4797 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4798 
4799 		rcu_read_unlock();
4800 	} else {
4801 		fl6.flowi6_oif = oif;
4802 
4803 		dst = ip6_route_output(net, NULL, &fl6);
4804 	}
4805 
4806 
4807 	rt = container_of(dst, struct rt6_info, dst);
4808 	if (rt->dst.error) {
4809 		err = rt->dst.error;
4810 		ip6_rt_put(rt);
4811 		goto errout;
4812 	}
4813 
4814 	if (rt == net->ipv6.ip6_null_entry) {
4815 		err = rt->dst.error;
4816 		ip6_rt_put(rt);
4817 		goto errout;
4818 	}
4819 
4820 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4821 	if (!skb) {
4822 		ip6_rt_put(rt);
4823 		err = -ENOBUFS;
4824 		goto errout;
4825 	}
4826 
4827 	skb_dst_set(skb, &rt->dst);
4828 
4829 	rcu_read_lock();
4830 	from = rcu_dereference(rt->from);
4831 
4832 	if (fibmatch)
4833 		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4834 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4835 				    nlh->nlmsg_seq, 0);
4836 	else
4837 		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4838 				    &fl6.saddr, iif, RTM_NEWROUTE,
4839 				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4840 				    0);
4841 	rcu_read_unlock();
4842 
4843 	if (err < 0) {
4844 		kfree_skb(skb);
4845 		goto errout;
4846 	}
4847 
4848 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4849 errout:
4850 	return err;
4851 }
4852 
4853 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4854 		     unsigned int nlm_flags)
4855 {
4856 	struct sk_buff *skb;
4857 	struct net *net = info->nl_net;
4858 	u32 seq;
4859 	int err;
4860 
4861 	err = -ENOBUFS;
4862 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4863 
4864 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4865 	if (!skb)
4866 		goto errout;
4867 
4868 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4869 			    event, info->portid, seq, nlm_flags);
4870 	if (err < 0) {
4871 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4872 		WARN_ON(err == -EMSGSIZE);
4873 		kfree_skb(skb);
4874 		goto errout;
4875 	}
4876 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4877 		    info->nlh, gfp_any());
4878 	return;
4879 errout:
4880 	if (err < 0)
4881 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4882 }
4883 
/* Netdevice notifier: wire the special (null/prohibit/blackhole) route
 * entries to the loopback device when it registers, and drop their idev
 * references when it unregisters.  Only loopback events are of interest.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
4917 
4918 /*
4919  *	/proc
4920  */
4921 
4922 #ifdef CONFIG_PROC_FS
4923 
/* /proc/net/ipv6_route: seq_file ops; ipv6_route_open is defined
 * elsewhere in this file.
 */
static const struct file_operations ipv6_route_proc_fops = {
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
4930 
/* /proc/net/rt6_stats: one line of hex counters — fib nodes, route
 * nodes, allocated rt entries, route entries, cached routes, dst
 * entries, discarded routes.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
4945 
/* open() for /proc/net/rt6_stats: per-netns single-record seq file */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
4950 
/* /proc/net/rt6_stats file operations */
static const struct file_operations rt6_stats_seq_fops = {
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
4957 #endif	/* CONFIG_PROC_FS */
4958 
4959 #ifdef CONFIG_SYSCTL
4960 
4961 static
4962 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4963 			      void __user *buffer, size_t *lenp, loff_t *ppos)
4964 {
4965 	struct net *net;
4966 	int delay;
4967 	if (!write)
4968 		return -EINVAL;
4969 
4970 	net = (struct net *)ctl->extra1;
4971 	delay = net->ipv6.sysctl.flush_delay;
4972 	proc_dointvec(ctl, write, buffer, lenp, ppos);
4973 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4974 	return 0;
4975 }
4976 
/* Template for the per-netns net.ipv6.route.* sysctl table.  The .data
 * pointers reference init_net and are rewritten per namespace in
 * ipv6_route_sysctl_init(); keep entry ORDER in sync with the index
 * assignments there.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		/* write-only trigger (mode 0200), see
		 * ipv6_sysctl_rtcache_flush()
		 */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* millisecond view of gc_min_interval (same backing field) */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
5050 
5051 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5052 {
5053 	struct ctl_table *table;
5054 
5055 	table = kmemdup(ipv6_route_table_template,
5056 			sizeof(ipv6_route_table_template),
5057 			GFP_KERNEL);
5058 
5059 	if (table) {
5060 		table[0].data = &net->ipv6.sysctl.flush_delay;
5061 		table[0].extra1 = net;
5062 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5063 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5064 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5065 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5066 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5067 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5068 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5069 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5070 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5071 
5072 		/* Don't export sysctls to unprivileged users */
5073 		if (net->user_ns != &init_user_ns)
5074 			table[0].procname = NULL;
5075 	}
5076 
5077 	return table;
5078 }
5079 #endif
5080 
/* Per-netns setup: clone the dst ops template, allocate the special
 * null/prohibit/blackhole route entries, and seed the sysctl defaults.
 * Unwinds allocations in reverse order via the goto chain on failure.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* sysctl defaults; exported per-netns by ipv6_route_sysctl_init() */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwinding: free in reverse order of allocation */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5155 
/* Per-netns teardown: free the special route entries allocated by
 * ip6_route_net_init() and destroy the dst entry counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5166 
5167 static int __net_init ip6_route_net_init_late(struct net *net)
5168 {
5169 #ifdef CONFIG_PROC_FS
5170 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5171 	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5172 #endif
5173 	return 0;
5174 }
5175 
/* Late per-netns teardown: remove the /proc/net entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5183 
/* Core per-netns route state (special entries, dst ops, sysctl data) */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5188 
5189 static int __net_init ipv6_inetpeer_init(struct net *net)
5190 {
5191 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5192 
5193 	if (!bp)
5194 		return -ENOMEM;
5195 	inet_peer_base_init(bp);
5196 	net->ipv6.peers = bp;
5197 	return 0;
5198 }
5199 
/* Tear down the per-netns inet_peer base: detach the pointer first,
 * then invalidate the tree and free the base.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
5208 
/* Per-netns inet_peer base lifecycle */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5213 
/* Late per-netns setup (procfs entries) */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5218 
/* Runs after addrconf's notifier (lower priority) so idev state exists
 * when the special entries are wired up in ip6_route_dev_notify().
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5223 
/* Wire init_net's special route entries to the loopback device.  For
 * init_net, loopback registers before ip6_route_dev_notify() is in
 * place, so the NETDEV_REGISTER handling must be replayed manually.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
5239 
/* Boot-time initialization of the IPv6 routing subsystem: dst cache,
 * pernet ops, fib6/xfrm6/rules, rtnetlink handlers, device notifier,
 * and the per-cpu uncached-route lists.  The goto chain undoes each
 * step in reverse order on failure.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* blackhole dsts share the regular rt6_info slab */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwinding, reverse order of the setup above.
	 * rtnl_unregister_all() covers all three rtnl handlers, so the
	 * three rtnl failure points share one label.
	 */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5331 
/* Module unload: undo ip6_route_init() in reverse order.  rtnetlink
 * handlers registered with rtnl_register_module() are dropped
 * automatically on module unload and need no explicit unregister here.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
5344