xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_ftable.c (revision ed093b41a93e8563e6e1e5dae0768dda2a7bcc27)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2021 Racktop Systems, Inc.
24  */
25 
26 /*
27  * This file contains consumer routines of the IPv4 forwarding engine
28  */
29 
30 #include <sys/types.h>
31 #include <sys/stream.h>
32 #include <sys/stropts.h>
33 #include <sys/strlog.h>
34 #include <sys/dlpi.h>
35 #include <sys/ddi.h>
36 #include <sys/cmn_err.h>
37 #include <sys/policy.h>
38 
39 #include <sys/systm.h>
40 #include <sys/strsun.h>
41 #include <sys/kmem.h>
42 #include <sys/param.h>
43 #include <sys/socket.h>
44 #include <sys/strsubr.h>
45 #include <net/if.h>
46 #include <net/route.h>
47 #include <netinet/in.h>
48 #include <net/if_dl.h>
49 #include <netinet/ip6.h>
50 #include <netinet/icmp6.h>
51 
52 #include <inet/ipsec_impl.h>
53 #include <inet/common.h>
54 #include <inet/mi.h>
55 #include <inet/mib2.h>
56 #include <inet/ip.h>
57 #include <inet/ip_impl.h>
58 #include <inet/ip6.h>
59 #include <inet/ip_ndp.h>
60 #include <inet/arp.h>
61 #include <inet/ip_if.h>
62 #include <inet/ip_ire.h>
63 #include <inet/ip_ftable.h>
64 #include <inet/ip_rts.h>
65 #include <inet/nd.h>
66 
67 #include <net/pfkeyv2.h>
68 #include <inet/sadb.h>
69 #include <inet/tcp.h>
70 #include <inet/ipclassifier.h>
71 #include <sys/zone.h>
72 #include <net/radix.h>
73 #include <sys/tsol/label.h>
74 #include <sys/tsol/tnet.h>
75 
/*
 * True when the IRE is a default route: either it has IRE_DEFAULT set in
 * ire_type, or it is an IRE_INTERFACE whose ire_addr is 0.
 */
#define	IS_DEFAULT_ROUTE(ire)	\
	(((ire)->ire_type & IRE_DEFAULT) || \
	    (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))

/*
 * Pick the strict source multihoming setting for the address family out of
 * the given IP stack. Both arguments are parenthesized so that arbitrary
 * expressions (e.g. "&stack" or "a = b") expand with the intended grouping.
 */
#define	IP_SRC_MULTIHOMING(isv6, ipst)				\
	((isv6) ? (ipst)->ips_ipv6_strict_src_multihoming :	\
	(ipst)->ips_ip_strict_src_multihoming)
83 
/* Forward declarations of helpers that are defined later in this file. */
static ire_t	*route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
static void	ire_del_host_redir(ire_t *, char *);
static boolean_t ire_find_best_route(struct radix_node *, void *);
87 
88 /*
89  * Lookup a route in forwarding table. A specific lookup is indicated by
90  * passing the required parameters and indicating the match required in the
91  * flag field.
92  *
93  * Supports IP_BOUND_IF by following the ipif/ill when recursing.
94  */
95 ire_t *
96 ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
97     int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
98     int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
99 {
100 	ire_t *ire;
101 	struct rt_sockaddr rdst, rmask;
102 	struct rt_entry *rt;
103 	ire_ftable_args_t margs;
104 
105 	ASSERT(ill == NULL || !ill->ill_isv6);
106 
107 	/*
108 	 * ire_match_args() will dereference ill if MATCH_IRE_ILL
109 	 * is set.
110 	 */
111 	if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
112 		return (NULL);
113 
114 	bzero(&rdst, sizeof (rdst));
115 	rdst.rt_sin_len = sizeof (rdst);
116 	rdst.rt_sin_family = AF_INET;
117 	rdst.rt_sin_addr.s_addr = addr;
118 
119 	bzero(&rmask, sizeof (rmask));
120 	rmask.rt_sin_len = sizeof (rmask);
121 	rmask.rt_sin_family = AF_INET;
122 	rmask.rt_sin_addr.s_addr = mask;
123 
124 	bzero(&margs, sizeof (margs));
125 	margs.ift_addr = addr;
126 	margs.ift_mask = mask;
127 	margs.ift_gateway = gateway;
128 	margs.ift_type = type;
129 	margs.ift_ill = ill;
130 	margs.ift_zoneid = zoneid;
131 	margs.ift_tsl = tsl;
132 	margs.ift_flags = flags;
133 
134 	/*
135 	 * The flags argument passed to ire_ftable_lookup may cause the
136 	 * search to return, not the longest matching prefix, but the
137 	 * "best matching prefix", i.e., the longest prefix that also
138 	 * satisfies constraints imposed via the permutation of flags
139 	 * passed in. To achieve this, we invoke ire_match_args() on
140 	 * each matching leaf in the  radix tree. ire_match_args is
141 	 * invoked by the callback function ire_find_best_route()
142 	 * We hold the global tree lock in read mode when calling
143 	 * rn_match_args. Before dropping the global tree lock, ensure
144 	 * that the radix node can't be deleted by incrementing ire_refcnt.
145 	 */
146 	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
147 	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
148 	    ipst->ips_ip_ftable, ire_find_best_route, &margs);
149 	ire = margs.ift_best_ire;
150 	if (rt == NULL) {
151 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
152 		return (NULL);
153 	}
154 	ASSERT(ire != NULL);
155 
156 	DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);
157 
158 	/*
159 	 * round-robin only if we have more than one route in the bucket.
160 	 * ips_ip_ecmp_behavior controls when we do ECMP
161 	 *	2:	always
162 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
163 	 *	0:	never
164 	 */
165 	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
166 		if (ipst->ips_ip_ecmp_behavior == 2 ||
167 		    (ipst->ips_ip_ecmp_behavior == 1 &&
168 		    IS_DEFAULT_ROUTE(ire))) {
169 			ire_t	*next_ire;
170 
171 			margs.ift_best_ire = NULL;
172 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
173 			    xmit_hint, ire, ipst);
174 			if (next_ire == NULL) {
175 				/* keep ire if next_ire is null */
176 				goto done;
177 			}
178 			ire_refrele(ire);
179 			ire = next_ire;
180 		}
181 	}
182 
183 done:
184 	/* Return generation before dropping lock */
185 	if (generationp != NULL)
186 		*generationp = ire->ire_generation;
187 
188 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
189 
190 	/*
191 	 * For shared-IP zones we need additional checks to what was
192 	 * done in ire_match_args to make sure IRE_LOCALs are handled.
193 	 *
194 	 * When ip_restrict_interzone_loopback is set, then
195 	 * we ensure that IRE_LOCAL are only used for loopback
196 	 * between zones when the logical "Ethernet" would
197 	 * have looped them back. That is, if in the absense of
198 	 * the IRE_LOCAL we would have sent to packet out the
199 	 * same ill.
200 	 */
201 	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
202 	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
203 	    ipst->ips_ip_restrict_interzone_loopback) {
204 		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
205 		ASSERT(ire != NULL);
206 	}
207 	return (ire);
208 }
209 
210 /*
211  * This function is called by
212  * ip_input/ire_route_recursive when doing a route lookup on only the
213  * destination address.
214  *
215  * The optimizations of this function over ire_ftable_lookup are:
216  *	o removing unnecessary flag matching
217  *	o doing longest prefix match instead of overloading it further
218  *	  with the unnecessary "best_prefix_match"
219  *
220  * If no route is found we return IRE_NOROUTE.
221  */
222 ire_t *
223 ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst,
224     uint_t *generationp)
225 {
226 	ire_t *ire;
227 	struct rt_sockaddr rdst;
228 	struct rt_entry *rt;
229 	irb_t *irb;
230 
231 	rdst.rt_sin_len = sizeof (rdst);
232 	rdst.rt_sin_family = AF_INET;
233 	rdst.rt_sin_addr.s_addr = addr;
234 
235 	/*
236 	 * This is basically inlining  a simpler version of ire_match_args
237 	 */
238 	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
239 
240 	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
241 	    ipst->ips_ip_ftable, NULL, NULL);
242 
243 	if (rt == NULL)
244 		goto bad;
245 
246 	irb = &rt->rt_irb;
247 	if (irb->irb_ire_cnt == 0)
248 		goto bad;
249 
250 	rw_enter(&irb->irb_lock, RW_READER);
251 	ire = irb->irb_ire;
252 	if (ire == NULL) {
253 		rw_exit(&irb->irb_lock);
254 		goto bad;
255 	}
256 	while (IRE_IS_CONDEMNED(ire)) {
257 		ire = ire->ire_next;
258 		if (ire == NULL) {
259 			rw_exit(&irb->irb_lock);
260 			goto bad;
261 		}
262 	}
263 
264 	/* we have a ire that matches */
265 	ire_refhold(ire);
266 	rw_exit(&irb->irb_lock);
267 
268 	/*
269 	 * round-robin only if we have more than one route in the bucket.
270 	 * ips_ip_ecmp_behavior controls when we do ECMP
271 	 *	2:	always
272 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
273 	 *	0:	never
274 	 *
275 	 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
276 	 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
277 	 * and the IRE_INTERFACESs are likely to be shorter matches.
278 	 */
279 	if (ire->ire_bucket->irb_ire_cnt > 1) {
280 		if (ipst->ips_ip_ecmp_behavior == 2 ||
281 		    (ipst->ips_ip_ecmp_behavior == 1 &&
282 		    IS_DEFAULT_ROUTE(ire))) {
283 			ire_t	*next_ire;
284 			ire_ftable_args_t margs;
285 
286 			bzero(&margs, sizeof (margs));
287 			margs.ift_addr = addr;
288 			margs.ift_zoneid = ALL_ZONES;
289 
290 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
291 			    xmit_hint, ire, ipst);
292 			if (next_ire == NULL) {
293 				/* keep ire if next_ire is null */
294 				if (generationp != NULL)
295 					*generationp = ire->ire_generation;
296 				RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
297 				return (ire);
298 			}
299 			ire_refrele(ire);
300 			ire = next_ire;
301 		}
302 	}
303 	/* Return generation before dropping lock */
304 	if (generationp != NULL)
305 		*generationp = ire->ire_generation;
306 
307 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
308 
309 	/*
310 	 * Since we only did ALL_ZONES matches there is no special handling
311 	 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
312 	 */
313 	return (ire);
314 
315 bad:
316 	if (generationp != NULL)
317 		*generationp = IRE_GENERATION_VERIFY;
318 
319 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
320 	return (ire_reject(ipst, B_FALSE));
321 }
322 
323 /*
324  * Find the ill matching a multicast group.
325  * Allows different routes for multicast addresses
326  * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
327  * which point at different interfaces. This is used when IP_MULTICAST_IF
328  * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
329  * specify the interface to join on.
330  *
331  * Supports link-local addresses by using ire_route_recursive which follows
332  * the ill when recursing.
333  *
334  * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
335  * and the MULTIRT property can be different for different groups, we
336  * extract RTF_MULTIRT from the special unicast route added for a group
337  * with CGTP and pass that back in the multirtp argument.
338  * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
339  * We have a setsrcp argument for the same reason.
340  */
341 ill_t *
342 ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
343     boolean_t *multirtp, ipaddr_t *setsrcp)
344 {
345 	ire_t	*ire;
346 	ill_t	*ill;
347 
348 	ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL,
349 	    MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL);
350 	ASSERT(ire != NULL);
351 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
352 		ire_refrele(ire);
353 		return (NULL);
354 	}
355 
356 	if (multirtp != NULL)
357 		*multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
358 
359 	ill = ire_nexthop_ill(ire);
360 	ire_refrele(ire);
361 	return (ill);
362 }
363 
364 /*
365  * Delete the passed in ire if the gateway addr matches
366  */
367 void
368 ire_del_host_redir(ire_t *ire, char *gateway)
369 {
370 	if ((ire->ire_flags & RTF_DYNAMIC) &&
371 	    (ire->ire_gateway_addr == *(ipaddr_t *)gateway))
372 		ire_delete(ire);
373 }
374 
375 /*
376  * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
377  * pointing at the specified gateway and
378  * delete them. This routine is called only
379  * when a default gateway is going away.
380  */
381 void
382 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
383 {
384 	struct rtfuncarg rtfarg;
385 
386 	bzero(&rtfarg, sizeof (rtfarg));
387 	rtfarg.rt_func = ire_del_host_redir;
388 	rtfarg.rt_arg = (void *)&gateway;
389 	rtfarg.rt_zoneid = ALL_ZONES;
390 	rtfarg.rt_ipst = ipst;
391 	(void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
392 	    rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
393 }
394 
395 /*
396  * Obtain the rt_entry and rt_irb for the route to be added to
397  * the ips_ip_ftable.
398  * First attempt to add a node to the radix tree via rn_addroute. If the
399  * route already exists, return the bucket for the existing route.
400  *
401  * Locking notes: Need to hold the global radix tree lock in write mode to
402  * add a radix node. To prevent the node from being deleted, ire_get_bucket()
403  * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
404  * while holding the irb_lock, but not the radix tree lock.
405  */
406 irb_t *
407 ire_get_bucket(ire_t *ire)
408 {
409 	struct radix_node *rn;
410 	struct rt_entry *rt;
411 	struct rt_sockaddr rmask, rdst;
412 	irb_t *irb = NULL;
413 	ip_stack_t *ipst = ire->ire_ipst;
414 
415 	ASSERT(ipst->ips_ip_ftable != NULL);
416 
417 	/* first try to see if route exists (based on rtalloc1) */
418 	bzero(&rdst, sizeof (rdst));
419 	rdst.rt_sin_len = sizeof (rdst);
420 	rdst.rt_sin_family = AF_INET;
421 	rdst.rt_sin_addr.s_addr = ire->ire_addr;
422 
423 	bzero(&rmask, sizeof (rmask));
424 	rmask.rt_sin_len = sizeof (rmask);
425 	rmask.rt_sin_family = AF_INET;
426 	rmask.rt_sin_addr.s_addr = ire->ire_mask;
427 
428 	/*
429 	 * add the route. based on BSD's rtrequest1(RTM_ADD)
430 	 */
431 	R_Malloc(rt, rt_entry_cache,  sizeof (*rt));
432 	/* kmem_alloc failed */
433 	if (rt == NULL)
434 		return (NULL);
435 
436 	bzero(rt, sizeof (*rt));
437 	rt->rt_nodes->rn_key = (char *)&rt->rt_dst;
438 	rt->rt_dst = rdst;
439 	irb = &rt->rt_irb;
440 	irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */
441 	irb->irb_ipst = ipst;
442 	rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL);
443 	RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
444 	rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask,
445 	    ipst->ips_ip_ftable, (struct radix_node *)rt);
446 	if (rn == NULL) {
447 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
448 		Free(rt, rt_entry_cache);
449 		rt = NULL;
450 		irb = NULL;
451 		RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
452 		rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask,
453 		    ipst->ips_ip_ftable);
454 		if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
455 			/* found a non-root match */
456 			rt = (struct rt_entry *)rn;
457 		}
458 	}
459 	if (rt != NULL) {
460 		irb = &rt->rt_irb;
461 		irb_refhold(irb);
462 	}
463 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
464 	return (irb);
465 }
466 
467 /*
468  * This function is used when the caller wants to know the outbound
469  * interface for a packet given only the address.
470  * If this is a offlink IP address and there are multiple
471  * routes to this destination, this routine will utilise the
472  * first route it finds to IP address
473  * Return values:
474  *	0	- FAILURE
475  *	nonzero	- ifindex
476  */
477 uint_t
478 ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
479 {
480 	uint_t ifindex = 0;
481 	ire_t *ire;
482 	ill_t *ill;
483 	netstack_t *ns;
484 	ip_stack_t *ipst;
485 
486 	if (zoneid == ALL_ZONES)
487 		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
488 	else
489 		ns = netstack_find_by_zoneid(zoneid);
490 	ASSERT(ns != NULL);
491 
492 	/*
493 	 * For exclusive stacks we set the zoneid to zero
494 	 * since IP uses the global zoneid in the exclusive stacks.
495 	 */
496 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
497 		zoneid = GLOBAL_ZONEID;
498 	ipst = ns->netstack_ip;
499 
500 	ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6);
501 
502 	if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) {
503 		ill = ire_nexthop_ill(ire);
504 		if (ill != NULL) {
505 			ifindex = ill->ill_phyint->phyint_ifindex;
506 			ill_refrele(ill);
507 		}
508 		ire_refrele(ire);
509 	}
510 	netstack_rele(ns);
511 	return (ifindex);
512 }
513 
514 /*
515  * Routine to find the route to a destination. If a ifindex is supplied
516  * it tries to match the route to the corresponding ipif for the ifindex
517  */
518 static	ire_t *
519 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
520 {
521 	ire_t *ire = NULL;
522 	int match_flags;
523 
524 	match_flags = MATCH_IRE_DSTONLY;
525 
526 	/* XXX pass NULL tsl for now */
527 
528 	if (dst_addr->sa_family == AF_INET) {
529 		ire = ire_route_recursive_v4(
530 		    ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL,
531 		    zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
532 		    NULL, NULL);
533 	} else {
534 		ire = ire_route_recursive_v6(
535 		    &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL,
536 		    zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
537 		    NULL, NULL);
538 	}
539 	ASSERT(ire != NULL);
540 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
541 		ire_refrele(ire);
542 		return (NULL);
543 	}
544 	return (ire);
545 }
546 
547 /*
548  * This routine is called by IP Filter to send a packet out on the wire
549  * to a specified dstination (which may be onlink or offlink). The ifindex may
550  * or may not be 0. A non-null ifindex indicates IP Filter has stipulated
551  * an outgoing interface and requires the nexthop to be on that interface.
552  * IP WILL NOT DO the following to the data packet before sending it out:
553  *	a. manipulate ttl
554  *	b. ipsec work
555  *	c. fragmentation
556  *
557  * If the packet has been prepared for hardware checksum then it will be
558  * passed off to ip_send_align_cksum() to check that the flags set on the
559  * packet are in alignment with the capabilities of the new outgoing NIC.
560  *
561  * Return values:
562  *	0:		IP was able to send of the data pkt
563  *	ECOMM:		Could not send packet
564  *	ENONET		No route to dst. It is up to the caller
565  *			to send icmp unreachable error message,
566  *	EINPROGRESS	The macaddr of the onlink dst or that
567  *			of the offlink dst's nexthop needs to get
568  *			resolved before packet can be sent to dst.
569  *			Thus transmission is not guaranteed.
570  *			Note: No longer have visibility to the ARP queue
571  *			hence no EINPROGRESS.
572  */
573 int
574 ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
575     zoneid_t zoneid)
576 {
577 	ipaddr_t nexthop;
578 	netstack_t *ns;
579 	ip_stack_t *ipst;
580 	ip_xmit_attr_t ixas;
581 	int error;
582 
583 	ASSERT(mp != NULL);
584 
585 	if (zoneid == ALL_ZONES)
586 		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
587 	else
588 		ns = netstack_find_by_zoneid(zoneid);
589 	ASSERT(ns != NULL);
590 
591 	/*
592 	 * For exclusive stacks we set the zoneid to zero
593 	 * since IP uses the global zoneid in the exclusive stacks.
594 	 */
595 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
596 		zoneid = GLOBAL_ZONEID;
597 	ipst = ns->netstack_ip;
598 
599 	ASSERT(dst_addr->sa_family == AF_INET ||
600 	    dst_addr->sa_family == AF_INET6);
601 
602 	bzero(&ixas, sizeof (ixas));
603 	/*
604 	 * No IPsec, no fragmentation, and don't let any hooks see
605 	 * the packet.
606 	 */
607 	ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK;
608 	ixas.ixa_cred = kcred;
609 	ixas.ixa_cpid = NOPID;
610 	ixas.ixa_tsl = NULL;
611 	ixas.ixa_ipst = ipst;
612 	ixas.ixa_ifindex = ifindex;
613 
614 	if (dst_addr->sa_family == AF_INET) {
615 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
616 
617 		ixas.ixa_flags |= IXAF_IS_IPV4;
618 		nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
619 		if (nexthop != ipha->ipha_dst) {
620 			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
621 			ixas.ixa_nexthop_v4 = nexthop;
622 		}
623 		ixas.ixa_multicast_ttl = ipha->ipha_ttl;
624 	} else {
625 		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
626 		in6_addr_t *nexthop6;
627 
628 		nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr;
629 		if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) {
630 			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
631 			ixas.ixa_nexthop_v6 = *nexthop6;
632 		}
633 		ixas.ixa_multicast_ttl = ip6h->ip6_hops;
634 	}
635 	error = ip_output_simple(mp, &ixas);
636 	ixa_cleanup(&ixas);
637 
638 	netstack_rele(ns);
639 	switch (error) {
640 	case 0:
641 		break;
642 
643 	case EHOSTUNREACH:
644 	case ENETUNREACH:
645 		error = ENONET;
646 		break;
647 
648 	default:
649 		error = ECOMM;
650 		break;
651 	}
652 	return (error);
653 }
654 
655 /*
656  * callback function provided by ire_ftable_lookup when calling
657  * rn_match_args(). Invoke ire_match_args on each matching leaf node in
658  * the radix tree.
659  */
660 boolean_t
661 ire_find_best_route(struct radix_node *rn, void *arg)
662 {
663 	struct rt_entry *rt = (struct rt_entry *)rn;
664 	irb_t *irb_ptr;
665 	ire_t *ire;
666 	ire_ftable_args_t *margs = arg;
667 	ipaddr_t match_mask;
668 
669 	ASSERT(rt != NULL);
670 
671 	irb_ptr = &rt->rt_irb;
672 
673 	if (irb_ptr->irb_ire_cnt == 0)
674 		return (B_FALSE);
675 
676 	rw_enter(&irb_ptr->irb_lock, RW_READER);
677 	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
678 		if (IRE_IS_CONDEMNED(ire))
679 			continue;
680 		ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0);
681 		if (margs->ift_flags & MATCH_IRE_MASK)
682 			match_mask = margs->ift_mask;
683 		else
684 			match_mask = ire->ire_mask;
685 
686 		if (ire_match_args(ire, margs->ift_addr, match_mask,
687 		    margs->ift_gateway, margs->ift_type, margs->ift_ill,
688 		    margs->ift_zoneid, margs->ift_tsl,
689 		    margs->ift_flags)) {
690 			ire_refhold(ire);
691 			rw_exit(&irb_ptr->irb_lock);
692 			margs->ift_best_ire = ire;
693 			return (B_TRUE);
694 		}
695 	}
696 	rw_exit(&irb_ptr->irb_lock);
697 	return (B_FALSE);
698 }
699 
700 /*
701  * ftable irb_t structures are dynamically allocated, and we need to
702  * check if the irb_t (and associated ftable tree attachment) needs to
703  * be cleaned up when the irb_refcnt goes to 0. The conditions that need
704  * be verified are:
705  * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
706  * - no other threads holding references to ire's in the bucket,
707  *   i.e., irb_nire == 0
708  * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
709  * - need to hold the global tree lock and irb_lock in write mode.
710  */
711 void
712 irb_refrele_ftable(irb_t *irb)
713 {
714 	for (;;) {
715 		rw_enter(&irb->irb_lock, RW_WRITER);
716 		ASSERT(irb->irb_refcnt != 0);
717 		if (irb->irb_refcnt != 1) {
718 			/*
719 			 * Someone has a reference to this radix node
720 			 * or there is some bucket walker.
721 			 */
722 			irb->irb_refcnt--;
723 			rw_exit(&irb->irb_lock);
724 			return;
725 		} else {
726 			/*
727 			 * There is no other walker, nor is there any
728 			 * other thread that holds a direct ref to this
729 			 * radix node. Do the clean up if needed. Call
730 			 * to ire_unlink will clear the IRB_MARK_CONDEMNED flag
731 			 */
732 			if (irb->irb_marks & IRB_MARK_CONDEMNED)  {
733 				ire_t *ire_list;
734 
735 				ire_list = ire_unlink(irb);
736 				rw_exit(&irb->irb_lock);
737 
738 				if (ire_list != NULL)
739 					ire_cleanup(ire_list);
740 				/*
741 				 * more CONDEMNED entries could have
742 				 * been added while we dropped the lock,
743 				 * so we have to re-check.
744 				 */
745 				continue;
746 			}
747 
748 			/*
749 			 * Now check if there are still any ires
750 			 * associated with this radix node.
751 			 */
752 			if (irb->irb_nire != 0) {
753 				/*
754 				 * someone is still holding on
755 				 * to ires in this bucket
756 				 */
757 				irb->irb_refcnt--;
758 				rw_exit(&irb->irb_lock);
759 				return;
760 			} else {
761 				/*
762 				 * Everything is clear. Zero walkers,
763 				 * Zero threads with a ref to this
764 				 * radix node, Zero ires associated with
765 				 * this radix node. Due to lock order,
766 				 * check the above conditions again
767 				 * after grabbing all locks in the right order
768 				 */
769 				rw_exit(&irb->irb_lock);
770 				if (irb_inactive(irb))
771 					return;
772 				/*
773 				 * irb_inactive could not free the irb.
774 				 * See if there are any walkers, if not
775 				 * try to clean up again.
776 				 */
777 			}
778 		}
779 	}
780 }
781 
782 /*
783  * IRE iterator used by ire_ftable_lookup to process multiple equal
784  * routes. Given a starting point in the hash list (hash), walk the IREs
785  * in the bucket skipping deleted entries. We treat the bucket as a circular
786  * list for the purposes of walking it.
787  * Returns the IRE (held) that corresponds to the hash value. If that IRE is
788  * not applicable (ire_match_args failed) then it returns a subsequent one.
789  * If we fail to find an IRE we return NULL.
790  *
791  * Assumes that the caller holds a reference on the IRE bucket and a read lock
792  * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
793  *
794  * Applies to IPv4 and IPv6.
795  *
796  * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same
797  * address and bucket, we compare against ire_type for the orig_ire. We also
798  * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being
799  * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire.
800  *
801  * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is
802  * reachable from the zone i.e., that the ire_gateway_addr is in a subnet
803  * in which the zone has an IP address. We check this for the global zone
804  * even if no shared-IP zones are configured.
805  */
806 ire_t *
807 ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash,
808     ire_t *orig_ire, ip_stack_t *ipst)
809 {
810 	ire_t		*ire, *maybe_ire = NULL;
811 	uint_t		maybe_badcnt = 0;
812 	uint_t		maxwalk;
813 
814 	/* Fold in more bits from the hint/hash */
815 	hash = hash ^ (hash >> 8) ^ (hash >> 16);
816 
817 	rw_enter(&irb_ptr->irb_lock, RW_WRITER);
818 	maxwalk = irb_ptr->irb_ire_cnt;	/* Excludes condemned */
819 	if (maxwalk == 0) {
820 		rw_exit(&irb_ptr->irb_lock);
821 		return (NULL);
822 	}
823 
824 	hash %= maxwalk;
825 	irb_refhold_locked(irb_ptr);
826 	rw_exit(&irb_ptr->irb_lock);
827 
828 	/*
829 	 * Round-robin the routers list looking for a route that
830 	 * matches the passed in parameters.
831 	 * First we skip "hash" number of non-condemned IREs.
832 	 * Then we match the IRE.
833 	 * If we find an ire which has a non-zero ire_badcnt then we remember
834 	 * it and keep on looking for a lower ire_badcnt.
835 	 * If we come to the end of the list we continue (treat the
836 	 * bucket list as a circular list) but we match less than "max"
837 	 * entries.
838 	 */
839 	ire = irb_ptr->irb_ire;
840 	while (maxwalk > 0) {
841 		if (IRE_IS_CONDEMNED(ire))
842 			goto next_ire_skip;
843 
844 		/* Skip the first "hash" entries to do ECMP */
845 		if (hash != 0) {
846 			hash--;
847 			goto next_ire_skip;
848 		}
849 
850 		/* See CGTP comment above */
851 		if (ire->ire_type != orig_ire->ire_type ||
852 		    ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0)
853 			goto next_ire;
854 
855 		/*
856 		 * Note: Since IPv6 has hash buckets instead of radix
857 		 * buckers we need to explicitly compare the addresses.
858 		 * That makes this less efficient since we will be called
859 		 * even if there is no alternatives just because the
860 		 * bucket has multiple IREs for different addresses.
861 		 */
862 		if (ire->ire_ipversion == IPV6_VERSION) {
863 			if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6,
864 			    &ire->ire_addr_v6))
865 				goto next_ire;
866 		}
867 
868 		/*
869 		 * For some reason find_best_route uses ire_mask. We do
870 		 * the same.
871 		 */
872 		if (ire->ire_ipversion == IPV4_VERSION ?
873 		    !ire_match_args(ire, margs->ift_addr,
874 		    ire->ire_mask, margs->ift_gateway,
875 		    margs->ift_type, margs->ift_ill, margs->ift_zoneid,
876 		    margs->ift_tsl, margs->ift_flags) :
877 		    !ire_match_args_v6(ire, &margs->ift_addr_v6,
878 		    &ire->ire_mask_v6, &margs->ift_gateway_v6,
879 		    margs->ift_type, margs->ift_ill, margs->ift_zoneid,
880 		    margs->ift_tsl, margs->ift_flags))
881 			goto next_ire;
882 
883 		if (margs->ift_zoneid != ALL_ZONES &&
884 		    (ire->ire_type & IRE_OFFLINK)) {
885 			/*
886 			 * When we're in a zone, we're only
887 			 * interested in routers that are
888 			 * reachable through ipifs within our zone.
889 			 */
890 			if (ire->ire_ipversion == IPV4_VERSION) {
891 				if (!ire_gateway_ok_zone_v4(
892 				    ire->ire_gateway_addr, margs->ift_zoneid,
893 				    ire->ire_ill, margs->ift_tsl, ipst,
894 				    B_TRUE))
895 					goto next_ire;
896 			} else {
897 				if (!ire_gateway_ok_zone_v6(
898 				    &ire->ire_gateway_addr_v6,
899 				    margs->ift_zoneid, ire->ire_ill,
900 				    margs->ift_tsl, ipst, B_TRUE))
901 					goto next_ire;
902 			}
903 		}
904 		mutex_enter(&ire->ire_lock);
905 		/* Look for stale ire_badcnt and clear */
906 		if (ire->ire_badcnt != 0 &&
907 		    (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt >
908 		    ipst->ips_ip_ire_badcnt_lifetime))
909 			ire->ire_badcnt = 0;
910 		mutex_exit(&ire->ire_lock);
911 
912 		if (ire->ire_badcnt == 0) {
913 			/* We found one with a zero badcnt; done */
914 			ire_refhold(ire);
915 			/*
916 			 * Care needed since irb_refrele grabs WLOCK to free
917 			 * the irb_t.
918 			 */
919 			if (ire->ire_ipversion == IPV4_VERSION) {
920 				RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
921 				irb_refrele(irb_ptr);
922 				RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
923 			} else {
924 				rw_exit(&ipst->ips_ip6_ire_head_lock);
925 				irb_refrele(irb_ptr);
926 				rw_enter(&ipst->ips_ip6_ire_head_lock,
927 				    RW_READER);
928 			}
929 			return (ire);
930 		}
931 		/*
932 		 * keep looking to see if there is a better (lower
933 		 * badcnt) matching IRE, but save this one as a last resort.
934 		 * If we find a lower badcnt pick that one as the last* resort.
935 		 */
936 		if (maybe_ire == NULL) {
937 			maybe_ire = ire;
938 			maybe_badcnt = ire->ire_badcnt;
939 		} else if (ire->ire_badcnt < maybe_badcnt) {
940 			maybe_ire = ire;
941 			maybe_badcnt = ire->ire_badcnt;
942 		}
943 
944 next_ire:
945 		maxwalk--;
946 next_ire_skip:
947 		ire = ire->ire_next;
948 		if (ire == NULL)
949 			ire = irb_ptr->irb_ire;
950 	}
951 	if (maybe_ire != NULL)
952 		ire_refhold(maybe_ire);
953 
954 	/* Care needed since irb_refrele grabs WLOCK to free the irb_t. */
955 	if (ire->ire_ipversion == IPV4_VERSION) {
956 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
957 		irb_refrele(irb_ptr);
958 		RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
959 	} else {
960 		rw_exit(&ipst->ips_ip6_ire_head_lock);
961 		irb_refrele(irb_ptr);
962 		rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
963 	}
964 	return (maybe_ire);
965 }
966 
967 void
968 irb_refhold_rn(struct radix_node *rn)
969 {
970 	if ((rn->rn_flags & RNF_ROOT) == 0)
971 		irb_refhold(&((rt_t *)(rn))->rt_irb);
972 }
973 
974 void
975 irb_refrele_rn(struct radix_node *rn)
976 {
977 	if ((rn->rn_flags & RNF_ROOT) == 0)
978 		irb_refrele_ftable(&((rt_t *)(rn))->rt_irb);
979 }
980 
981 
982 /*
983  * ip_select_src_ill() is used by ip_select_route() to find the src_ill
984  * to be used for source-aware routing table lookup. This function will
985  * ignore IPIF_UNNUMBERED interface addresses, and will only return a
986  * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED
987  * interfaces).
988  */
989 static ill_t *
990 ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst)
991 {
992 	ipif_t *ipif;
993 	ill_t *ill;
994 	boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src);
995 	ipaddr_t v4src;
996 
997 	if (isv6) {
998 		ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst);
999 	} else {
1000 		IN6_V4MAPPED_TO_IPADDR(v6src, v4src);
1001 		ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst);
1002 	}
1003 	if (ipif == NULL)
1004 		return (NULL);
1005 	ill = ipif->ipif_ill;
1006 	ill_refhold(ill);
1007 	ipif_refrele(ipif);
1008 	return (ill);
1009 }
1010 
1011 /*
1012  * verify that v6src is configured on ill
1013  */
1014 static boolean_t
1015 ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid)
1016 {
1017 	ipif_t *ipif;
1018 	ip_stack_t *ipst;
1019 	ipaddr_t v4src;
1020 
1021 	if (ill == NULL)
1022 		return (B_FALSE);
1023 	ipst = ill->ill_ipst;
1024 
1025 	if (ill->ill_isv6) {
1026 		ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst);
1027 	} else {
1028 		IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
1029 		ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst);
1030 	}
1031 
1032 	if (ipif != NULL) {
1033 		ipif_refrele(ipif);
1034 		return (B_TRUE);
1035 	} else {
1036 		return (B_FALSE);
1037 	}
1038 }
1039 
1040 /*
1041  * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
1042  * routes this routine sets up a ire_nce_cache as well. The caller needs to
1043  * lookup an nce for the multicast case.
1044  *
1045  * When src_multihoming is set to 2 (strict src multihoming) we use the source
1046  * address to select the interface and route. If IP_BOUND_IF etc are
1047  * specified, we require that they specify an interface on which the
1048  * source address is assigned.
1049  *
1050  * When src_multihoming is set to 1 (preferred src aware route
1051  * selection)  the unicast lookup prefers a matching source
1052  * (i.e., that the route points out an ill on which the source is assigned), but
1053  * if no such route is found we fallback to not considering the source in the
1054  * route lookup.
1055  *
1056  * We skip the src_multihoming check when the source isn't (yet) set, and
1057  * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send
1058  * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO
1059  * when secpolicy_net_rawaccess().
1060  */
1061 ire_t *
1062 ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src,
1063     ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
1064     int *errorp, boolean_t *multirtp)
1065 {
1066 	uint_t		match_args;
1067 	uint_t		ire_type;
1068 	ill_t		*ill = NULL;
1069 	ire_t		*ire;
1070 	ip_stack_t	*ipst = ixa->ixa_ipst;
1071 	ipaddr_t	v4dst;
1072 	in6_addr_t	v6nexthop;
1073 	iaflags_t	ixaflags = ixa->ixa_flags;
1074 	nce_t		*nce;
1075 	boolean_t	preferred_src_aware = B_FALSE;
1076 	boolean_t	verify_src;
1077 	boolean_t	isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4);
1078 	int		src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst);
1079 
1080 	/*
1081 	 * We only verify that the src has been configured on a selected
1082 	 * interface if the src is not :: or INADDR_ANY, and if the
1083 	 * IXAF_VERIFY_SOURCE flag is set.
1084 	 */
1085 	verify_src = (!V6_OR_V4_INADDR_ANY(v6src) &&
1086 	    (ixa->ixa_flags & IXAF_VERIFY_SOURCE));
1087 
1088 	match_args = MATCH_IRE_SECATTR;
1089 	IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
1090 	if (setsrcp != NULL)
1091 		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
1092 	if (errorp != NULL)
1093 		ASSERT(*errorp == 0);
1094 
1095 	/*
1096 	 * The content of the ixa will be different if IP_NEXTHOP,
1097 	 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
1098 	 */
1099 
1100 	if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) {
1101 		/* Pick up the IRE_MULTICAST for the ill */
1102 		if (ixa->ixa_multicast_ifindex != 0) {
1103 			ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
1104 			    isv6, ipst);
1105 		} else if (ixaflags & IXAF_SCOPEID_SET) {
1106 			/* sin6_scope_id takes precedence over ixa_ifindex */
1107 			ASSERT(ixa->ixa_scopeid != 0);
1108 			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
1109 			    isv6, ipst);
1110 		} else if (ixa->ixa_ifindex != 0) {
1111 			/*
1112 			 * In the ipmp case, the ixa_ifindex is set to
1113 			 * point at an under_ill and we would return the
1114 			 * ire_multicast() corresponding to that under_ill.
1115 			 */
1116 			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
1117 			    isv6, ipst);
1118 		} else if (src_multihoming != 0 && verify_src) {
1119 			/* Look up the ill based on the source address */
1120 			ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
1121 			/*
1122 			 * Since we looked up the ill from the source there
1123 			 * is no need to verify that the source is on the ill
1124 			 * below.
1125 			 */
1126 			verify_src = B_FALSE;
1127 			if (ill != NULL && IS_VNI(ill)) {
1128 				ill_t *usesrc = ill;
1129 
1130 				ill = ill_lookup_usesrc(usesrc);
1131 				ill_refrele(usesrc);
1132 			}
1133 		} else if (!isv6) {
1134 			ipaddr_t	v4setsrc = INADDR_ANY;
1135 
1136 			ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid,
1137 			    ipst, multirtp, &v4setsrc);
1138 			if (setsrcp != NULL)
1139 				IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
1140 		} else {
1141 			ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid,
1142 			    ipst, multirtp, setsrcp);
1143 		}
1144 		if (ill != NULL && IS_VNI(ill)) {
1145 			ill_refrele(ill);
1146 			ill = NULL;
1147 		}
1148 		if (ill == NULL) {
1149 			if (errorp != NULL)
1150 				*errorp = ENXIO;
1151 			/* Get a hold on the IRE_NOROUTE */
1152 			ire = ire_reject(ipst, isv6);
1153 			return (ire);
1154 		}
1155 		if (!(ill->ill_flags & ILLF_MULTICAST)) {
1156 			ill_refrele(ill);
1157 			if (errorp != NULL)
1158 				*errorp = EHOSTUNREACH;
1159 			/* Get a hold on the IRE_NOROUTE */
1160 			ire = ire_reject(ipst, isv6);
1161 			return (ire);
1162 		}
1163 		/*
1164 		 * If we are doing the strictest src_multihoming, then
1165 		 * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify
1166 		 * an interface that is consistent with the source address.
1167 		 */
1168 		if (verify_src && src_multihoming == 2 &&
1169 		    !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
1170 			if (errorp != NULL)
1171 				*errorp = EADDRNOTAVAIL;
1172 			ill_refrele(ill);
1173 			/* Get a hold on the IRE_NOROUTE */
1174 			ire = ire_reject(ipst, isv6);
1175 			return (ire);
1176 		}
1177 		/* Get a refcnt on the single IRE_MULTICAST per ill */
1178 		ire = ire_multicast(ill);
1179 		ill_refrele(ill);
1180 		if (generationp != NULL)
1181 			*generationp = ire->ire_generation;
1182 		if (errorp != NULL &&
1183 		    (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
1184 			*errorp = EHOSTUNREACH;
1185 		}
1186 		return (ire);
1187 	}
1188 
1189 	/* Now for unicast and broadcast */
1190 	if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
1191 		if (ixaflags & IXAF_SCOPEID_SET) {
1192 			/* sin6_scope_id takes precedence over ixa_ifindex */
1193 			ASSERT(ixa->ixa_scopeid != 0);
1194 			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
1195 			    isv6, ipst);
1196 		} else {
1197 			ASSERT(ixa->ixa_ifindex != 0);
1198 			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
1199 			    isv6, ipst);
1200 		}
1201 		if (ill != NULL && IS_VNI(ill)) {
1202 			ill_refrele(ill);
1203 			ill = NULL;
1204 		}
1205 		if (ill == NULL) {
1206 			if (errorp != NULL)
1207 				*errorp = ENXIO;
1208 			/* Get a hold on the IRE_NOROUTE */
1209 			ire = ire_reject(ipst, isv6);
1210 			return (ire);
1211 		}
1212 
1213 		match_args |= MATCH_IRE_ILL;
1214 
1215 		/*
1216 		 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
1217 		 * so for both of them we need to be able look for an under
1218 		 * interface.
1219 		 */
1220 		if (IS_UNDER_IPMP(ill))
1221 			match_args |= MATCH_IRE_TESTHIDDEN;
1222 
1223 		/*
1224 		 * If we are doing the strictest src_multihoming, then
1225 		 * we check that IP_BOUND_IF, IP_PKTINFO, etc specify
1226 		 * an interface that is consistent with the source address.
1227 		 */
1228 		if (verify_src && src_multihoming == 2 &&
1229 		    !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
1230 			if (errorp != NULL)
1231 				*errorp = EADDRNOTAVAIL;
1232 			ill_refrele(ill);
1233 			/* Get a hold on the IRE_NOROUTE */
1234 			ire = ire_reject(ipst, isv6);
1235 			return (ire);
1236 		}
1237 	} else if (src_multihoming != 0 && verify_src) {
1238 		/* Look up the ill based on the source address */
1239 		ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
1240 		if (ill == NULL) {
1241 			char addrbuf[INET6_ADDRSTRLEN];
1242 
1243 			ip3dbg(("%s not a valid src for unicast",
1244 			    inet_ntop(AF_INET6, &v6src, addrbuf,
1245 			    sizeof (addrbuf))));
1246 			if (errorp != NULL)
1247 				*errorp = EADDRNOTAVAIL;
1248 			/* Get a hold on the IRE_NOROUTE */
1249 			ire = ire_reject(ipst, isv6);
1250 			return (ire);
1251 		}
1252 		match_args |= MATCH_IRE_SRC_ILL;
1253 		preferred_src_aware = (src_multihoming == 1);
1254 	}
1255 
1256 	if (ixaflags & IXAF_NEXTHOP_SET) {
1257 		/* IP_NEXTHOP was set */
1258 		v6nexthop = ixa->ixa_nexthop_v6;
1259 	} else {
1260 		v6nexthop = *v6dst;
1261 	}
1262 
1263 	ire_type = 0;
1264 
1265 	/*
1266 	 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
1267 	 * we only look for an onlink IRE.
1268 	 */
1269 	if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) {
1270 		match_args |= MATCH_IRE_TYPE;
1271 		ire_type = IRE_ONLINK;
1272 	}
1273 
1274 retry:
1275 	if (!isv6) {
1276 		ipaddr_t	v4nexthop;
1277 		ipaddr_t	v4setsrc = INADDR_ANY;
1278 
1279 		IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop);
1280 		ire = ire_route_recursive_v4(v4nexthop, ire_type, ill,
1281 		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
1282 		    ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp);
1283 		if (setsrcp != NULL)
1284 			IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
1285 	} else {
1286 		ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill,
1287 		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
1288 		    ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp);
1289 	}
1290 
1291 #ifdef DEBUG
1292 	if (match_args & MATCH_IRE_TESTHIDDEN) {
1293 		ip3dbg(("looking for hidden; dst %x ire %p\n",
1294 		    v4dst, (void *)ire));
1295 	}
1296 #endif
1297 	if (ill != NULL) {
1298 		ill_refrele(ill);
1299 		ill = NULL;
1300 	}
1301 	if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1302 	    (ire->ire_type & IRE_MULTICAST)) {
1303 		if (preferred_src_aware) {
1304 			/*
1305 			 * "Preferred Source Aware" send mode. If we cannot
1306 			 * find an ire whose ire_ill had the desired source
1307 			 * address retry after relaxing the ill matching
1308 			 * constraint.
1309 			 */
1310 			ire_refrele(ire);
1311 			preferred_src_aware = B_FALSE;
1312 			match_args &= ~MATCH_IRE_SRC_ILL;
1313 			goto retry;
1314 		}
1315 		/* No ire_nce_cache */
1316 		return (ire);
1317 	}
1318 
1319 	/* Setup ire_nce_cache if it doesn't exist or is condemned. */
1320 	mutex_enter(&ire->ire_lock);
1321 	nce = ire->ire_nce_cache;
1322 	if (nce == NULL || nce->nce_is_condemned) {
1323 		mutex_exit(&ire->ire_lock);
1324 		(void) ire_revalidate_nce(ire);
1325 	} else {
1326 		mutex_exit(&ire->ire_lock);
1327 	}
1328 	return (ire);
1329 }
1330 
1331 /*
1332  * Find a route given some xmit attributes and a packet.
1333  * Generic for IPv4 and IPv6
1334  *
1335  * This never returns NULL. But when it returns the IRE_NOROUTE
1336  * it might set errorp.
1337  */
1338 ire_t *
1339 ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp,
1340     int *errorp, boolean_t *multirtp)
1341 {
1342 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
1343 		ipha_t		*ipha = (ipha_t *)mp->b_rptr;
1344 		in6_addr_t	v6dst, v6src;
1345 
1346 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
1347 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
1348 
1349 		return (ip_select_route(&v6dst, v6src, ixa, generationp,
1350 		    NULL, errorp, multirtp));
1351 	} else {
1352 		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
1353 
1354 		return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src,
1355 		    ixa, generationp, NULL, errorp, multirtp));
1356 	}
1357 }
1358 
1359 ire_t *
1360 ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa,
1361     uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
1362 {
1363 	in6_addr_t	v6dst, v6src;
1364 	ire_t		*ire;
1365 	in6_addr_t	setsrc;
1366 
1367 	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
1368 
1369 	IN6_IPADDR_TO_V4MAPPED(dst, &v6dst);
1370 	IN6_IPADDR_TO_V4MAPPED(src, &v6src);
1371 
1372 	setsrc = ipv6_all_zeros;
1373 	ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp,
1374 	    multirtp);
1375 	if (v4setsrcp != NULL)
1376 		IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp);
1377 	return (ire);
1378 }
1379 
1380 /*
1381  * Recursively look for a route to the destination. Can also match on
1382  * the zoneid, ill, and label. Used for the data paths. See also
1383  * ire_route_recursive.
1384  *
1385  * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
1386  * create an IRE_IF_CLONE. This is used on the receive side when we are not
1387  * forwarding.
1388  * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
1389  * resolve the gateway.
1390  *
1391  * Note that this function never returns NULL. It returns an IRE_NOROUTE
1392  * instead.
1393  *
1394  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1395  * is an error.
1396  * Allow at most one RTF_INDIRECT.
1397  */
1398 ire_t *
1399 ire_route_recursive_impl_v4(ire_t *ire,
1400     ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
1401     zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1402     uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
1403     tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1404 {
1405 	int		i, j;
1406 	ire_t		*ires[MAX_IRE_RECURSION];
1407 	uint_t		generation;
1408 	uint_t		generations[MAX_IRE_RECURSION];
1409 	boolean_t	need_refrele = B_FALSE;
1410 	boolean_t	invalidate = B_FALSE;
1411 	ill_t		*ill = NULL;
1412 	uint_t		maskoff = (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST);
1413 
1414 	if (setsrcp != NULL)
1415 		ASSERT(*setsrcp == INADDR_ANY);
1416 	if (gwattrp != NULL)
1417 		ASSERT(*gwattrp == NULL);
1418 
1419 	/*
1420 	 * We iterate up to three times to resolve a route, even though
1421 	 * we have four slots in the array. The extra slot is for an
1422 	 * IRE_IF_CLONE we might need to create.
1423 	 */
1424 	i = 0;
1425 	while (i < MAX_IRE_RECURSION - 1) {
1426 		/* ire_ftable_lookup handles round-robin/ECMP */
1427 		if (ire == NULL) {
1428 			ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
1429 			    (ill != NULL? ill : ill_arg), zoneid, tsl,
1430 			    match_args, xmit_hint, ipst, &generation);
1431 		} else {
1432 			/* Caller passed it; extra hold since we will rele */
1433 			ire_refhold(ire);
1434 			if (generationp != NULL)
1435 				generation = *generationp;
1436 			else
1437 				generation = IRE_GENERATION_VERIFY;
1438 		}
1439 		if (ire == NULL) {
1440 			if (i > 0 && (irr_flags & IRR_INCOMPLETE)) {
1441 				ire = ires[0];
1442 				ire_refhold(ire);
1443 			} else {
1444 				ire = ire_reject(ipst, B_FALSE);
1445 			}
1446 			goto error;
1447 		}
1448 
1449 		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
1450 		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
1451 			goto error;
1452 
1453 		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
1454 		/*
1455 		 * Verify that the IRE_IF_CLONE has a consistent generation
1456 		 * number.
1457 		 */
1458 		if ((ire->ire_type & IRE_IF_CLONE) && !ire_clone_verify(ire)) {
1459 			ire_refrele(ire);
1460 			ire = NULL;
1461 			continue;
1462 		}
1463 
1464 		/*
1465 		 * Don't allow anything unusual past the first iteration.
1466 		 * After the first lookup, we should no longer look for
1467 		 * (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST) or RTF_INDIRECT
1468 		 * routes.
1469 		 *
1470 		 * In addition, after we have found a direct IRE_OFFLINK,
1471 		 * we should only look for interface or clone routes.
1472 		 */
1473 		match_args |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */
1474 
1475 		if ((ire->ire_type & IRE_OFFLINK) &&
1476 		    !(ire->ire_flags & RTF_INDIRECT)) {
1477 			ire_type = IRE_IF_ALL;
1478 		} else {
1479 			/*
1480 			 * no more local, loopback, broadcast routes
1481 			 */
1482 			if (!(match_args & MATCH_IRE_TYPE))
1483 				ire_type = (IRE_OFFLINK|IRE_ONLINK);
1484 			ire_type &= ~maskoff;
1485 		}
1486 		match_args |= MATCH_IRE_TYPE;
1487 
1488 		/* We have a usable IRE */
1489 		ires[i] = ire;
1490 		generations[i] = generation;
1491 		i++;
1492 
1493 		/* The first RTF_SETSRC address is passed back if setsrcp */
1494 		if ((ire->ire_flags & RTF_SETSRC) &&
1495 		    setsrcp != NULL && *setsrcp == INADDR_ANY) {
1496 			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
1497 			*setsrcp = ire->ire_setsrc_addr;
1498 		}
1499 
1500 		/* The first ire_gw_secattr is passed back if gwattrp */
1501 		if (ire->ire_gw_secattr != NULL &&
1502 		    gwattrp != NULL && *gwattrp == NULL)
1503 			*gwattrp = ire->ire_gw_secattr;
1504 
1505 		/*
1506 		 * Check if we have a short-cut pointer to an IRE for this
1507 		 * destination, and that the cached dependency isn't stale.
1508 		 * In that case we've rejoined an existing tree towards a
1509 		 * parent, thus we don't need to continue the loop to
1510 		 * discover the rest of the tree.
1511 		 */
1512 		mutex_enter(&ire->ire_lock);
1513 		if (ire->ire_dep_parent != NULL &&
1514 		    ire->ire_dep_parent->ire_generation ==
1515 		    ire->ire_dep_parent_generation) {
1516 			mutex_exit(&ire->ire_lock);
1517 			ire = NULL;
1518 			goto done;
1519 		}
1520 		mutex_exit(&ire->ire_lock);
1521 
1522 		/*
1523 		 * If this type should have an ire_nce_cache (even if it
1524 		 * doesn't yet have one) then we are done. Includes
1525 		 * IRE_INTERFACE with a full 32 bit mask.
1526 		 */
1527 		if (ire->ire_nce_capable) {
1528 			ire = NULL;
1529 			goto done;
1530 		}
1531 		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
1532 		/*
1533 		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
1534 		 * particular destination
1535 		 */
1536 		if (ire->ire_type & IRE_INTERFACE) {
1537 			in6_addr_t	v6nexthop;
1538 			ire_t		*clone;
1539 
1540 			ASSERT(ire->ire_masklen != IPV4_ABITS);
1541 
1542 			/*
1543 			 * In the case of ip_input and ILLF_FORWARDING not
1544 			 * being set, and in the case of RTM_GET, there is
1545 			 * no point in allocating an IRE_IF_CLONE. We return
1546 			 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
1547 			 * result in a ire_dep_parent which is IRE_IF_*
1548 			 * without an IRE_IF_CLONE.
1549 			 * We recover from that when we need to send packets
1550 			 * by ensuring that the generations become
1551 			 * IRE_GENERATION_VERIFY in this case.
1552 			 */
1553 			if (!(irr_flags & IRR_ALLOCATE)) {
1554 				invalidate = B_TRUE;
1555 				ire = NULL;
1556 				goto done;
1557 			}
1558 
1559 			IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);
1560 
1561 			clone = ire_create_if_clone(ire, &v6nexthop,
1562 			    &generation);
1563 			if (clone == NULL) {
1564 				/*
1565 				 * Temporary failure - no memory.
1566 				 * Don't want caller to cache IRE_NOROUTE.
1567 				 */
1568 				invalidate = B_TRUE;
1569 				ire = ire_blackhole(ipst, B_FALSE);
1570 				goto error;
1571 			}
1572 			/*
1573 			 * Make clone next to last entry and the
1574 			 * IRE_INTERFACE the last in the dependency
1575 			 * chain since the clone depends on the
1576 			 * IRE_INTERFACE.
1577 			 */
1578 			ASSERT(i >= 1);
1579 			ASSERT(i < MAX_IRE_RECURSION);
1580 
1581 			ires[i] = ires[i-1];
1582 			generations[i] = generations[i-1];
1583 			ires[i-1] = clone;
1584 			generations[i-1] = generation;
1585 			i++;
1586 
1587 			ire = NULL;
1588 			goto done;
1589 		}
1590 
1591 		/*
1592 		 * We only match on the type and optionally ILL when
1593 		 * recursing. The type match is used by some callers
1594 		 * to exclude certain types (such as IRE_IF_CLONE or
1595 		 * IRE_LOCAL|IRE_LOOPBACK).
1596 		 *
1597 		 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
1598 		 * ire->ire_ill, and we want to find the IRE_INTERFACE for
1599 		 * ire_ill, so we set ill to the ire_ill;
1600 		 */
1601 		match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT);
1602 		nexthop = ire->ire_gateway_addr;
1603 		if (ill == NULL && ire->ire_ill != NULL) {
1604 			ill = ire->ire_ill;
1605 			need_refrele = B_TRUE;
1606 			ill_refhold(ill);
1607 			match_args |= MATCH_IRE_ILL;
1608 		}
1609 		ire = NULL;
1610 	}
1611 	ASSERT(ire == NULL);
1612 	ire = ire_reject(ipst, B_FALSE);
1613 
1614 error:
1615 	ASSERT(ire != NULL);
1616 	if (need_refrele)
1617 		ill_refrele(ill);
1618 
1619 	/*
1620 	 * In the case of MULTIRT we want to try a different IRE the next
1621 	 * time. We let the next packet retry in that case.
1622 	 */
1623 	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
1624 		(void) ire_no_good(ires[0]);
1625 
1626 cleanup:
1627 	/* cleanup ires[i] */
1628 	ire_dep_unbuild(ires, i);
1629 	for (j = 0; j < i; j++)
1630 		ire_refrele(ires[j]);
1631 
1632 	ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1633 	    (irr_flags & IRR_INCOMPLETE));
1634 	/*
1635 	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
1636 	 * ip_select_route since the reject or lack of memory might be gone.
1637 	 */
1638 	if (generationp != NULL)
1639 		*generationp = IRE_GENERATION_VERIFY;
1640 	return (ire);
1641 
1642 done:
1643 	ASSERT(ire == NULL);
1644 	if (need_refrele) {
1645 		ill_refrele(ill);
1646 		ill = NULL;
1647 	}
1648 
1649 	/* Build dependencies */
1650 	if (i > 1 && !ire_dep_build(ires, generations, i)) {
1651 		/* Something in chain was condemned; tear it apart */
1652 		ire = ire_reject(ipst, B_FALSE);
1653 		goto cleanup;
1654 	}
1655 
1656 	/*
1657 	 * Release all refholds except the one for ires[0] that we
1658 	 * will return to the caller.
1659 	 */
1660 	for (j = 1; j < i; j++)
1661 		ire_refrele(ires[j]);
1662 
1663 	if (invalidate) {
1664 		/*
1665 		 * Since we needed to allocate but couldn't we need to make
1666 		 * sure that the dependency chain is rebuilt the next time.
1667 		 */
1668 		ire_dep_invalidate_generations(ires[0]);
1669 		generation = IRE_GENERATION_VERIFY;
1670 	} else {
1671 		/*
1672 		 * IREs can have been added or deleted while we did the
1673 		 * recursive lookup and we can't catch those until we've built
1674 		 * the dependencies. We verify the stored
1675 		 * ire_dep_parent_generation to catch any such changes and
1676 		 * return IRE_GENERATION_VERIFY (which will cause
1677 		 * ip_select_route to be called again so we can redo the
1678 		 * recursive lookup next time we send a packet.
1679 		 */
1680 		if (ires[0]->ire_dep_parent == NULL)
1681 			generation = ires[0]->ire_generation;
1682 		else
1683 			generation = ire_dep_validate_generations(ires[0]);
1684 		if (generations[0] != ires[0]->ire_generation) {
1685 			/* Something changed at the top */
1686 			generation = IRE_GENERATION_VERIFY;
1687 		}
1688 	}
1689 	if (generationp != NULL)
1690 		*generationp = generation;
1691 
1692 	return (ires[0]);
1693 }
1694 
1695 ire_t *
1696 ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
1697     zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1698     uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
1699     tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1700 {
1701 	return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
1702 	    zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
1703 	    gwattrp, generationp));
1704 }
1705 
1706 /*
1707  * Recursively look for a route to the destination.
1708  * We only handle a destination match here, yet we have the same arguments
1709  * as the full match to allow function pointers to select between the two.
1710  *
1711  * Note that this function never returns NULL. It returns an IRE_NOROUTE
1712  * instead.
1713  *
1714  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1715  * is an error.
1716  * Allow at most one RTF_INDIRECT.
1717  */
1718 ire_t *
1719 ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags,
1720     uint32_t xmit_hint, ip_stack_t *ipst)
1721 {
1722 	ire_t	*ire;
1723 	ire_t	*ire1;
1724 	uint_t	generation;
1725 
1726 	/* ire_ftable_lookup handles round-robin/ECMP */
1727 	ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst,
1728 	    &generation);
1729 	ASSERT(ire != NULL);
1730 	/*
1731 	 * If the IRE has a current cached parent we know that the whole
1732 	 * parent chain is current, hence we don't need to discover and
1733 	 * build any dependencies by doing a recursive lookup.
1734 	 */
1735 	mutex_enter(&ire->ire_lock);
1736 	if (ire->ire_dep_parent != NULL) {
1737 		if (ire->ire_dep_parent->ire_generation ==
1738 		    ire->ire_dep_parent_generation) {
1739 			mutex_exit(&ire->ire_lock);
1740 			return (ire);
1741 		}
1742 		mutex_exit(&ire->ire_lock);
1743 	} else {
1744 		mutex_exit(&ire->ire_lock);
1745 		/*
1746 		 * If this type should have an ire_nce_cache (even if it
1747 		 * doesn't yet have one) then we are done. Includes
1748 		 * IRE_INTERFACE with a full 32 bit mask.
1749 		 */
1750 		if (ire->ire_nce_capable)
1751 			return (ire);
1752 	}
1753 
1754 	/*
1755 	 * Fallback to loop in the normal code starting with the ire
1756 	 * we found. Normally this would return the same ire.
1757 	 */
1758 	ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES,
1759 	    NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL,
1760 	    &generation);
1761 	ire_refrele(ire);
1762 	return (ire1);
1763 }
1764 
1765 /*
1766  * Verify that the generation numbers in the chain leading to an IRE_IF_CLONE
1767  * are consistent. Return FALSE (and delete the IRE_IF_CLONE) if they
1768  * are not consistent, and TRUE otherwise.
1769  */
1770 boolean_t
1771 ire_clone_verify(ire_t *ire)
1772 {
1773 	ASSERT((ire->ire_type & IRE_IF_CLONE) != 0);
1774 	mutex_enter(&ire->ire_lock);
1775 	if (ire->ire_dep_parent != NULL &&
1776 	    ire->ire_dep_parent->ire_generation !=
1777 	    ire->ire_dep_parent_generation) {
1778 		mutex_exit(&ire->ire_lock);
1779 		ire_delete(ire);
1780 		return (B_FALSE);
1781 	}
1782 	mutex_exit(&ire->ire_lock);
1783 	return (B_TRUE);
1784 }
1785