xref: /titanic_50/usr/src/uts/common/inet/ip/ip_ftable.c (revision 6a1af1a67532df169a657cce07140be64bdea084)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * This file contains consumer routines of the IPv4 forwarding engine
27  */
28 
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/stropts.h>
32 #include <sys/strlog.h>
33 #include <sys/dlpi.h>
34 #include <sys/ddi.h>
35 #include <sys/cmn_err.h>
36 #include <sys/policy.h>
37 
38 #include <sys/systm.h>
39 #include <sys/strsun.h>
40 #include <sys/kmem.h>
41 #include <sys/param.h>
42 #include <sys/socket.h>
43 #include <sys/strsubr.h>
44 #include <net/if.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
47 #include <net/if_dl.h>
48 #include <netinet/ip6.h>
49 #include <netinet/icmp6.h>
50 
51 #include <inet/ipsec_impl.h>
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/mib2.h>
55 #include <inet/ip.h>
56 #include <inet/ip_impl.h>
57 #include <inet/ip6.h>
58 #include <inet/ip_ndp.h>
59 #include <inet/arp.h>
60 #include <inet/ip_if.h>
61 #include <inet/ip_ire.h>
62 #include <inet/ip_ftable.h>
63 #include <inet/ip_rts.h>
64 #include <inet/nd.h>
65 
66 #include <net/pfkeyv2.h>
67 #include <inet/sadb.h>
68 #include <inet/tcp.h>
69 #include <inet/ipclassifier.h>
70 #include <sys/zone.h>
71 #include <net/radix.h>
72 #include <sys/tsol/label.h>
73 #include <sys/tsol/tnet.h>
74 
/*
 * True when ire is a default route: either an IRE_DEFAULT entry, or an
 * IRE_INTERFACE entry whose ire_addr is 0.
 */
#define	IS_DEFAULT_ROUTE(ire)	\
	(((ire)->ire_type & IRE_DEFAULT) || \
	    (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))

/*
 * Pick the strict source multihoming setting of the stack for the given
 * address family: the IPv6 field when isv6 is non-zero, the IPv4 field
 * otherwise. Both arguments are parenthesized so that callers may pass
 * arbitrary expressions (e.g. "&stack" or "a || b") safely.
 */
#define	IP_SRC_MULTIHOMING(isv6, ipst) 			\
	((isv6) ? (ipst)->ips_ipv6_strict_src_multihoming :	\
	(ipst)->ips_ip_strict_src_multihoming)
82 
/*
 * Forward declarations of helpers that are defined further down in this
 * file.
 */
83 static ire_t	*route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
84 static void	ire_del_host_redir(ire_t *, char *);
85 static boolean_t ire_find_best_route(struct radix_node *, void *);
86 
87 /*
88  * Lookup a route in forwarding table. A specific lookup is indicated by
89  * passing the required parameters and indicating the match required in the
90  * flag field.
91  *
92  * Supports IP_BOUND_IF by following the ipif/ill when recursing.
93  */
94 ire_t *
95 ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
96     int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
97     int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
98 {
99 	ire_t *ire;
100 	struct rt_sockaddr rdst, rmask;
101 	struct rt_entry *rt;
102 	ire_ftable_args_t margs;
103 
104 	ASSERT(ill == NULL || !ill->ill_isv6);
105 
106 	/*
107 	 * ire_match_args() will dereference ill if MATCH_IRE_ILL
108 	 * is set.
109 	 */
110 	if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
111 		return (NULL);
112 
113 	bzero(&rdst, sizeof (rdst));
114 	rdst.rt_sin_len = sizeof (rdst);
115 	rdst.rt_sin_family = AF_INET;
116 	rdst.rt_sin_addr.s_addr = addr;
117 
118 	bzero(&rmask, sizeof (rmask));
119 	rmask.rt_sin_len = sizeof (rmask);
120 	rmask.rt_sin_family = AF_INET;
121 	rmask.rt_sin_addr.s_addr = mask;
122 
123 	bzero(&margs, sizeof (margs));
124 	margs.ift_addr = addr;
125 	margs.ift_mask = mask;
126 	margs.ift_gateway = gateway;
127 	margs.ift_type = type;
128 	margs.ift_ill = ill;
129 	margs.ift_zoneid = zoneid;
130 	margs.ift_tsl = tsl;
131 	margs.ift_flags = flags;
132 
133 	/*
134 	 * The flags argument passed to ire_ftable_lookup may cause the
135 	 * search to return, not the longest matching prefix, but the
136 	 * "best matching prefix", i.e., the longest prefix that also
137 	 * satisfies constraints imposed via the permutation of flags
138 	 * passed in. To achieve this, we invoke ire_match_args() on
139 	 * each matching leaf in the  radix tree. ire_match_args is
140 	 * invoked by the callback function ire_find_best_route()
141 	 * We hold the global tree lock in read mode when calling
142 	 * rn_match_args. Before dropping the global tree lock, ensure
143 	 * that the radix node can't be deleted by incrementing ire_refcnt.
144 	 */
145 	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
146 	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
147 	    ipst->ips_ip_ftable, ire_find_best_route, &margs);
148 	ire = margs.ift_best_ire;
149 	if (rt == NULL) {
150 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
151 		return (NULL);
152 	}
153 	ASSERT(ire != NULL);
154 
155 	DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);
156 
157 	/*
158 	 * round-robin only if we have more than one route in the bucket.
159 	 * ips_ip_ecmp_behavior controls when we do ECMP
160 	 *	2:	always
161 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
162 	 *	0:	never
163 	 */
164 	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
165 		if (ipst->ips_ip_ecmp_behavior == 2 ||
166 		    (ipst->ips_ip_ecmp_behavior == 1 &&
167 		    IS_DEFAULT_ROUTE(ire))) {
168 			ire_t	*next_ire;
169 
170 			margs.ift_best_ire = NULL;
171 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
172 			    xmit_hint, ire, ipst);
173 			if (next_ire == NULL) {
174 				/* keep ire if next_ire is null */
175 				goto done;
176 			}
177 			ire_refrele(ire);
178 			ire = next_ire;
179 		}
180 	}
181 
182 done:
183 	/* Return generation before dropping lock */
184 	if (generationp != NULL)
185 		*generationp = ire->ire_generation;
186 
187 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
188 
189 	/*
190 	 * For shared-IP zones we need additional checks to what was
191 	 * done in ire_match_args to make sure IRE_LOCALs are handled.
192 	 *
193 	 * When ip_restrict_interzone_loopback is set, then
194 	 * we ensure that IRE_LOCAL are only used for loopback
195 	 * between zones when the logical "Ethernet" would
196 	 * have looped them back. That is, if in the absense of
197 	 * the IRE_LOCAL we would have sent to packet out the
198 	 * same ill.
199 	 */
200 	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
201 	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
202 	    ipst->ips_ip_restrict_interzone_loopback) {
203 		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
204 		ASSERT(ire != NULL);
205 	}
206 	return (ire);
207 }
208 
209 /*
210  * This function is called by
211  * ip_input/ire_route_recursive when doing a route lookup on only the
212  * destination address.
213  *
214  * The optimizations of this function over ire_ftable_lookup are:
215  *	o removing unnecessary flag matching
216  *	o doing longest prefix match instead of overloading it further
217  *	  with the unnecessary "best_prefix_match"
218  *
219  * If no route is found we return IRE_NOROUTE.
220  */
221 ire_t *
222 ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst,
223     uint_t *generationp)
224 {
225 	ire_t *ire;
226 	struct rt_sockaddr rdst;
227 	struct rt_entry *rt;
228 	irb_t *irb;
229 
230 	rdst.rt_sin_len = sizeof (rdst);
231 	rdst.rt_sin_family = AF_INET;
232 	rdst.rt_sin_addr.s_addr = addr;
233 
234 	/*
235 	 * This is basically inlining  a simpler version of ire_match_args
236 	 */
237 	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
238 
239 	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
240 	    ipst->ips_ip_ftable, NULL, NULL);
241 
242 	if (rt == NULL)
243 		goto bad;
244 
245 	irb = &rt->rt_irb;
246 	if (irb->irb_ire_cnt == 0)
247 		goto bad;
248 
249 	rw_enter(&irb->irb_lock, RW_READER);
250 	ire = irb->irb_ire;
251 	if (ire == NULL) {
252 		rw_exit(&irb->irb_lock);
253 		goto bad;
254 	}
255 	while (IRE_IS_CONDEMNED(ire)) {
256 		ire = ire->ire_next;
257 		if (ire == NULL) {
258 			rw_exit(&irb->irb_lock);
259 			goto bad;
260 		}
261 	}
262 
263 	/* we have a ire that matches */
264 	ire_refhold(ire);
265 	rw_exit(&irb->irb_lock);
266 
267 	/*
268 	 * round-robin only if we have more than one route in the bucket.
269 	 * ips_ip_ecmp_behavior controls when we do ECMP
270 	 *	2:	always
271 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
272 	 *	0:	never
273 	 *
274 	 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
275 	 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
276 	 * and the IRE_INTERFACESs are likely to be shorter matches.
277 	 */
278 	if (ire->ire_bucket->irb_ire_cnt > 1) {
279 		if (ipst->ips_ip_ecmp_behavior == 2 ||
280 		    (ipst->ips_ip_ecmp_behavior == 1 &&
281 		    IS_DEFAULT_ROUTE(ire))) {
282 			ire_t	*next_ire;
283 			ire_ftable_args_t margs;
284 
285 			bzero(&margs, sizeof (margs));
286 			margs.ift_addr = addr;
287 			margs.ift_zoneid = ALL_ZONES;
288 
289 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
290 			    xmit_hint, ire, ipst);
291 			if (next_ire == NULL) {
292 				/* keep ire if next_ire is null */
293 				if (generationp != NULL)
294 					*generationp = ire->ire_generation;
295 				RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
296 				return (ire);
297 			}
298 			ire_refrele(ire);
299 			ire = next_ire;
300 		}
301 	}
302 	/* Return generation before dropping lock */
303 	if (generationp != NULL)
304 		*generationp = ire->ire_generation;
305 
306 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
307 
308 	/*
309 	 * Since we only did ALL_ZONES matches there is no special handling
310 	 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
311 	 */
312 	return (ire);
313 
314 bad:
315 	if (generationp != NULL)
316 		*generationp = IRE_GENERATION_VERIFY;
317 
318 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
319 	return (ire_reject(ipst, B_FALSE));
320 }
321 
322 /*
323  * Find the ill matching a multicast group.
324  * Allows different routes for multicast addresses
325  * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
326  * which point at different interfaces. This is used when IP_MULTICAST_IF
327  * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
328  * specify the interface to join on.
329  *
330  * Supports link-local addresses by using ire_route_recursive which follows
331  * the ill when recursing.
332  *
333  * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
334  * and the MULTIRT property can be different for different groups, we
335  * extract RTF_MULTIRT from the special unicast route added for a group
336  * with CGTP and pass that back in the multirtp argument.
337  * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
338  * We have a setsrcp argument for the same reason.
339  */
340 ill_t *
341 ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
342     boolean_t *multirtp, ipaddr_t *setsrcp)
343 {
344 	ire_t	*ire;
345 	ill_t	*ill;
346 
347 	ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL,
348 	    MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL);
349 	ASSERT(ire != NULL);
350 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
351 		ire_refrele(ire);
352 		return (NULL);
353 	}
354 
355 	if (multirtp != NULL)
356 		*multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
357 
358 	ill = ire_nexthop_ill(ire);
359 	ire_refrele(ire);
360 	return (ill);
361 }
362 
363 /*
364  * Delete the passed in ire if the gateway addr matches
365  */
366 void
367 ire_del_host_redir(ire_t *ire, char *gateway)
368 {
369 	if ((ire->ire_flags & RTF_DYNAMIC) &&
370 	    (ire->ire_gateway_addr == *(ipaddr_t *)gateway))
371 		ire_delete(ire);
372 }
373 
374 /*
375  * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
376  * pointing at the specified gateway and
377  * delete them. This routine is called only
378  * when a default gateway is going away.
379  */
380 void
381 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
382 {
383 	struct rtfuncarg rtfarg;
384 
385 	bzero(&rtfarg, sizeof (rtfarg));
386 	rtfarg.rt_func = ire_del_host_redir;
387 	rtfarg.rt_arg = (void *)&gateway;
388 	rtfarg.rt_zoneid = ALL_ZONES;
389 	rtfarg.rt_ipst = ipst;
390 	(void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
391 	    rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
392 }
393 
394 /*
395  * Obtain the rt_entry and rt_irb for the route to be added to
396  * the ips_ip_ftable.
397  * First attempt to add a node to the radix tree via rn_addroute. If the
398  * route already exists, return the bucket for the existing route.
399  *
400  * Locking notes: Need to hold the global radix tree lock in write mode to
401  * add a radix node. To prevent the node from being deleted, ire_get_bucket()
402  * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
403  * while holding the irb_lock, but not the radix tree lock.
404  */
405 irb_t *
406 ire_get_bucket(ire_t *ire)
407 {
408 	struct radix_node *rn;
409 	struct rt_entry *rt;
410 	struct rt_sockaddr rmask, rdst;
411 	irb_t *irb = NULL;
412 	ip_stack_t *ipst = ire->ire_ipst;
413 
414 	ASSERT(ipst->ips_ip_ftable != NULL);
415 
416 	/* first try to see if route exists (based on rtalloc1) */
417 	bzero(&rdst, sizeof (rdst));
418 	rdst.rt_sin_len = sizeof (rdst);
419 	rdst.rt_sin_family = AF_INET;
420 	rdst.rt_sin_addr.s_addr = ire->ire_addr;
421 
422 	bzero(&rmask, sizeof (rmask));
423 	rmask.rt_sin_len = sizeof (rmask);
424 	rmask.rt_sin_family = AF_INET;
425 	rmask.rt_sin_addr.s_addr = ire->ire_mask;
426 
427 	/*
428 	 * add the route. based on BSD's rtrequest1(RTM_ADD)
429 	 */
430 	R_Malloc(rt, rt_entry_cache,  sizeof (*rt));
431 	/* kmem_alloc failed */
432 	if (rt == NULL)
433 		return (NULL);
434 
435 	bzero(rt, sizeof (*rt));
436 	rt->rt_nodes->rn_key = (char *)&rt->rt_dst;
437 	rt->rt_dst = rdst;
438 	irb = &rt->rt_irb;
439 	irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */
440 	irb->irb_ipst = ipst;
441 	rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL);
442 	RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
443 	rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask,
444 	    ipst->ips_ip_ftable, (struct radix_node *)rt);
445 	if (rn == NULL) {
446 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
447 		Free(rt, rt_entry_cache);
448 		rt = NULL;
449 		irb = NULL;
450 		RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
451 		rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask,
452 		    ipst->ips_ip_ftable);
453 		if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
454 			/* found a non-root match */
455 			rt = (struct rt_entry *)rn;
456 		}
457 	}
458 	if (rt != NULL) {
459 		irb = &rt->rt_irb;
460 		irb_refhold(irb);
461 	}
462 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
463 	return (irb);
464 }
465 
466 /*
467  * This function is used when the caller wants to know the outbound
468  * interface for a packet given only the address.
469  * If this is a offlink IP address and there are multiple
470  * routes to this destination, this routine will utilise the
471  * first route it finds to IP address
472  * Return values:
473  * 	0	- FAILURE
474  *	nonzero	- ifindex
475  */
476 uint_t
477 ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
478 {
479 	uint_t ifindex = 0;
480 	ire_t *ire;
481 	ill_t *ill;
482 	netstack_t *ns;
483 	ip_stack_t *ipst;
484 
485 	if (zoneid == ALL_ZONES)
486 		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
487 	else
488 		ns = netstack_find_by_zoneid(zoneid);
489 	ASSERT(ns != NULL);
490 
491 	/*
492 	 * For exclusive stacks we set the zoneid to zero
493 	 * since IP uses the global zoneid in the exclusive stacks.
494 	 */
495 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
496 		zoneid = GLOBAL_ZONEID;
497 	ipst = ns->netstack_ip;
498 
499 	ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6);
500 
501 	if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) {
502 		ill = ire_nexthop_ill(ire);
503 		if (ill != NULL) {
504 			ifindex = ill->ill_phyint->phyint_ifindex;
505 			ill_refrele(ill);
506 		}
507 		ire_refrele(ire);
508 	}
509 	netstack_rele(ns);
510 	return (ifindex);
511 }
512 
513 /*
514  * Routine to find the route to a destination. If a ifindex is supplied
515  * it tries to match the route to the corresponding ipif for the ifindex
516  */
517 static	ire_t *
518 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
519 {
520 	ire_t *ire = NULL;
521 	int match_flags;
522 
523 	match_flags = MATCH_IRE_DSTONLY;
524 
525 	/* XXX pass NULL tsl for now */
526 
527 	if (dst_addr->sa_family == AF_INET) {
528 		ire = ire_route_recursive_v4(
529 		    ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL,
530 		    zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
531 		    NULL, NULL);
532 	} else {
533 		ire = ire_route_recursive_v6(
534 		    &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL,
535 		    zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
536 		    NULL, NULL);
537 	}
538 	ASSERT(ire != NULL);
539 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
540 		ire_refrele(ire);
541 		return (NULL);
542 	}
543 	return (ire);
544 }
545 
546 /*
547  * This routine is called by IP Filter to send a packet out on the wire
548  * to a specified dstination (which may be onlink or offlink). The ifindex may
549  * or may not be 0. A non-null ifindex indicates IP Filter has stipulated
550  * an outgoing interface and requires the nexthop to be on that interface.
551  * IP WILL NOT DO the following to the data packet before sending it out:
552  *	a. manipulate ttl
553  *	b. ipsec work
554  *	c. fragmentation
555  *
556  * If the packet has been prepared for hardware checksum then it will be
557  * passed off to ip_send_align_cksum() to check that the flags set on the
558  * packet are in alignment with the capabilities of the new outgoing NIC.
559  *
560  * Return values:
561  *	0:		IP was able to send of the data pkt
562  *	ECOMM:		Could not send packet
563  *	ENONET		No route to dst. It is up to the caller
564  *			to send icmp unreachable error message,
565  *	EINPROGRESS	The macaddr of the onlink dst or that
566  *			of the offlink dst's nexthop needs to get
567  *			resolved before packet can be sent to dst.
568  *			Thus transmission is not guaranteed.
569  *			Note: No longer have visibility to the ARP queue
570  *			hence no EINPROGRESS.
571  */
572 int
573 ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
574     zoneid_t zoneid)
575 {
576 	ipaddr_t nexthop;
577 	netstack_t *ns;
578 	ip_stack_t *ipst;
579 	ip_xmit_attr_t ixas;
580 	int error;
581 
582 	ASSERT(mp != NULL);
583 
584 	if (zoneid == ALL_ZONES)
585 		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
586 	else
587 		ns = netstack_find_by_zoneid(zoneid);
588 	ASSERT(ns != NULL);
589 
590 	/*
591 	 * For exclusive stacks we set the zoneid to zero
592 	 * since IP uses the global zoneid in the exclusive stacks.
593 	 */
594 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
595 		zoneid = GLOBAL_ZONEID;
596 	ipst = ns->netstack_ip;
597 
598 	ASSERT(dst_addr->sa_family == AF_INET ||
599 	    dst_addr->sa_family == AF_INET6);
600 
601 	bzero(&ixas, sizeof (ixas));
602 	/*
603 	 * No IPsec, no fragmentation, and don't let any hooks see
604 	 * the packet.
605 	 */
606 	ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK;
607 	ixas.ixa_cred = kcred;
608 	ixas.ixa_cpid = NOPID;
609 	ixas.ixa_tsl = NULL;
610 	ixas.ixa_ipst = ipst;
611 	ixas.ixa_ifindex = ifindex;
612 
613 	if (dst_addr->sa_family == AF_INET) {
614 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
615 
616 		ixas.ixa_flags |= IXAF_IS_IPV4;
617 		nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
618 		if (nexthop != ipha->ipha_dst) {
619 			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
620 			ixas.ixa_nexthop_v4 = nexthop;
621 		}
622 		ixas.ixa_multicast_ttl = ipha->ipha_ttl;
623 	} else {
624 		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
625 		in6_addr_t *nexthop6;
626 
627 		nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr;
628 		if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) {
629 			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
630 			ixas.ixa_nexthop_v6 = *nexthop6;
631 		}
632 		ixas.ixa_multicast_ttl = ip6h->ip6_hops;
633 	}
634 	error = ip_output_simple(mp, &ixas);
635 	ixa_cleanup(&ixas);
636 
637 	netstack_rele(ns);
638 	switch (error) {
639 	case 0:
640 		break;
641 
642 	case EHOSTUNREACH:
643 	case ENETUNREACH:
644 		error = ENONET;
645 		break;
646 
647 	default:
648 		error = ECOMM;
649 		break;
650 	}
651 	return (error);
652 }
653 
654 /*
655  * callback function provided by ire_ftable_lookup when calling
656  * rn_match_args(). Invoke ire_match_args on each matching leaf node in
657  * the radix tree.
658  */
659 boolean_t
660 ire_find_best_route(struct radix_node *rn, void *arg)
661 {
662 	struct rt_entry *rt = (struct rt_entry *)rn;
663 	irb_t *irb_ptr;
664 	ire_t *ire;
665 	ire_ftable_args_t *margs = arg;
666 	ipaddr_t match_mask;
667 
668 	ASSERT(rt != NULL);
669 
670 	irb_ptr = &rt->rt_irb;
671 
672 	if (irb_ptr->irb_ire_cnt == 0)
673 		return (B_FALSE);
674 
675 	rw_enter(&irb_ptr->irb_lock, RW_READER);
676 	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
677 		if (IRE_IS_CONDEMNED(ire))
678 			continue;
679 		ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0);
680 		if (margs->ift_flags & MATCH_IRE_MASK)
681 			match_mask = margs->ift_mask;
682 		else
683 			match_mask = ire->ire_mask;
684 
685 		if (ire_match_args(ire, margs->ift_addr, match_mask,
686 		    margs->ift_gateway, margs->ift_type, margs->ift_ill,
687 		    margs->ift_zoneid, margs->ift_tsl,
688 		    margs->ift_flags)) {
689 			ire_refhold(ire);
690 			rw_exit(&irb_ptr->irb_lock);
691 			margs->ift_best_ire = ire;
692 			return (B_TRUE);
693 		}
694 	}
695 	rw_exit(&irb_ptr->irb_lock);
696 	return (B_FALSE);
697 }
698 
699 /*
700  * ftable irb_t structures are dynamically allocated, and we need to
701  * check if the irb_t (and associated ftable tree attachment) needs to
702  * be cleaned up when the irb_refcnt goes to 0. The conditions that need
703  * be verified are:
704  * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
705  * - no other threads holding references to ire's in the bucket,
706  *   i.e., irb_nire == 0
707  * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
708  * - need to hold the global tree lock and irb_lock in write mode.
709  */
710 void
711 irb_refrele_ftable(irb_t *irb)
712 {
713 	for (;;) {
714 		rw_enter(&irb->irb_lock, RW_WRITER);
715 		ASSERT(irb->irb_refcnt != 0);
716 		if (irb->irb_refcnt != 1) {
717 			/*
718 			 * Someone has a reference to this radix node
719 			 * or there is some bucket walker.
720 			 */
721 			irb->irb_refcnt--;
722 			rw_exit(&irb->irb_lock);
723 			return;
724 		} else {
725 			/*
726 			 * There is no other walker, nor is there any
727 			 * other thread that holds a direct ref to this
728 			 * radix node. Do the clean up if needed. Call
729 			 * to ire_unlink will clear the IRB_MARK_CONDEMNED flag
730 			 */
731 			if (irb->irb_marks & IRB_MARK_CONDEMNED)  {
732 				ire_t *ire_list;
733 
734 				ire_list = ire_unlink(irb);
735 				rw_exit(&irb->irb_lock);
736 
737 				if (ire_list != NULL)
738 					ire_cleanup(ire_list);
739 				/*
740 				 * more CONDEMNED entries could have
741 				 * been added while we dropped the lock,
742 				 * so we have to re-check.
743 				 */
744 				continue;
745 			}
746 
747 			/*
748 			 * Now check if there are still any ires
749 			 * associated with this radix node.
750 			 */
751 			if (irb->irb_nire != 0) {
752 				/*
753 				 * someone is still holding on
754 				 * to ires in this bucket
755 				 */
756 				irb->irb_refcnt--;
757 				rw_exit(&irb->irb_lock);
758 				return;
759 			} else {
760 				/*
761 				 * Everything is clear. Zero walkers,
762 				 * Zero threads with a ref to this
763 				 * radix node, Zero ires associated with
764 				 * this radix node. Due to lock order,
765 				 * check the above conditions again
766 				 * after grabbing all locks in the right order
767 				 */
768 				rw_exit(&irb->irb_lock);
769 				if (irb_inactive(irb))
770 					return;
771 				/*
772 				 * irb_inactive could not free the irb.
773 				 * See if there are any walkers, if not
774 				 * try to clean up again.
775 				 */
776 			}
777 		}
778 	}
779 }
780 
781 /*
782  * IRE iterator used by ire_ftable_lookup to process multiple equal
783  * routes. Given a starting point in the hash list (hash), walk the IREs
784  * in the bucket skipping deleted entries. We treat the bucket as a circular
785  * list for the purposes of walking it.
786  * Returns the IRE (held) that corresponds to the hash value. If that IRE is
787  * not applicable (ire_match_args failed) then it returns a subsequent one.
788  * If we fail to find an IRE we return NULL.
789  *
790  * Assumes that the caller holds a reference on the IRE bucket and a read lock
791  * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
792  *
793  * Applies to IPv4 and IPv6.
794  *
795  * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same
796  * address and bucket, we compare against ire_type for the orig_ire. We also
797  * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being
798  * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire.
799  *
800  * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is
801  * reachable from the zone i.e., that the ire_gateway_addr is in a subnet
802  * in which the zone has an IP address. We check this for the global zone
803  * even if no shared-IP zones are configured.
804  */
805 ire_t *
806 ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash,
807     ire_t *orig_ire, ip_stack_t *ipst)
808 {
809 	ire_t		*ire, *maybe_ire = NULL;
810 	uint_t		maybe_badcnt;
811 	uint_t		maxwalk;
812 
813 	/* Fold in more bits from the hint/hash */
814 	hash = hash ^ (hash >> 8) ^ (hash >> 16);
815 
816 	rw_enter(&irb_ptr->irb_lock, RW_WRITER);
817 	maxwalk = irb_ptr->irb_ire_cnt;	/* Excludes condemned */
818 	hash %= maxwalk;
819 	irb_refhold_locked(irb_ptr);
820 	rw_exit(&irb_ptr->irb_lock);
821 
822 	/*
823 	 * Round-robin the routers list looking for a route that
824 	 * matches the passed in parameters.
825 	 * First we skip "hash" number of non-condemned IREs.
826 	 * Then we match the IRE.
827 	 * If we find an ire which has a non-zero ire_badcnt then we remember
828 	 * it and keep on looking for a lower ire_badcnt.
829 	 * If we come to the end of the list we continue (treat the
830 	 * bucket list as a circular list) but we match less than "max"
831 	 * entries.
832 	 */
833 	ire = irb_ptr->irb_ire;
834 	while (maxwalk > 0) {
835 		if (IRE_IS_CONDEMNED(ire))
836 			goto next_ire_skip;
837 
838 		/* Skip the first "hash" entries to do ECMP */
839 		if (hash != 0) {
840 			hash--;
841 			goto next_ire_skip;
842 		}
843 
844 		/* See CGTP comment above */
845 		if (ire->ire_type != orig_ire->ire_type ||
846 		    ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0)
847 			goto next_ire;
848 
849 		/*
850 		 * Note: Since IPv6 has hash buckets instead of radix
851 		 * buckers we need to explicitly compare the addresses.
852 		 * That makes this less efficient since we will be called
853 		 * even if there is no alternatives just because the
854 		 * bucket has multiple IREs for different addresses.
855 		 */
856 		if (ire->ire_ipversion == IPV6_VERSION) {
857 			if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6,
858 			    &ire->ire_addr_v6))
859 				goto next_ire;
860 		}
861 
862 		/*
863 		 * For some reason find_best_route uses ire_mask. We do
864 		 * the same.
865 		 */
866 		if (ire->ire_ipversion == IPV4_VERSION ?
867 		    !ire_match_args(ire, margs->ift_addr,
868 		    ire->ire_mask, margs->ift_gateway,
869 		    margs->ift_type, margs->ift_ill, margs->ift_zoneid,
870 		    margs->ift_tsl, margs->ift_flags) :
871 		    !ire_match_args_v6(ire, &margs->ift_addr_v6,
872 		    &ire->ire_mask_v6, &margs->ift_gateway_v6,
873 		    margs->ift_type, margs->ift_ill, margs->ift_zoneid,
874 		    margs->ift_tsl, margs->ift_flags))
875 			goto next_ire;
876 
877 		if (margs->ift_zoneid != ALL_ZONES &&
878 		    (ire->ire_type & IRE_OFFLINK)) {
879 			/*
880 			 * When we're in a zone, we're only
881 			 * interested in routers that are
882 			 * reachable through ipifs within our zone.
883 			 */
884 			if (ire->ire_ipversion == IPV4_VERSION) {
885 				if (!ire_gateway_ok_zone_v4(
886 				    ire->ire_gateway_addr, margs->ift_zoneid,
887 				    ire->ire_ill, margs->ift_tsl, ipst,
888 				    B_TRUE))
889 					goto next_ire;
890 			} else {
891 				if (!ire_gateway_ok_zone_v6(
892 				    &ire->ire_gateway_addr_v6,
893 				    margs->ift_zoneid, ire->ire_ill,
894 				    margs->ift_tsl, ipst, B_TRUE))
895 					goto next_ire;
896 			}
897 		}
898 		mutex_enter(&ire->ire_lock);
899 		/* Look for stale ire_badcnt and clear */
900 		if (ire->ire_badcnt != 0 &&
901 		    (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt >
902 		    ipst->ips_ip_ire_badcnt_lifetime))
903 			ire->ire_badcnt = 0;
904 		mutex_exit(&ire->ire_lock);
905 
906 		if (ire->ire_badcnt == 0) {
907 			/* We found one with a zero badcnt; done */
908 			ire_refhold(ire);
909 			/*
910 			 * Care needed since irb_refrele grabs WLOCK to free
911 			 * the irb_t.
912 			 */
913 			if (ire->ire_ipversion == IPV4_VERSION) {
914 				RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
915 				irb_refrele(irb_ptr);
916 				RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
917 			} else {
918 				rw_exit(&ipst->ips_ip6_ire_head_lock);
919 				irb_refrele(irb_ptr);
920 				rw_enter(&ipst->ips_ip6_ire_head_lock,
921 				    RW_READER);
922 			}
923 			return (ire);
924 		}
925 		/*
926 		 * keep looking to see if there is a better (lower
927 		 * badcnt) matching IRE, but save this one as a last resort.
928 		 * If we find a lower badcnt pick that one as the last* resort.
929 		 */
930 		if (maybe_ire == NULL) {
931 			maybe_ire = ire;
932 			maybe_badcnt = ire->ire_badcnt;
933 		} else if (ire->ire_badcnt < maybe_badcnt) {
934 			maybe_ire = ire;
935 			maybe_badcnt = ire->ire_badcnt;
936 		}
937 
938 next_ire:
939 		maxwalk--;
940 next_ire_skip:
941 		ire = ire->ire_next;
942 		if (ire == NULL)
943 			ire = irb_ptr->irb_ire;
944 	}
945 	if (maybe_ire != NULL)
946 		ire_refhold(maybe_ire);
947 
948 	/* Care needed since irb_refrele grabs WLOCK to free the irb_t. */
949 	if (ire->ire_ipversion == IPV4_VERSION) {
950 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
951 		irb_refrele(irb_ptr);
952 		RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
953 	} else {
954 		rw_exit(&ipst->ips_ip6_ire_head_lock);
955 		irb_refrele(irb_ptr);
956 		rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
957 	}
958 	return (maybe_ire);
959 }
960 
961 void
962 irb_refhold_rn(struct radix_node *rn)
963 {
964 	if ((rn->rn_flags & RNF_ROOT) == 0)
965 		irb_refhold(&((rt_t *)(rn))->rt_irb);
966 }
967 
968 void
969 irb_refrele_rn(struct radix_node *rn)
970 {
971 	if ((rn->rn_flags & RNF_ROOT) == 0)
972 		irb_refrele_ftable(&((rt_t *)(rn))->rt_irb);
973 }
974 
975 
976 /*
977  * ip_select_src_ill() is used by ip_select_route() to find the src_ill
978  * to be used for source-aware routing table lookup. This function will
979  * ignore IPIF_UNNUMBERED interface addresses, and will only return a
980  * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED
981  * interfaces).
982  */
983 static ill_t *
984 ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst)
985 {
986 	ipif_t *ipif;
987 	ill_t *ill;
988 	boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src);
989 	ipaddr_t v4src;
990 
991 	if (isv6) {
992 		ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst);
993 	} else {
994 		IN6_V4MAPPED_TO_IPADDR(v6src, v4src);
995 		ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst);
996 	}
997 	if (ipif == NULL)
998 		return (NULL);
999 	ill = ipif->ipif_ill;
1000 	ill_refhold(ill);
1001 	ipif_refrele(ipif);
1002 	return (ill);
1003 }
1004 
1005 /*
1006  * verify that v6src is configured on ill
1007  */
1008 static boolean_t
1009 ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid)
1010 {
1011 	ipif_t *ipif;
1012 	ip_stack_t *ipst;
1013 	ipaddr_t v4src;
1014 
1015 	if (ill == NULL)
1016 		return (B_FALSE);
1017 	ipst = ill->ill_ipst;
1018 
1019 	if (ill->ill_isv6) {
1020 		ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst);
1021 	} else {
1022 		IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
1023 		ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst);
1024 	}
1025 
1026 	if (ipif != NULL) {
1027 		ipif_refrele(ipif);
1028 		return (B_TRUE);
1029 	} else {
1030 		return (B_FALSE);
1031 	}
1032 }
1033 
1034 /*
1035  * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
1036  * routes this routine sets up a ire_nce_cache as well. The caller needs to
1037  * lookup an nce for the multicast case.
1038  *
1039  * When src_multihoming is set to 2 (strict src multihoming) we use the source
1040  * address to select the interface and route. If IP_BOUND_IF etc are
1041  * specified, we require that they specify an interface on which the
1042  * source address is assigned.
1043  *
1044  * When src_multihoming is set to 1 (preferred src aware route
1045  * selection)  the unicast lookup prefers a matching source
1046  * (i.e., that the route points out an ill on which the source is assigned), but
1047  * if no such route is found we fallback to not considering the source in the
1048  * route lookup.
1049  *
1050  * We skip the src_multihoming check when the source isn't (yet) set, and
1051  * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send
1052  * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO
1053  * when secpolicy_net_rawaccess().
1054  */
1055 ire_t *
1056 ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src,
1057     ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
1058     int *errorp, boolean_t *multirtp)
1059 {
1060 	uint_t		match_args;
1061 	uint_t		ire_type;
1062 	ill_t		*ill = NULL;
1063 	ire_t		*ire;
1064 	ip_stack_t	*ipst = ixa->ixa_ipst;
1065 	ipaddr_t	v4dst;
1066 	in6_addr_t	v6nexthop;
1067 	iaflags_t	ixaflags = ixa->ixa_flags;
1068 	nce_t		*nce;
1069 	boolean_t	preferred_src_aware = B_FALSE;
1070 	boolean_t	verify_src;
1071 	boolean_t	isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4);
1072 	int		src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst);
1073 
1074 	/*
1075 	 * We only verify that the src has been configured on a selected
1076 	 * interface if the src is not :: or INADDR_ANY, and if the
1077 	 * IXAF_VERIFY_SOURCE flag is set.
1078 	 */
1079 	verify_src = (!V6_OR_V4_INADDR_ANY(v6src) &&
1080 	    (ixa->ixa_flags & IXAF_VERIFY_SOURCE));
1081 
1082 	match_args = MATCH_IRE_SECATTR;
1083 	IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
1084 	if (setsrcp != NULL)
1085 		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
1086 	if (errorp != NULL)
1087 		ASSERT(*errorp == 0);
1088 
1089 	/*
1090 	 * The content of the ixa will be different if IP_NEXTHOP,
1091 	 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
1092 	 */
1093 
1094 	if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) {
1095 		/* Pick up the IRE_MULTICAST for the ill */
1096 		if (ixa->ixa_multicast_ifindex != 0) {
1097 			ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
1098 			    isv6, ipst);
1099 		} else if (ixaflags & IXAF_SCOPEID_SET) {
1100 			/* sin6_scope_id takes precedence over ixa_ifindex */
1101 			ASSERT(ixa->ixa_scopeid != 0);
1102 			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
1103 			    isv6, ipst);
1104 		} else if (ixa->ixa_ifindex != 0) {
1105 			/*
1106 			 * In the ipmp case, the ixa_ifindex is set to
1107 			 * point at an under_ill and we would return the
1108 			 * ire_multicast() corresponding to that under_ill.
1109 			 */
1110 			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
1111 			    isv6, ipst);
1112 		} else if (src_multihoming != 0 && verify_src) {
1113 			/* Look up the ill based on the source address */
1114 			ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
1115 			/*
1116 			 * Since we looked up the ill from the source there
1117 			 * is no need to verify that the source is on the ill
1118 			 * below.
1119 			 */
1120 			verify_src = B_FALSE;
1121 			if (ill != NULL && IS_VNI(ill)) {
1122 				ill_t *usesrc = ill;
1123 
1124 				ill = ill_lookup_usesrc(usesrc);
1125 				ill_refrele(usesrc);
1126 			}
1127 		} else if (!isv6) {
1128 			ipaddr_t	v4setsrc = INADDR_ANY;
1129 
1130 			ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid,
1131 			    ipst, multirtp, &v4setsrc);
1132 			if (setsrcp != NULL)
1133 				IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
1134 		} else {
1135 			ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid,
1136 			    ipst, multirtp, setsrcp);
1137 		}
1138 		if (ill != NULL && IS_VNI(ill)) {
1139 			ill_refrele(ill);
1140 			ill = NULL;
1141 		}
1142 		if (ill == NULL) {
1143 			if (errorp != NULL)
1144 				*errorp = ENXIO;
1145 			/* Get a hold on the IRE_NOROUTE */
1146 			ire = ire_reject(ipst, isv6);
1147 			return (ire);
1148 		}
1149 		if (!(ill->ill_flags & ILLF_MULTICAST)) {
1150 			ill_refrele(ill);
1151 			if (errorp != NULL)
1152 				*errorp = EHOSTUNREACH;
1153 			/* Get a hold on the IRE_NOROUTE */
1154 			ire = ire_reject(ipst, isv6);
1155 			return (ire);
1156 		}
1157 		/*
1158 		 * If we are doing the strictest src_multihoming, then
1159 		 * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify
1160 		 * an interface that is consistent with the source address.
1161 		 */
1162 		if (verify_src && src_multihoming == 2 &&
1163 		    !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
1164 			if (errorp != NULL)
1165 				*errorp = EADDRNOTAVAIL;
1166 			ill_refrele(ill);
1167 			/* Get a hold on the IRE_NOROUTE */
1168 			ire = ire_reject(ipst, isv6);
1169 			return (ire);
1170 		}
1171 		/* Get a refcnt on the single IRE_MULTICAST per ill */
1172 		ire = ire_multicast(ill);
1173 		ill_refrele(ill);
1174 		if (generationp != NULL)
1175 			*generationp = ire->ire_generation;
1176 		if (errorp != NULL &&
1177 		    (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
1178 			*errorp = EHOSTUNREACH;
1179 		}
1180 		return (ire);
1181 	}
1182 
1183 	/* Now for unicast */
1184 	if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
1185 		if (ixaflags & IXAF_SCOPEID_SET) {
1186 			/* sin6_scope_id takes precedence over ixa_ifindex */
1187 			ASSERT(ixa->ixa_scopeid != 0);
1188 			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
1189 			    isv6, ipst);
1190 		} else {
1191 			ASSERT(ixa->ixa_ifindex != 0);
1192 			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
1193 			    isv6, ipst);
1194 		}
1195 		if (ill != NULL && IS_VNI(ill)) {
1196 			ill_refrele(ill);
1197 			ill = NULL;
1198 		}
1199 		if (ill == NULL) {
1200 			if (errorp != NULL)
1201 				*errorp = ENXIO;
1202 			/* Get a hold on the IRE_NOROUTE */
1203 			ire = ire_reject(ipst, isv6);
1204 			return (ire);
1205 		}
1206 
1207 		match_args |= MATCH_IRE_ILL;
1208 
1209 		/*
1210 		 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
1211 		 * so for both of them we need to be able look for an under
1212 		 * interface.
1213 		 */
1214 		if (IS_UNDER_IPMP(ill))
1215 			match_args |= MATCH_IRE_TESTHIDDEN;
1216 
1217 		/*
1218 		 * If we are doing the strictest src_multihoming, then
1219 		 * we check that IP_BOUND_IF, IP_PKTINFO, etc specify
1220 		 * an interface that is consistent with the source address.
1221 		 */
1222 		if (src_multihoming == 2 &&
1223 		    !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
1224 			if (errorp != NULL)
1225 				*errorp = EADDRNOTAVAIL;
1226 			ill_refrele(ill);
1227 			/* Get a hold on the IRE_NOROUTE */
1228 			ire = ire_reject(ipst, isv6);
1229 			return (ire);
1230 		}
1231 	} else if (src_multihoming != 0 && verify_src) {
1232 		/* Look up the ill based on the source address */
1233 		ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
1234 		if (ill == NULL) {
1235 			char addrbuf[INET6_ADDRSTRLEN];
1236 
1237 			ip3dbg(("%s not a valid src for unicast",
1238 			    inet_ntop(AF_INET6, &v6src, addrbuf,
1239 			    sizeof (addrbuf))));
1240 			if (errorp != NULL)
1241 				*errorp = EADDRNOTAVAIL;
1242 			/* Get a hold on the IRE_NOROUTE */
1243 			ire = ire_reject(ipst, isv6);
1244 			return (ire);
1245 		}
1246 		match_args |= MATCH_IRE_SRC_ILL;
1247 		preferred_src_aware = (src_multihoming == 1);
1248 	}
1249 
1250 	if (ixaflags & IXAF_NEXTHOP_SET) {
1251 		/* IP_NEXTHOP was set */
1252 		v6nexthop = ixa->ixa_nexthop_v6;
1253 	} else {
1254 		v6nexthop = *v6dst;
1255 	}
1256 
1257 	ire_type = 0;
1258 
1259 	/*
1260 	 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
1261 	 * we only look for an onlink IRE.
1262 	 */
1263 	if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) {
1264 		match_args |= MATCH_IRE_TYPE;
1265 		ire_type = IRE_ONLINK;
1266 	}
1267 
1268 retry:
1269 	if (!isv6) {
1270 		ipaddr_t	v4nexthop;
1271 		ipaddr_t	v4setsrc = INADDR_ANY;
1272 
1273 		IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop);
1274 		ire = ire_route_recursive_v4(v4nexthop, ire_type, ill,
1275 		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
1276 		    ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp);
1277 		if (setsrcp != NULL)
1278 			IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
1279 	} else {
1280 		ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill,
1281 		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
1282 		    ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp);
1283 	}
1284 
1285 #ifdef DEBUG
1286 	if (match_args & MATCH_IRE_TESTHIDDEN) {
1287 		ip3dbg(("looking for hidden; dst %x ire %p\n",
1288 		    v4dst, (void *)ire));
1289 	}
1290 #endif
1291 	if (ill != NULL) {
1292 		ill_refrele(ill);
1293 		ill = NULL;
1294 	}
1295 	if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1296 	    (ire->ire_type & IRE_MULTICAST)) {
1297 		if (preferred_src_aware) {
1298 			/*
1299 			 * "Preferred Source Aware" send mode. If we cannot
1300 			 * find an ire whose ire_ill had the desired source
1301 			 * address retry after relaxing the ill matching
1302 			 * constraint.
1303 			 */
1304 			ire_refrele(ire);
1305 			preferred_src_aware = B_FALSE;
1306 			match_args &= ~MATCH_IRE_SRC_ILL;
1307 			goto retry;
1308 		}
1309 		/* No ire_nce_cache */
1310 		return (ire);
1311 	}
1312 
1313 	/* Setup ire_nce_cache if it doesn't exist or is condemned. */
1314 	mutex_enter(&ire->ire_lock);
1315 	nce = ire->ire_nce_cache;
1316 	if (nce == NULL || nce->nce_is_condemned) {
1317 		mutex_exit(&ire->ire_lock);
1318 		(void) ire_revalidate_nce(ire);
1319 	} else {
1320 		mutex_exit(&ire->ire_lock);
1321 	}
1322 	return (ire);
1323 }
1324 
1325 /*
1326  * Find a route given some xmit attributes and a packet.
1327  * Generic for IPv4 and IPv6
1328  *
1329  * This never returns NULL. But when it returns the IRE_NOROUTE
1330  * it might set errorp.
1331  */
1332 ire_t *
1333 ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp,
1334     int *errorp, boolean_t *multirtp)
1335 {
1336 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
1337 		ipha_t		*ipha = (ipha_t *)mp->b_rptr;
1338 		in6_addr_t	v6dst, v6src;
1339 
1340 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
1341 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
1342 
1343 		return (ip_select_route(&v6dst, v6src, ixa, generationp,
1344 		    NULL, errorp, multirtp));
1345 	} else {
1346 		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
1347 
1348 		return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src,
1349 		    ixa, generationp, NULL, errorp, multirtp));
1350 	}
1351 }
1352 
1353 ire_t *
1354 ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa,
1355     uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
1356 {
1357 	in6_addr_t	v6dst, v6src;
1358 	ire_t		*ire;
1359 	in6_addr_t	setsrc;
1360 
1361 	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
1362 
1363 	IN6_IPADDR_TO_V4MAPPED(dst, &v6dst);
1364 	IN6_IPADDR_TO_V4MAPPED(src, &v6src);
1365 
1366 	setsrc = ipv6_all_zeros;
1367 	ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp,
1368 	    multirtp);
1369 	if (v4setsrcp != NULL)
1370 		IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp);
1371 	return (ire);
1372 }
1373 
1374 /*
1375  * Recursively look for a route to the destination. Can also match on
1376  * the zoneid, ill, and label. Used for the data paths. See also
1377  * ire_route_recursive.
1378  *
1379  * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
1380  * create an IRE_IF_CLONE. This is used on the receive side when we are not
1381  * forwarding.
1382  * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
1383  * resolve the gateway.
1384  *
1385  * Note that this function never returns NULL. It returns an IRE_NOROUTE
1386  * instead.
1387  *
1388  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1389  * is an error.
1390  * Allow at most one RTF_INDIRECT.
1391  */
1392 ire_t *
1393 ire_route_recursive_impl_v4(ire_t *ire,
1394     ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
1395     zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1396     uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
1397     tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1398 {
1399 	int		i, j;
1400 	ire_t		*ires[MAX_IRE_RECURSION];
1401 	uint_t		generation;
1402 	uint_t		generations[MAX_IRE_RECURSION];
1403 	boolean_t	need_refrele = B_FALSE;
1404 	boolean_t	invalidate = B_FALSE;
1405 	ill_t		*ill = NULL;
1406 	uint_t		maskoff = (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST);
1407 
1408 	if (setsrcp != NULL)
1409 		ASSERT(*setsrcp == INADDR_ANY);
1410 	if (gwattrp != NULL)
1411 		ASSERT(*gwattrp == NULL);
1412 
1413 	/*
1414 	 * We iterate up to three times to resolve a route, even though
1415 	 * we have four slots in the array. The extra slot is for an
1416 	 * IRE_IF_CLONE we might need to create.
1417 	 */
1418 	i = 0;
1419 	while (i < MAX_IRE_RECURSION - 1) {
1420 		/* ire_ftable_lookup handles round-robin/ECMP */
1421 		if (ire == NULL) {
1422 			ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
1423 			    (ill != NULL? ill : ill_arg), zoneid, tsl,
1424 			    match_args, xmit_hint, ipst, &generation);
1425 		} else {
1426 			/* Caller passed it; extra hold since we will rele */
1427 			ire_refhold(ire);
1428 			if (generationp != NULL)
1429 				generation = *generationp;
1430 			else
1431 				generation = IRE_GENERATION_VERIFY;
1432 		}
1433 		if (ire == NULL) {
1434 			if (i > 0 && (irr_flags & IRR_INCOMPLETE)) {
1435 				ire = ires[0];
1436 				ire_refhold(ire);
1437 			} else {
1438 				ire = ire_reject(ipst, B_FALSE);
1439 			}
1440 			goto error;
1441 		}
1442 
1443 		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
1444 		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
1445 			goto error;
1446 
1447 		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
1448 		/*
1449 		 * Verify that the IRE_IF_CLONE has a consistent generation
1450 		 * number.
1451 		 */
1452 		if ((ire->ire_type & IRE_IF_CLONE) && !ire_clone_verify(ire)) {
1453 			ire_refrele(ire);
1454 			ire = NULL;
1455 			continue;
1456 		}
1457 
1458 		/*
1459 		 * Don't allow anything unusual past the first iteration.
1460 		 * After the first lookup, we should no longer look for
1461 		 * (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST) or RTF_INDIRECT
1462 		 * routes.
1463 		 *
1464 		 * In addition, after we have found a direct IRE_OFFLINK,
1465 		 * we should only look for interface or clone routes.
1466 		 */
1467 		match_args |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */
1468 
1469 		if ((ire->ire_type & IRE_OFFLINK) &&
1470 		    !(ire->ire_flags & RTF_INDIRECT)) {
1471 			ire_type = IRE_IF_ALL;
1472 		} else {
1473 			/*
1474 			 * no more local, loopback, broadcast routes
1475 			 */
1476 			if (!(match_args & MATCH_IRE_TYPE))
1477 				ire_type = (IRE_OFFLINK|IRE_ONLINK);
1478 			ire_type &= ~maskoff;
1479 		}
1480 		match_args |= MATCH_IRE_TYPE;
1481 
1482 		/* We have a usable IRE */
1483 		ires[i] = ire;
1484 		generations[i] = generation;
1485 		i++;
1486 
1487 		/* The first RTF_SETSRC address is passed back if setsrcp */
1488 		if ((ire->ire_flags & RTF_SETSRC) &&
1489 		    setsrcp != NULL && *setsrcp == INADDR_ANY) {
1490 			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
1491 			*setsrcp = ire->ire_setsrc_addr;
1492 		}
1493 
1494 		/* The first ire_gw_secattr is passed back if gwattrp */
1495 		if (ire->ire_gw_secattr != NULL &&
1496 		    gwattrp != NULL && *gwattrp == NULL)
1497 			*gwattrp = ire->ire_gw_secattr;
1498 
1499 		/*
1500 		 * Check if we have a short-cut pointer to an IRE for this
1501 		 * destination, and that the cached dependency isn't stale.
1502 		 * In that case we've rejoined an existing tree towards a
1503 		 * parent, thus we don't need to continue the loop to
1504 		 * discover the rest of the tree.
1505 		 */
1506 		mutex_enter(&ire->ire_lock);
1507 		if (ire->ire_dep_parent != NULL &&
1508 		    ire->ire_dep_parent->ire_generation ==
1509 		    ire->ire_dep_parent_generation) {
1510 			mutex_exit(&ire->ire_lock);
1511 			ire = NULL;
1512 			goto done;
1513 		}
1514 		mutex_exit(&ire->ire_lock);
1515 
1516 		/*
1517 		 * If this type should have an ire_nce_cache (even if it
1518 		 * doesn't yet have one) then we are done. Includes
1519 		 * IRE_INTERFACE with a full 32 bit mask.
1520 		 */
1521 		if (ire->ire_nce_capable) {
1522 			ire = NULL;
1523 			goto done;
1524 		}
1525 		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
1526 		/*
1527 		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
1528 		 * particular destination
1529 		 */
1530 		if (ire->ire_type & IRE_INTERFACE) {
1531 			in6_addr_t	v6nexthop;
1532 			ire_t		*clone;
1533 
1534 			ASSERT(ire->ire_masklen != IPV4_ABITS);
1535 
1536 			/*
1537 			 * In the case of ip_input and ILLF_FORWARDING not
1538 			 * being set, and in the case of RTM_GET, there is
1539 			 * no point in allocating an IRE_IF_CLONE. We return
1540 			 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
1541 			 * result in a ire_dep_parent which is IRE_IF_*
1542 			 * without an IRE_IF_CLONE.
1543 			 * We recover from that when we need to send packets
1544 			 * by ensuring that the generations become
1545 			 * IRE_GENERATION_VERIFY in this case.
1546 			 */
1547 			if (!(irr_flags & IRR_ALLOCATE)) {
1548 				invalidate = B_TRUE;
1549 				ire = NULL;
1550 				goto done;
1551 			}
1552 
1553 			IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);
1554 
1555 			clone = ire_create_if_clone(ire, &v6nexthop,
1556 			    &generation);
1557 			if (clone == NULL) {
1558 				/*
1559 				 * Temporary failure - no memory.
1560 				 * Don't want caller to cache IRE_NOROUTE.
1561 				 */
1562 				invalidate = B_TRUE;
1563 				ire = ire_blackhole(ipst, B_FALSE);
1564 				goto error;
1565 			}
1566 			/*
1567 			 * Make clone next to last entry and the
1568 			 * IRE_INTERFACE the last in the dependency
1569 			 * chain since the clone depends on the
1570 			 * IRE_INTERFACE.
1571 			 */
1572 			ASSERT(i >= 1);
1573 			ASSERT(i < MAX_IRE_RECURSION);
1574 
1575 			ires[i] = ires[i-1];
1576 			generations[i] = generations[i-1];
1577 			ires[i-1] = clone;
1578 			generations[i-1] = generation;
1579 			i++;
1580 
1581 			ire = NULL;
1582 			goto done;
1583 		}
1584 
1585 		/*
1586 		 * We only match on the type and optionally ILL when
1587 		 * recursing. The type match is used by some callers
1588 		 * to exclude certain types (such as IRE_IF_CLONE or
1589 		 * IRE_LOCAL|IRE_LOOPBACK).
1590 		 *
1591 		 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
1592 		 * ire->ire_ill, and we want to find the IRE_INTERFACE for
1593 		 * ire_ill, so we set ill to the ire_ill;
1594 		 */
1595 		match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT);
1596 		nexthop = ire->ire_gateway_addr;
1597 		if (ill == NULL && ire->ire_ill != NULL) {
1598 			ill = ire->ire_ill;
1599 			need_refrele = B_TRUE;
1600 			ill_refhold(ill);
1601 			match_args |= MATCH_IRE_ILL;
1602 		}
1603 		ire = NULL;
1604 	}
1605 	ASSERT(ire == NULL);
1606 	ire = ire_reject(ipst, B_FALSE);
1607 
1608 error:
1609 	ASSERT(ire != NULL);
1610 	if (need_refrele)
1611 		ill_refrele(ill);
1612 
1613 	/*
1614 	 * In the case of MULTIRT we want to try a different IRE the next
1615 	 * time. We let the next packet retry in that case.
1616 	 */
1617 	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
1618 		(void) ire_no_good(ires[0]);
1619 
1620 cleanup:
1621 	/* cleanup ires[i] */
1622 	ire_dep_unbuild(ires, i);
1623 	for (j = 0; j < i; j++)
1624 		ire_refrele(ires[j]);
1625 
1626 	ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1627 	    (irr_flags & IRR_INCOMPLETE));
1628 	/*
1629 	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
1630 	 * ip_select_route since the reject or lack of memory might be gone.
1631 	 */
1632 	if (generationp != NULL)
1633 		*generationp = IRE_GENERATION_VERIFY;
1634 	return (ire);
1635 
1636 done:
1637 	ASSERT(ire == NULL);
1638 	if (need_refrele) {
1639 		ill_refrele(ill);
1640 		ill = NULL;
1641 	}
1642 
1643 	/* Build dependencies */
1644 	if (i > 1 && !ire_dep_build(ires, generations, i)) {
1645 		/* Something in chain was condemned; tear it apart */
1646 		ire = ire_reject(ipst, B_FALSE);
1647 		goto cleanup;
1648 	}
1649 
1650 	/*
1651 	 * Release all refholds except the one for ires[0] that we
1652 	 * will return to the caller.
1653 	 */
1654 	for (j = 1; j < i; j++)
1655 		ire_refrele(ires[j]);
1656 
1657 	if (invalidate) {
1658 		/*
1659 		 * Since we needed to allocate but couldn't we need to make
1660 		 * sure that the dependency chain is rebuilt the next time.
1661 		 */
1662 		ire_dep_invalidate_generations(ires[0]);
1663 		generation = IRE_GENERATION_VERIFY;
1664 	} else {
1665 		/*
1666 		 * IREs can have been added or deleted while we did the
1667 		 * recursive lookup and we can't catch those until we've built
1668 		 * the dependencies. We verify the stored
1669 		 * ire_dep_parent_generation to catch any such changes and
1670 		 * return IRE_GENERATION_VERIFY (which will cause
1671 		 * ip_select_route to be called again so we can redo the
1672 		 * recursive lookup next time we send a packet.
1673 		 */
1674 		if (ires[0]->ire_dep_parent == NULL)
1675 			generation = ires[0]->ire_generation;
1676 		else
1677 			generation = ire_dep_validate_generations(ires[0]);
1678 		if (generations[0] != ires[0]->ire_generation) {
1679 			/* Something changed at the top */
1680 			generation = IRE_GENERATION_VERIFY;
1681 		}
1682 	}
1683 	if (generationp != NULL)
1684 		*generationp = generation;
1685 
1686 	return (ires[0]);
1687 }
1688 
1689 ire_t *
1690 ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
1691     zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1692     uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
1693     tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1694 {
1695 	return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
1696 	    zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
1697 	    gwattrp, generationp));
1698 }
1699 
1700 /*
1701  * Recursively look for a route to the destination.
1702  * We only handle a destination match here, yet we have the same arguments
1703  * as the full match to allow function pointers to select between the two.
1704  *
1705  * Note that this function never returns NULL. It returns an IRE_NOROUTE
1706  * instead.
1707  *
1708  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1709  * is an error.
1710  * Allow at most one RTF_INDIRECT.
1711  */
1712 ire_t *
1713 ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags,
1714     uint32_t xmit_hint, ip_stack_t *ipst)
1715 {
1716 	ire_t	*ire;
1717 	ire_t	*ire1;
1718 	uint_t	generation;
1719 
1720 	/* ire_ftable_lookup handles round-robin/ECMP */
1721 	ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst,
1722 	    &generation);
1723 	ASSERT(ire != NULL);
1724 	/*
1725 	 * If the IRE has a current cached parent we know that the whole
1726 	 * parent chain is current, hence we don't need to discover and
1727 	 * build any dependencies by doing a recursive lookup.
1728 	 */
1729 	mutex_enter(&ire->ire_lock);
1730 	if (ire->ire_dep_parent != NULL) {
1731 		if (ire->ire_dep_parent->ire_generation ==
1732 		    ire->ire_dep_parent_generation) {
1733 			mutex_exit(&ire->ire_lock);
1734 			return (ire);
1735 		}
1736 		mutex_exit(&ire->ire_lock);
1737 	} else {
1738 		mutex_exit(&ire->ire_lock);
1739 		/*
1740 		 * If this type should have an ire_nce_cache (even if it
1741 		 * doesn't yet have one) then we are done. Includes
1742 		 * IRE_INTERFACE with a full 32 bit mask.
1743 		 */
1744 		if (ire->ire_nce_capable)
1745 			return (ire);
1746 	}
1747 
1748 	/*
1749 	 * Fallback to loop in the normal code starting with the ire
1750 	 * we found. Normally this would return the same ire.
1751 	 */
1752 	ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES,
1753 	    NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL,
1754 	    &generation);
1755 	ire_refrele(ire);
1756 	return (ire1);
1757 }
1758 
1759 /*
1760  * Verify that the generation numbers in the chain leading to an IRE_IF_CLONE
1761  * are consistent. Return FALSE (and delete the IRE_IF_CLONE) if they
1762  * are not consistent, and TRUE otherwise.
1763  */
1764 boolean_t
1765 ire_clone_verify(ire_t *ire)
1766 {
1767 	ASSERT((ire->ire_type & IRE_IF_CLONE) != 0);
1768 	mutex_enter(&ire->ire_lock);
1769 	if (ire->ire_dep_parent != NULL &&
1770 	    ire->ire_dep_parent->ire_generation !=
1771 	    ire->ire_dep_parent_generation) {
1772 		mutex_exit(&ire->ire_lock);
1773 		ire_delete(ire);
1774 		return (B_FALSE);
1775 	}
1776 	mutex_exit(&ire->ire_lock);
1777 	return (B_TRUE);
1778 }
1779