xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_ftable.c (revision 2dea4eed7ad1c66ae4770263aa2911815a8b86eb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * This file contains consumer routines of the IPv4 forwarding engine
28  */
29 
30 #include <sys/types.h>
31 #include <sys/stream.h>
32 #include <sys/stropts.h>
33 #include <sys/strlog.h>
34 #include <sys/dlpi.h>
35 #include <sys/ddi.h>
36 #include <sys/cmn_err.h>
37 #include <sys/policy.h>
38 
39 #include <sys/systm.h>
40 #include <sys/strsun.h>
41 #include <sys/kmem.h>
42 #include <sys/param.h>
43 #include <sys/socket.h>
44 #include <sys/strsubr.h>
45 #include <net/if.h>
46 #include <net/route.h>
47 #include <netinet/in.h>
48 #include <net/if_dl.h>
49 #include <netinet/ip6.h>
50 #include <netinet/icmp6.h>
51 
52 #include <inet/ipsec_impl.h>
53 #include <inet/common.h>
54 #include <inet/mi.h>
55 #include <inet/mib2.h>
56 #include <inet/ip.h>
57 #include <inet/ip_impl.h>
58 #include <inet/ip6.h>
59 #include <inet/ip_ndp.h>
60 #include <inet/arp.h>
61 #include <inet/ip_if.h>
62 #include <inet/ip_ire.h>
63 #include <inet/ip_ftable.h>
64 #include <inet/ip_rts.h>
65 #include <inet/nd.h>
66 
67 #include <net/pfkeyv2.h>
68 #include <inet/sadb.h>
69 #include <inet/tcp.h>
70 #include <inet/ipclassifier.h>
71 #include <sys/zone.h>
72 #include <net/radix.h>
73 #include <sys/tsol/label.h>
74 #include <sys/tsol/tnet.h>
75 
/*
 * Evaluates to non-zero when the ire acts as a default route: it is either
 * an IRE_DEFAULT, or an IRE_INTERFACE whose destination address (ire_addr)
 * is 0. Used below to decide whether ECMP applies when ips_ip_ecmp_behavior
 * is 1.
 */
#define	IS_DEFAULT_ROUTE(ire)						\
	((((ire)->ire_type & IRE_DEFAULT) != 0) ||			\
	((((ire)->ire_type & IRE_INTERFACE) != 0) && ((ire)->ire_addr == 0)))
79 
/*
 * Forward declarations for helpers that are defined further down in this
 * file but referenced before their definitions.
 */
static ire_t	*route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
static void	ire_del_host_redir(ire_t *, char *);
static boolean_t ire_find_best_route(struct radix_node *, void *);
83 
84 /*
85  * Lookup a route in forwarding table. A specific lookup is indicated by
86  * passing the required parameters and indicating the match required in the
87  * flag field.
88  *
89  * Supports IP_BOUND_IF by following the ipif/ill when recursing.
90  */
91 ire_t *
92 ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
93     int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
94     int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
95 {
96 	ire_t *ire;
97 	struct rt_sockaddr rdst, rmask;
98 	struct rt_entry *rt;
99 	ire_ftable_args_t margs;
100 
101 	ASSERT(ill == NULL || !ill->ill_isv6);
102 
103 	/*
104 	 * ire_match_args() will dereference ill if MATCH_IRE_ILL
105 	 * is set.
106 	 */
107 	if ((flags & MATCH_IRE_ILL) && (ill == NULL))
108 		return (NULL);
109 
110 	bzero(&rdst, sizeof (rdst));
111 	rdst.rt_sin_len = sizeof (rdst);
112 	rdst.rt_sin_family = AF_INET;
113 	rdst.rt_sin_addr.s_addr = addr;
114 
115 	bzero(&rmask, sizeof (rmask));
116 	rmask.rt_sin_len = sizeof (rmask);
117 	rmask.rt_sin_family = AF_INET;
118 	rmask.rt_sin_addr.s_addr = mask;
119 
120 	bzero(&margs, sizeof (margs));
121 	margs.ift_addr = addr;
122 	margs.ift_mask = mask;
123 	margs.ift_gateway = gateway;
124 	margs.ift_type = type;
125 	margs.ift_ill = ill;
126 	margs.ift_zoneid = zoneid;
127 	margs.ift_tsl = tsl;
128 	margs.ift_flags = flags;
129 
130 	/*
131 	 * The flags argument passed to ire_ftable_lookup may cause the
132 	 * search to return, not the longest matching prefix, but the
133 	 * "best matching prefix", i.e., the longest prefix that also
134 	 * satisfies constraints imposed via the permutation of flags
135 	 * passed in. To achieve this, we invoke ire_match_args() on
136 	 * each matching leaf in the  radix tree. ire_match_args is
137 	 * invoked by the callback function ire_find_best_route()
138 	 * We hold the global tree lock in read mode when calling
139 	 * rn_match_args. Before dropping the global tree lock, ensure
140 	 * that the radix node can't be deleted by incrementing ire_refcnt.
141 	 */
142 	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
143 	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
144 	    ipst->ips_ip_ftable, ire_find_best_route, &margs);
145 	ire = margs.ift_best_ire;
146 	if (rt == NULL) {
147 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
148 		return (NULL);
149 	}
150 	ASSERT(ire != NULL);
151 
152 	DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);
153 
154 	/*
155 	 * round-robin only if we have more than one route in the bucket.
156 	 * ips_ip_ecmp_behavior controls when we do ECMP
157 	 *	2:	always
158 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
159 	 *	0:	never
160 	 */
161 	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
162 		if (ipst->ips_ip_ecmp_behavior == 2 ||
163 		    (ipst->ips_ip_ecmp_behavior == 1 &&
164 		    IS_DEFAULT_ROUTE(ire))) {
165 			ire_t	*next_ire;
166 
167 			margs.ift_best_ire = NULL;
168 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
169 			    xmit_hint, ire, ipst);
170 			if (next_ire == NULL) {
171 				/* keep ire if next_ire is null */
172 				goto done;
173 			}
174 			ire_refrele(ire);
175 			ire = next_ire;
176 		}
177 	}
178 
179 done:
180 	/* Return generation before dropping lock */
181 	if (generationp != NULL)
182 		*generationp = ire->ire_generation;
183 
184 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
185 
186 	/*
187 	 * For shared-IP zones we need additional checks to what was
188 	 * done in ire_match_args to make sure IRE_LOCALs are handled.
189 	 *
190 	 * When ip_restrict_interzone_loopback is set, then
191 	 * we ensure that IRE_LOCAL are only used for loopback
192 	 * between zones when the logical "Ethernet" would
193 	 * have looped them back. That is, if in the absense of
194 	 * the IRE_LOCAL we would have sent to packet out the
195 	 * same ill.
196 	 */
197 	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
198 	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
199 	    ipst->ips_ip_restrict_interzone_loopback) {
200 		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
201 		ASSERT(ire != NULL);
202 	}
203 	return (ire);
204 }
205 
206 /*
207  * This function is called by
208  * ip_input/ire_route_recursive when doing a route lookup on only the
209  * destination address.
210  *
211  * The optimizations of this function over ire_ftable_lookup are:
212  *	o removing unnecessary flag matching
213  *	o doing longest prefix match instead of overloading it further
214  *	  with the unnecessary "best_prefix_match"
215  *
216  * If no route is found we return IRE_NOROUTE.
217  */
218 ire_t *
219 ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst,
220     uint_t *generationp)
221 {
222 	ire_t *ire;
223 	struct rt_sockaddr rdst;
224 	struct rt_entry *rt;
225 	irb_t *irb;
226 
227 	rdst.rt_sin_len = sizeof (rdst);
228 	rdst.rt_sin_family = AF_INET;
229 	rdst.rt_sin_addr.s_addr = addr;
230 
231 	/*
232 	 * This is basically inlining  a simpler version of ire_match_args
233 	 */
234 	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
235 
236 	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
237 	    ipst->ips_ip_ftable, NULL, NULL);
238 
239 	if (rt == NULL)
240 		goto bad;
241 
242 	irb = &rt->rt_irb;
243 	if (irb->irb_ire_cnt == 0)
244 		goto bad;
245 
246 	rw_enter(&irb->irb_lock, RW_READER);
247 	ire = irb->irb_ire;
248 	if (ire == NULL) {
249 		rw_exit(&irb->irb_lock);
250 		goto bad;
251 	}
252 	while (IRE_IS_CONDEMNED(ire)) {
253 		ire = ire->ire_next;
254 		if (ire == NULL) {
255 			rw_exit(&irb->irb_lock);
256 			goto bad;
257 		}
258 	}
259 
260 	/* we have a ire that matches */
261 	ire_refhold(ire);
262 	rw_exit(&irb->irb_lock);
263 
264 	/*
265 	 * round-robin only if we have more than one route in the bucket.
266 	 * ips_ip_ecmp_behavior controls when we do ECMP
267 	 *	2:	always
268 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
269 	 *	0:	never
270 	 *
271 	 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
272 	 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
273 	 * and the IRE_INTERFACESs are likely to be shorter matches.
274 	 */
275 	if (ire->ire_bucket->irb_ire_cnt > 1) {
276 		if (ipst->ips_ip_ecmp_behavior == 2 ||
277 		    (ipst->ips_ip_ecmp_behavior == 1 &&
278 		    IS_DEFAULT_ROUTE(ire))) {
279 			ire_t	*next_ire;
280 			ire_ftable_args_t margs;
281 
282 			bzero(&margs, sizeof (margs));
283 			margs.ift_addr = addr;
284 			margs.ift_zoneid = ALL_ZONES;
285 
286 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
287 			    xmit_hint, ire, ipst);
288 			if (next_ire == NULL) {
289 				/* keep ire if next_ire is null */
290 				if (generationp != NULL)
291 					*generationp = ire->ire_generation;
292 				RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
293 				return (ire);
294 			}
295 			ire_refrele(ire);
296 			ire = next_ire;
297 		}
298 	}
299 	/* Return generation before dropping lock */
300 	if (generationp != NULL)
301 		*generationp = ire->ire_generation;
302 
303 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
304 
305 	/*
306 	 * Since we only did ALL_ZONES matches there is no special handling
307 	 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
308 	 */
309 	return (ire);
310 
311 bad:
312 	if (generationp != NULL)
313 		*generationp = IRE_GENERATION_VERIFY;
314 
315 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
316 	return (ire_reject(ipst, B_FALSE));
317 }
318 
319 /*
320  * Find the ill matching a multicast group.
321  * Allows different routes for multicast addresses
322  * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
323  * which point at different interfaces. This is used when IP_MULTICAST_IF
324  * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
325  * specify the interface to join on.
326  *
327  * Supports link-local addresses by using ire_route_recursive which follows
328  * the ill when recursing.
329  *
330  * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
331  * and the MULTIRT property can be different for different groups, we
332  * extract RTF_MULTIRT from the special unicast route added for a group
333  * with CGTP and pass that back in the multirtp argument.
334  * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
335  * We have a setsrcp argument for the same reason.
336  */
337 ill_t *
338 ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
339     boolean_t *multirtp, ipaddr_t *setsrcp)
340 {
341 	ire_t	*ire;
342 	ill_t	*ill;
343 
344 	ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL,
345 	    MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL);
346 	ASSERT(ire != NULL);
347 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
348 		ire_refrele(ire);
349 		return (NULL);
350 	}
351 
352 	if (multirtp != NULL)
353 		*multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
354 
355 	ill = ire_nexthop_ill(ire);
356 	ire_refrele(ire);
357 	return (ill);
358 }
359 
360 /*
361  * Delete the passed in ire if the gateway addr matches
362  */
363 void
364 ire_del_host_redir(ire_t *ire, char *gateway)
365 {
366 	if ((ire->ire_flags & RTF_DYNAMIC) &&
367 	    (ire->ire_gateway_addr == *(ipaddr_t *)gateway))
368 		ire_delete(ire);
369 }
370 
371 /*
372  * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
373  * pointing at the specified gateway and
374  * delete them. This routine is called only
375  * when a default gateway is going away.
376  */
377 void
378 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
379 {
380 	struct rtfuncarg rtfarg;
381 
382 	bzero(&rtfarg, sizeof (rtfarg));
383 	rtfarg.rt_func = ire_del_host_redir;
384 	rtfarg.rt_arg = (void *)&gateway;
385 	rtfarg.rt_zoneid = ALL_ZONES;
386 	rtfarg.rt_ipst = ipst;
387 	(void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
388 	    rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
389 }
390 
391 /*
392  * Obtain the rt_entry and rt_irb for the route to be added to
393  * the ips_ip_ftable.
394  * First attempt to add a node to the radix tree via rn_addroute. If the
395  * route already exists, return the bucket for the existing route.
396  *
397  * Locking notes: Need to hold the global radix tree lock in write mode to
398  * add a radix node. To prevent the node from being deleted, ire_get_bucket()
399  * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
400  * while holding the irb_lock, but not the radix tree lock.
401  */
402 irb_t *
403 ire_get_bucket(ire_t *ire)
404 {
405 	struct radix_node *rn;
406 	struct rt_entry *rt;
407 	struct rt_sockaddr rmask, rdst;
408 	irb_t *irb = NULL;
409 	ip_stack_t *ipst = ire->ire_ipst;
410 
411 	ASSERT(ipst->ips_ip_ftable != NULL);
412 
413 	/* first try to see if route exists (based on rtalloc1) */
414 	bzero(&rdst, sizeof (rdst));
415 	rdst.rt_sin_len = sizeof (rdst);
416 	rdst.rt_sin_family = AF_INET;
417 	rdst.rt_sin_addr.s_addr = ire->ire_addr;
418 
419 	bzero(&rmask, sizeof (rmask));
420 	rmask.rt_sin_len = sizeof (rmask);
421 	rmask.rt_sin_family = AF_INET;
422 	rmask.rt_sin_addr.s_addr = ire->ire_mask;
423 
424 	/*
425 	 * add the route. based on BSD's rtrequest1(RTM_ADD)
426 	 */
427 	R_Malloc(rt, rt_entry_cache,  sizeof (*rt));
428 	/* kmem_alloc failed */
429 	if (rt == NULL)
430 		return (NULL);
431 
432 	bzero(rt, sizeof (*rt));
433 	rt->rt_nodes->rn_key = (char *)&rt->rt_dst;
434 	rt->rt_dst = rdst;
435 	irb = &rt->rt_irb;
436 	irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */
437 	irb->irb_ipst = ipst;
438 	rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL);
439 	RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
440 	rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask,
441 	    ipst->ips_ip_ftable, (struct radix_node *)rt);
442 	if (rn == NULL) {
443 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
444 		Free(rt, rt_entry_cache);
445 		rt = NULL;
446 		irb = NULL;
447 		RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
448 		rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask,
449 		    ipst->ips_ip_ftable);
450 		if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
451 			/* found a non-root match */
452 			rt = (struct rt_entry *)rn;
453 		}
454 	}
455 	if (rt != NULL) {
456 		irb = &rt->rt_irb;
457 		irb_refhold(irb);
458 	}
459 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
460 	return (irb);
461 }
462 
463 /*
464  * This function is used when the caller wants to know the outbound
465  * interface for a packet given only the address.
466  * If this is a offlink IP address and there are multiple
467  * routes to this destination, this routine will utilise the
468  * first route it finds to IP address
469  * Return values:
470  * 	0	- FAILURE
471  *	nonzero	- ifindex
472  */
473 uint_t
474 ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
475 {
476 	uint_t ifindex = 0;
477 	ire_t *ire;
478 	ill_t *ill;
479 	netstack_t *ns;
480 	ip_stack_t *ipst;
481 
482 	if (zoneid == ALL_ZONES)
483 		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
484 	else
485 		ns = netstack_find_by_zoneid(zoneid);
486 	ASSERT(ns != NULL);
487 
488 	/*
489 	 * For exclusive stacks we set the zoneid to zero
490 	 * since IP uses the global zoneid in the exclusive stacks.
491 	 */
492 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
493 		zoneid = GLOBAL_ZONEID;
494 	ipst = ns->netstack_ip;
495 
496 	ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6);
497 
498 	if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) {
499 		ill = ire_nexthop_ill(ire);
500 		if (ill != NULL) {
501 			ifindex = ill->ill_phyint->phyint_ifindex;
502 			ill_refrele(ill);
503 		}
504 		ire_refrele(ire);
505 	}
506 	netstack_rele(ns);
507 	return (ifindex);
508 }
509 
510 /*
511  * Routine to find the route to a destination. If a ifindex is supplied
512  * it tries to match the route to the corresponding ipif for the ifindex
513  */
514 static	ire_t *
515 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
516 {
517 	ire_t *ire = NULL;
518 	int match_flags;
519 
520 	match_flags = MATCH_IRE_DSTONLY;
521 
522 	/* XXX pass NULL tsl for now */
523 
524 	if (dst_addr->sa_family == AF_INET) {
525 		ire = ire_route_recursive_v4(
526 		    ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL,
527 		    zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
528 		    NULL, NULL);
529 	} else {
530 		ire = ire_route_recursive_v6(
531 		    &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL,
532 		    zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
533 		    NULL, NULL);
534 	}
535 	ASSERT(ire != NULL);
536 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
537 		ire_refrele(ire);
538 		return (NULL);
539 	}
540 	return (ire);
541 }
542 
543 /*
544  * This routine is called by IP Filter to send a packet out on the wire
545  * to a specified dstination (which may be onlink or offlink). The ifindex may
546  * or may not be 0. A non-null ifindex indicates IP Filter has stipulated
547  * an outgoing interface and requires the nexthop to be on that interface.
548  * IP WILL NOT DO the following to the data packet before sending it out:
549  *	a. manipulate ttl
550  *	b. ipsec work
551  *	c. fragmentation
552  *
553  * If the packet has been prepared for hardware checksum then it will be
554  * passed off to ip_send_align_cksum() to check that the flags set on the
555  * packet are in alignment with the capabilities of the new outgoing NIC.
556  *
557  * Return values:
558  *	0:		IP was able to send of the data pkt
559  *	ECOMM:		Could not send packet
560  *	ENONET		No route to dst. It is up to the caller
561  *			to send icmp unreachable error message,
562  *	EINPROGRESS	The macaddr of the onlink dst or that
563  *			of the offlink dst's nexthop needs to get
564  *			resolved before packet can be sent to dst.
565  *			Thus transmission is not guaranteed.
566  *			Note: No longer have visibility to the ARP queue
567  *			hence no EINPROGRESS.
568  */
569 int
570 ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
571     zoneid_t zoneid)
572 {
573 	ipaddr_t nexthop;
574 	netstack_t *ns;
575 	ip_stack_t *ipst;
576 	ip_xmit_attr_t ixas;
577 	int error;
578 
579 	ASSERT(mp != NULL);
580 
581 	if (zoneid == ALL_ZONES)
582 		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
583 	else
584 		ns = netstack_find_by_zoneid(zoneid);
585 	ASSERT(ns != NULL);
586 
587 	/*
588 	 * For exclusive stacks we set the zoneid to zero
589 	 * since IP uses the global zoneid in the exclusive stacks.
590 	 */
591 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
592 		zoneid = GLOBAL_ZONEID;
593 	ipst = ns->netstack_ip;
594 
595 	ASSERT(dst_addr->sa_family == AF_INET ||
596 	    dst_addr->sa_family == AF_INET6);
597 
598 	bzero(&ixas, sizeof (ixas));
599 	/*
600 	 * No IPsec, no fragmentation, and don't let any hooks see
601 	 * the packet.
602 	 */
603 	ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK;
604 	ixas.ixa_cred = kcred;
605 	ixas.ixa_cpid = NOPID;
606 	ixas.ixa_tsl = NULL;
607 	ixas.ixa_ipst = ipst;
608 	ixas.ixa_ifindex = ifindex;
609 
610 	if (dst_addr->sa_family == AF_INET) {
611 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
612 
613 		ixas.ixa_flags |= IXAF_IS_IPV4;
614 		nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
615 		if (nexthop != ipha->ipha_dst) {
616 			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
617 			ixas.ixa_nexthop_v4 = nexthop;
618 		}
619 		ixas.ixa_multicast_ttl = ipha->ipha_ttl;
620 	} else {
621 		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
622 		in6_addr_t *nexthop6;
623 
624 		nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr;
625 		if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) {
626 			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
627 			ixas.ixa_nexthop_v6 = *nexthop6;
628 		}
629 		ixas.ixa_multicast_ttl = ip6h->ip6_hops;
630 	}
631 	error = ip_output_simple(mp, &ixas);
632 	ixa_cleanup(&ixas);
633 
634 	netstack_rele(ns);
635 	switch (error) {
636 	case 0:
637 		break;
638 
639 	case EHOSTUNREACH:
640 	case ENETUNREACH:
641 		error = ENONET;
642 		break;
643 
644 	default:
645 		error = ECOMM;
646 		break;
647 	}
648 	return (error);
649 }
650 
651 /*
652  * callback function provided by ire_ftable_lookup when calling
653  * rn_match_args(). Invoke ire_match_args on each matching leaf node in
654  * the radix tree.
655  */
656 boolean_t
657 ire_find_best_route(struct radix_node *rn, void *arg)
658 {
659 	struct rt_entry *rt = (struct rt_entry *)rn;
660 	irb_t *irb_ptr;
661 	ire_t *ire;
662 	ire_ftable_args_t *margs = arg;
663 	ipaddr_t match_mask;
664 
665 	ASSERT(rt != NULL);
666 
667 	irb_ptr = &rt->rt_irb;
668 
669 	if (irb_ptr->irb_ire_cnt == 0)
670 		return (B_FALSE);
671 
672 	rw_enter(&irb_ptr->irb_lock, RW_READER);
673 	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
674 		if (IRE_IS_CONDEMNED(ire))
675 			continue;
676 		if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK))
677 			match_mask = margs->ift_mask;
678 		else
679 			match_mask = ire->ire_mask;
680 
681 		if (ire_match_args(ire, margs->ift_addr, match_mask,
682 		    margs->ift_gateway, margs->ift_type, margs->ift_ill,
683 		    margs->ift_zoneid, margs->ift_tsl,
684 		    margs->ift_flags)) {
685 			ire_refhold(ire);
686 			rw_exit(&irb_ptr->irb_lock);
687 			margs->ift_best_ire = ire;
688 			return (B_TRUE);
689 		}
690 	}
691 	rw_exit(&irb_ptr->irb_lock);
692 	return (B_FALSE);
693 }
694 
695 /*
696  * ftable irb_t structures are dynamically allocated, and we need to
697  * check if the irb_t (and associated ftable tree attachment) needs to
698  * be cleaned up when the irb_refcnt goes to 0. The conditions that need
699  * be verified are:
700  * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
701  * - no other threads holding references to ire's in the bucket,
702  *   i.e., irb_nire == 0
703  * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
704  * - need to hold the global tree lock and irb_lock in write mode.
705  */
706 void
707 irb_refrele_ftable(irb_t *irb)
708 {
709 	for (;;) {
710 		rw_enter(&irb->irb_lock, RW_WRITER);
711 		ASSERT(irb->irb_refcnt != 0);
712 		if (irb->irb_refcnt != 1) {
713 			/*
714 			 * Someone has a reference to this radix node
715 			 * or there is some bucket walker.
716 			 */
717 			irb->irb_refcnt--;
718 			rw_exit(&irb->irb_lock);
719 			return;
720 		} else {
721 			/*
722 			 * There is no other walker, nor is there any
723 			 * other thread that holds a direct ref to this
724 			 * radix node. Do the clean up if needed. Call
725 			 * to ire_unlink will clear the IRB_MARK_CONDEMNED flag
726 			 */
727 			if (irb->irb_marks & IRB_MARK_CONDEMNED)  {
728 				ire_t *ire_list;
729 
730 				ire_list = ire_unlink(irb);
731 				rw_exit(&irb->irb_lock);
732 
733 				if (ire_list != NULL)
734 					ire_cleanup(ire_list);
735 				/*
736 				 * more CONDEMNED entries could have
737 				 * been added while we dropped the lock,
738 				 * so we have to re-check.
739 				 */
740 				continue;
741 			}
742 
743 			/*
744 			 * Now check if there are still any ires
745 			 * associated with this radix node.
746 			 */
747 			if (irb->irb_nire != 0) {
748 				/*
749 				 * someone is still holding on
750 				 * to ires in this bucket
751 				 */
752 				irb->irb_refcnt--;
753 				rw_exit(&irb->irb_lock);
754 				return;
755 			} else {
756 				/*
757 				 * Everything is clear. Zero walkers,
758 				 * Zero threads with a ref to this
759 				 * radix node, Zero ires associated with
760 				 * this radix node. Due to lock order,
761 				 * check the above conditions again
762 				 * after grabbing all locks in the right order
763 				 */
764 				rw_exit(&irb->irb_lock);
765 				if (irb_inactive(irb))
766 					return;
767 				/*
768 				 * irb_inactive could not free the irb.
769 				 * See if there are any walkers, if not
770 				 * try to clean up again.
771 				 */
772 			}
773 		}
774 	}
775 }
776 
777 /*
778  * IRE iterator used by ire_ftable_lookup to process multiple equal
779  * routes. Given a starting point in the hash list (hash), walk the IREs
780  * in the bucket skipping deleted entries. We treat the bucket as a circular
781  * list for the purposes of walking it.
782  * Returns the IRE (held) that corresponds to the hash value. If that IRE is
783  * not applicable (ire_match_args failed) then it returns a subsequent one.
784  * If we fail to find an IRE we return NULL.
785  *
786  * Assumes that the caller holds a reference on the IRE bucket and a read lock
787  * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
788  *
789  * Applies to IPv4 and IPv6.
790  *
791  * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same
792  * address and bucket, we compare against ire_type for the orig_ire. We also
793  * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being
794  * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire.
795  *
796  * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is
797  * reachable from the zone i.e., that the ire_gateway_addr is in a subnet
798  * in which the zone has an IP address. We check this for the global zone
799  * even if no shared-IP zones are configured.
800  */
801 ire_t *
802 ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash,
803     ire_t *orig_ire, ip_stack_t *ipst)
804 {
805 	ire_t		*ire, *maybe_ire = NULL;
806 	uint_t		maybe_badcnt;
807 	uint_t		maxwalk;
808 
809 	/* Fold in more bits from the hint/hash */
810 	hash = hash ^ (hash >> 8) ^ (hash >> 16);
811 
812 	rw_enter(&irb_ptr->irb_lock, RW_WRITER);
813 	maxwalk = irb_ptr->irb_ire_cnt;	/* Excludes condemned */
814 	hash %= maxwalk;
815 	irb_refhold_locked(irb_ptr);
816 	rw_exit(&irb_ptr->irb_lock);
817 
818 	/*
819 	 * Round-robin the routers list looking for a route that
820 	 * matches the passed in parameters.
821 	 * First we skip "hash" number of non-condemned IREs.
822 	 * Then we match the IRE.
823 	 * If we find an ire which has a non-zero ire_badcnt then we remember
824 	 * it and keep on looking for a lower ire_badcnt.
825 	 * If we come to the end of the list we continue (treat the
826 	 * bucket list as a circular list) but we match less than "max"
827 	 * entries.
828 	 */
829 	ire = irb_ptr->irb_ire;
830 	while (maxwalk > 0) {
831 		if (IRE_IS_CONDEMNED(ire))
832 			goto next_ire_skip;
833 
834 		/* Skip the first "hash" entries to do ECMP */
835 		if (hash != 0) {
836 			hash--;
837 			goto next_ire_skip;
838 		}
839 
840 		/* See CGTP comment above */
841 		if (ire->ire_type != orig_ire->ire_type ||
842 		    ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0)
843 			goto next_ire;
844 
845 		/*
846 		 * Note: Since IPv6 has hash buckets instead of radix
847 		 * buckers we need to explicitly compare the addresses.
848 		 * That makes this less efficient since we will be called
849 		 * even if there is no alternatives just because the
850 		 * bucket has multiple IREs for different addresses.
851 		 */
852 		if (ire->ire_ipversion == IPV6_VERSION) {
853 			if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6,
854 			    &ire->ire_addr_v6))
855 				goto next_ire;
856 		}
857 
858 		/*
859 		 * For some reason find_best_route uses ire_mask. We do
860 		 * the same.
861 		 */
862 		if (ire->ire_ipversion == IPV4_VERSION ?
863 		    !ire_match_args(ire, margs->ift_addr,
864 		    ire->ire_mask, margs->ift_gateway,
865 		    margs->ift_type, margs->ift_ill, margs->ift_zoneid,
866 		    margs->ift_tsl, margs->ift_flags) :
867 		    !ire_match_args_v6(ire, &margs->ift_addr_v6,
868 		    &ire->ire_mask_v6, &margs->ift_gateway_v6,
869 		    margs->ift_type, margs->ift_ill, margs->ift_zoneid,
870 		    margs->ift_tsl, margs->ift_flags))
871 			goto next_ire;
872 
873 		if (margs->ift_zoneid != ALL_ZONES &&
874 		    (ire->ire_type & IRE_OFFLINK)) {
875 			/*
876 			 * When we're in a zone, we're only
877 			 * interested in routers that are
878 			 * reachable through ipifs within our zone.
879 			 */
880 			if (ire->ire_ipversion == IPV4_VERSION) {
881 				if (!ire_gateway_ok_zone_v4(
882 				    ire->ire_gateway_addr, margs->ift_zoneid,
883 				    ire->ire_ill, margs->ift_tsl, ipst,
884 				    B_TRUE))
885 					goto next_ire;
886 			} else {
887 				if (!ire_gateway_ok_zone_v6(
888 				    &ire->ire_gateway_addr_v6,
889 				    margs->ift_zoneid, ire->ire_ill,
890 				    margs->ift_tsl, ipst, B_TRUE))
891 					goto next_ire;
892 			}
893 		}
894 		mutex_enter(&ire->ire_lock);
895 		/* Look for stale ire_badcnt and clear */
896 		if (ire->ire_badcnt != 0 &&
897 		    (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt >
898 		    ipst->ips_ip_ire_badcnt_lifetime))
899 			ire->ire_badcnt = 0;
900 		mutex_exit(&ire->ire_lock);
901 
902 		if (ire->ire_badcnt == 0) {
903 			/* We found one with a zero badcnt; done */
904 			ire_refhold(ire);
905 			/*
906 			 * Care needed since irb_refrele grabs WLOCK to free
907 			 * the irb_t.
908 			 */
909 			if (ire->ire_ipversion == IPV4_VERSION) {
910 				RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
911 				irb_refrele(irb_ptr);
912 				RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
913 			} else {
914 				rw_exit(&ipst->ips_ip6_ire_head_lock);
915 				irb_refrele(irb_ptr);
916 				rw_enter(&ipst->ips_ip6_ire_head_lock,
917 				    RW_READER);
918 			}
919 			return (ire);
920 		}
921 		/*
922 		 * keep looking to see if there is a better (lower
923 		 * badcnt) matching IRE, but save this one as a last resort.
924 		 * If we find a lower badcnt pick that one as the last* resort.
925 		 */
926 		if (maybe_ire == NULL) {
927 			maybe_ire = ire;
928 			maybe_badcnt = ire->ire_badcnt;
929 		} else if (ire->ire_badcnt < maybe_badcnt) {
930 			maybe_ire = ire;
931 			maybe_badcnt = ire->ire_badcnt;
932 		}
933 
934 next_ire:
935 		maxwalk--;
936 next_ire_skip:
937 		ire = ire->ire_next;
938 		if (ire == NULL)
939 			ire = irb_ptr->irb_ire;
940 	}
941 	if (maybe_ire != NULL)
942 		ire_refhold(maybe_ire);
943 
944 	/* Care needed since irb_refrele grabs WLOCK to free the irb_t. */
945 	if (ire->ire_ipversion == IPV4_VERSION) {
946 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
947 		irb_refrele(irb_ptr);
948 		RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
949 	} else {
950 		rw_exit(&ipst->ips_ip6_ire_head_lock);
951 		irb_refrele(irb_ptr);
952 		rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
953 	}
954 	return (maybe_ire);
955 }
956 
957 void
958 irb_refhold_rn(struct radix_node *rn)
959 {
960 	if ((rn->rn_flags & RNF_ROOT) == 0)
961 		irb_refhold(&((rt_t *)(rn))->rt_irb);
962 }
963 
964 void
965 irb_refrele_rn(struct radix_node *rn)
966 {
967 	if ((rn->rn_flags & RNF_ROOT) == 0)
968 		irb_refrele_ftable(&((rt_t *)(rn))->rt_irb);
969 }
970 
971 /*
972  * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
973  * routes this routine sets up a ire_nce_cache as well. The caller needs to
974  * lookup an nce for the multicast case.
975  */
976 ire_t *
977 ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa,
978     uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp)
979 {
980 	uint_t		match_args;
981 	uint_t		ire_type;
982 	ill_t		*ill;
983 	ire_t		*ire;
984 	ip_stack_t	*ipst = ixa->ixa_ipst;
985 	ipaddr_t	v4dst;
986 	in6_addr_t	v6nexthop;
987 	iaflags_t	ixaflags = ixa->ixa_flags;
988 	nce_t		*nce;
989 
990 	match_args = MATCH_IRE_SECATTR;
991 	IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
992 	if (setsrcp != NULL)
993 		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
994 	if (errorp != NULL)
995 		ASSERT(*errorp == 0);
996 
997 	/*
998 	 * The content of the ixa will be different if IP_NEXTHOP,
999 	 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
1000 	 */
1001 
1002 	if ((ixaflags & IXAF_IS_IPV4) ? CLASSD(v4dst) :
1003 	    IN6_IS_ADDR_MULTICAST(v6dst)) {
1004 		/* Pick up the IRE_MULTICAST for the ill */
1005 		if (ixa->ixa_multicast_ifindex != 0) {
1006 			ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
1007 			    !(ixaflags & IXAF_IS_IPV4), ipst);
1008 		} else if (ixaflags & IXAF_SCOPEID_SET) {
1009 			/* sin6_scope_id takes precedence over ixa_ifindex */
1010 			ASSERT(ixa->ixa_scopeid != 0);
1011 			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
1012 			    !(ixaflags & IXAF_IS_IPV4), ipst);
1013 		} else if (ixa->ixa_ifindex != 0) {
1014 			/*
1015 			 * In the ipmp case, the ixa_ifindex is set to
1016 			 * point at an under_ill and we would return the
1017 			 * ire_multicast() corresponding to that under_ill.
1018 			 */
1019 			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
1020 			    !(ixaflags & IXAF_IS_IPV4), ipst);
1021 		} else if (ixaflags & IXAF_IS_IPV4) {
1022 			ipaddr_t	v4setsrc = INADDR_ANY;
1023 
1024 			ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst,
1025 			    multirtp, &v4setsrc);
1026 			if (setsrcp != NULL)
1027 				IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
1028 		} else {
1029 			ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst,
1030 			    multirtp, setsrcp);
1031 		}
1032 		if (ill != NULL && IS_VNI(ill)) {
1033 			ill_refrele(ill);
1034 			ill = NULL;
1035 		}
1036 		if (ill == NULL) {
1037 			if (errorp != NULL)
1038 				*errorp = ENXIO;
1039 			/* Get a hold on the IRE_NOROUTE */
1040 			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
1041 			return (ire);
1042 		}
1043 		if (!(ill->ill_flags & ILLF_MULTICAST)) {
1044 			ill_refrele(ill);
1045 			if (errorp != NULL)
1046 				*errorp = EHOSTUNREACH;
1047 			/* Get a hold on the IRE_NOROUTE */
1048 			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
1049 			return (ire);
1050 		}
1051 		/* Get a refcnt on the single IRE_MULTICAST per ill */
1052 		ire = ire_multicast(ill);
1053 		ill_refrele(ill);
1054 		if (generationp != NULL)
1055 			*generationp = ire->ire_generation;
1056 		if (errorp != NULL &&
1057 		    (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
1058 			*errorp = EHOSTUNREACH;
1059 		}
1060 		return (ire);
1061 	}
1062 
1063 	if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
1064 		if (ixaflags & IXAF_SCOPEID_SET) {
1065 			/* sin6_scope_id takes precedence over ixa_ifindex */
1066 			ASSERT(ixa->ixa_scopeid != 0);
1067 			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
1068 			    !(ixaflags & IXAF_IS_IPV4), ipst);
1069 		} else {
1070 			ASSERT(ixa->ixa_ifindex != 0);
1071 			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
1072 			    !(ixaflags & IXAF_IS_IPV4), ipst);
1073 		}
1074 		if (ill != NULL && IS_VNI(ill)) {
1075 			ill_refrele(ill);
1076 			ill = NULL;
1077 		}
1078 		if (ill == NULL) {
1079 			if (errorp != NULL)
1080 				*errorp = ENXIO;
1081 			/* Get a hold on the IRE_NOROUTE */
1082 			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
1083 			return (ire);
1084 		}
1085 		/*
1086 		 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
1087 		 * so for both of them we need to be able look for an under
1088 		 * interface.
1089 		 */
1090 		if (IS_UNDER_IPMP(ill))
1091 			match_args |= MATCH_IRE_TESTHIDDEN;
1092 	} else {
1093 		ill = NULL;
1094 	}
1095 
1096 	if (ixaflags & IXAF_NEXTHOP_SET) {
1097 		/* IP_NEXTHOP was set */
1098 		v6nexthop = ixa->ixa_nexthop_v6;
1099 	} else {
1100 		v6nexthop = *v6dst;
1101 	}
1102 
1103 	ire_type = 0;
1104 	/* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */
1105 
1106 	/*
1107 	 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
1108 	 * we only look for an onlink IRE.
1109 	 */
1110 	if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) {
1111 		match_args |= MATCH_IRE_TYPE;
1112 		ire_type = IRE_ONLINK;
1113 	}
1114 
1115 	if (ixaflags & IXAF_IS_IPV4) {
1116 		ipaddr_t	v4nexthop;
1117 		ipaddr_t	v4setsrc = INADDR_ANY;
1118 
1119 		IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop);
1120 		ire = ire_route_recursive_v4(v4nexthop, ire_type, ill,
1121 		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
1122 		    ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp);
1123 		if (setsrcp != NULL)
1124 			IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
1125 	} else {
1126 		ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill,
1127 		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
1128 		    ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp);
1129 	}
1130 
1131 #ifdef DEBUG
1132 	if (match_args & MATCH_IRE_TESTHIDDEN) {
1133 		ip3dbg(("looking for hidden; dst %x ire %p\n",
1134 		    v4dst, (void *)ire));
1135 	}
1136 #endif
1137 
1138 	if (ill != NULL)
1139 		ill_refrele(ill);
1140 
1141 	if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1142 	    (ire->ire_type & IRE_MULTICAST)) {
1143 		/* No ire_nce_cache */
1144 		return (ire);
1145 	}
1146 
1147 	/* Setup ire_nce_cache if it doesn't exist or is condemned. */
1148 	mutex_enter(&ire->ire_lock);
1149 	nce = ire->ire_nce_cache;
1150 	if (nce == NULL || nce->nce_is_condemned) {
1151 		mutex_exit(&ire->ire_lock);
1152 		(void) ire_revalidate_nce(ire);
1153 	} else {
1154 		mutex_exit(&ire->ire_lock);
1155 	}
1156 	return (ire);
1157 }
1158 
1159 /*
1160  * Find a route given some xmit attributes and a packet.
1161  * Generic for IPv4 and IPv6
1162  *
1163  * This never returns NULL. But when it returns the IRE_NOROUTE
1164  * it might set errorp.
1165  */
1166 ire_t *
1167 ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp,
1168     int *errorp, boolean_t *multirtp)
1169 {
1170 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
1171 		ipha_t		*ipha = (ipha_t *)mp->b_rptr;
1172 		in6_addr_t	v6dst;
1173 
1174 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
1175 
1176 		return (ip_select_route(&v6dst, ixa, generationp,
1177 		    NULL, errorp, multirtp));
1178 	} else {
1179 		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
1180 
1181 		return (ip_select_route(&ip6h->ip6_dst, ixa, generationp,
1182 		    NULL, errorp, multirtp));
1183 	}
1184 }
1185 
1186 ire_t *
1187 ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp,
1188     ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
1189 {
1190 	in6_addr_t	v6dst;
1191 	ire_t		*ire;
1192 	in6_addr_t	setsrc;
1193 
1194 	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
1195 
1196 	IN6_IPADDR_TO_V4MAPPED(dst, &v6dst);
1197 
1198 	setsrc = ipv6_all_zeros;
1199 	ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp,
1200 	    multirtp);
1201 	if (v4setsrcp != NULL)
1202 		IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp);
1203 	return (ire);
1204 }
1205 
/*
 * Recursively look for a route to the destination. Can also match on
 * the zoneid, ill, and label. Used for the data paths. See also
 * ire_route_recursive.
 *
 * If ire is non-NULL it is used as the first IRE in place of an initial
 * ire_ftable_lookup_v4() call. An extra hold is taken on it here, so the
 * caller's own reference is not consumed. Its generation is taken from
 * *generationp, or is IRE_GENERATION_VERIFY when generationp is NULL.
 *
 * If ill_arg is set this means we will match it by adding MATCH_IRE_ILL.
 *
 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
 * create an IRE_IF_CLONE. This is used on the receive side when we are not
 * forwarding.
 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
 * resolve the gateway.
 *
 * setsrcp, gwattrp and generationp are optional. When supplied, *setsrcp
 * must be INADDR_ANY and *gwattrp must be NULL on entry (ASSERTed). They
 * receive the first RTF_SETSRC address and the first ire_gw_secattr found
 * along the chain. *generationp receives the generation the caller should
 * cache, which is IRE_GENERATION_VERIFY on every failure path.
 *
 * Note that this function never returns NULL. It returns an IRE_NOROUTE
 * instead. The returned IRE carries a reference held on behalf of the
 * caller.
 *
 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
 * is an error.
 * Allow at most one RTF_INDIRECT.
 */
ire_t *
ire_route_recursive_impl_v4(ire_t *ire,
    ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
    tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	int		i, j;
	/* IREs accepted so far; each entry holds a reference */
	ire_t		*ires[MAX_IRE_RECURSION];
	uint_t		generation;
	/* Generation recorded for the matching ires[] entry */
	uint_t		generations[MAX_IRE_RECURSION];
	/* Set once we took our own hold on ill (from ire->ire_ill) */
	boolean_t	need_refrele = B_FALSE;
	/* Set when the result must not be cached by the caller */
	boolean_t	invalidate = B_FALSE;
	/* ire_pref() per level; each level must exceed the previous one */
	int		prefs[MAX_IRE_RECURSION];
	ill_t		*ill = NULL;

	if (setsrcp != NULL)
		ASSERT(*setsrcp == INADDR_ANY);
	if (gwattrp != NULL)
		ASSERT(*gwattrp == NULL);

	if (ill_arg != NULL)
		match_args |= MATCH_IRE_ILL;

	/*
	 * We iterate up to three times to resolve a route, even though
	 * we have four slots in the array. The extra slot is for an
	 * IRE_IF_CLONE we might need to create.
	 */
	i = 0;
	while (i < MAX_IRE_RECURSION - 1) {
		/* ire_ftable_lookup handles round-robin/ECMP */
		if (ire == NULL) {
			ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
			    (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
			    match_args, xmit_hint, ipst, &generation);
		} else {
			/* Caller passed it; extra hold since we will rele */
			ire_refhold(ire);
			if (generationp != NULL)
				generation = *generationp;
			else
				generation = IRE_GENERATION_VERIFY;
		}
		if (ire == NULL)
			ire = ire_reject(ipst, B_FALSE);

		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
			goto error;

		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */

		if (i != 0) {
			prefs[i] = ire_pref(ire);
			/*
			 * Don't allow anything unusual past the first
			 * iteration.
			 */
			if ((ire->ire_type &
			    (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
			    prefs[i] <= prefs[i-1]) {
				/*
				 * Drop the IRE we just found. With
				 * IRR_INCOMPLETE hand back ires[0] (with
				 * its own hold, since cleanup releases the
				 * array's hold); otherwise hand back the
				 * reject IRE.
				 */
				ire_refrele(ire);
				if (irr_flags & IRR_INCOMPLETE) {
					ire = ires[0];
					ire_refhold(ire);
				} else {
					ire = ire_reject(ipst, B_FALSE);
				}
				goto error;
			}
		}
		/* We have a usable IRE */
		ires[i] = ire;
		generations[i] = generation;
		i++;

		/* The first RTF_SETSRC address is passed back if setsrcp */
		if ((ire->ire_flags & RTF_SETSRC) &&
		    setsrcp != NULL && *setsrcp == INADDR_ANY) {
			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
			*setsrcp = ire->ire_setsrc_addr;
		}

		/* The first ire_gw_secattr is passed back if gwattrp */
		if (ire->ire_gw_secattr != NULL &&
		    gwattrp != NULL && *gwattrp == NULL)
			*gwattrp = ire->ire_gw_secattr;

		/*
		 * Check if we have a short-cut pointer to an IRE for this
		 * destination, and that the cached dependency isn't stale.
		 * In that case we've rejoined an existing tree towards a
		 * parent, thus we don't need to continue the loop to
		 * discover the rest of the tree.
		 */
		mutex_enter(&ire->ire_lock);
		if (ire->ire_dep_parent != NULL &&
		    ire->ire_dep_parent->ire_generation ==
		    ire->ire_dep_parent_generation) {
			mutex_exit(&ire->ire_lock);
			/* ire now lives in ires[]; done expects ire == NULL */
			ire = NULL;
			goto done;
		}
		mutex_exit(&ire->ire_lock);

		/*
		 * If this type should have an ire_nce_cache (even if it
		 * doesn't yet have one) then we are done. Includes
		 * IRE_INTERFACE with a full 32 bit mask.
		 */
		if (ire->ire_nce_capable) {
			ire = NULL;
			goto done;
		}
		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
		/*
		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
		 * particular destination
		 */
		if (ire->ire_type & IRE_INTERFACE) {
			in6_addr_t	v6nexthop;
			ire_t		*clone;

			ASSERT(ire->ire_masklen != IPV4_ABITS);

			/*
			 * In the case of ip_input and ILLF_FORWARDING not
			 * being set, and in the case of RTM_GET, there is
			 * no point in allocating an IRE_IF_CLONE. We return
			 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
			 * result in a ire_dep_parent which is IRE_IF_*
			 * without an IRE_IF_CLONE.
			 * We recover from that when we need to send packets
			 * by ensuring that the generations become
			 * IRE_GENERATION_VERIFY in this case.
			 */
			if (!(irr_flags & IRR_ALLOCATE)) {
				invalidate = B_TRUE;
				ire = NULL;
				goto done;
			}

			IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);

			clone = ire_create_if_clone(ire, &v6nexthop,
			    &generation);
			if (clone == NULL) {
				/*
				 * Temporary failure - no memory.
				 * Don't want caller to cache IRE_NOROUTE.
				 */
				invalidate = B_TRUE;
				ire = ire_blackhole(ipst, B_FALSE);
				goto error;
			}
			/*
			 * Make clone next to last entry and the
			 * IRE_INTERFACE the last in the dependency
			 * chain since the clone depends on the
			 * IRE_INTERFACE.
			 */
			ASSERT(i >= 1);
			ASSERT(i < MAX_IRE_RECURSION);

			/*
			 * Shift the IRE_INTERFACE up one slot and put the
			 * clone, with the generation reported by
			 * ire_create_if_clone, where it used to be.
			 */
			ires[i] = ires[i-1];
			generations[i] = generations[i-1];
			ires[i-1] = clone;
			generations[i-1] = generation;
			i++;

			ire = NULL;
			goto done;
		}

		/*
		 * We only match on the type and optionally ILL when
		 * recursing. The type match is used by some callers
		 * to exclude certain types (such as IRE_IF_CLONE or
		 * IRE_LOCAL|IRE_LOOPBACK).
		 */
		match_args &= MATCH_IRE_TYPE;
		nexthop = ire->ire_gateway_addr;
		if (ill == NULL && ire->ire_ill != NULL) {
			ill = ire->ire_ill;
			need_refrele = B_TRUE;
			ill_refhold(ill);
			match_args |= MATCH_IRE_ILL;
		}
		/*
		 * We set the prefs[i] value above if i > 0. We've already
		 * done i++ so i is one in the case of the first time around.
		 */
		if (i == 1)
			prefs[0] = ire_pref(ire);
		ire = NULL;
	}
	/* Ran out of iterations without resolving; treat as no route */
	ASSERT(ire == NULL);
	ire = ire_reject(ipst, B_FALSE);

	/*
	 * Failure exit. ire is the IRE that will be returned; it is not part
	 * of ires[0..i-1], which are released below.
	 */
error:
	ASSERT(ire != NULL);
	if (need_refrele)
		ill_refrele(ill);

	/*
	 * In the case of MULTIRT we want to try a different IRE the next
	 * time. We let the next packet retry in that case.
	 */
	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
		(void) ire_no_good(ires[0]);

	/* Also entered from done: when ire_dep_build() fails */
cleanup:
	/* cleanup ires[i] */
	ire_dep_unbuild(ires, i);
	for (j = 0; j < i; j++)
		ire_refrele(ires[j]);

	ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
	    (irr_flags & IRR_INCOMPLETE));
	/*
	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
	 * ip_select_route since the reject or lack of memory might be gone.
	 */
	if (generationp != NULL)
		*generationp = IRE_GENERATION_VERIFY;
	return (ire);

	/*
	 * Success exit. ires[0..i-1] holds the resolved chain and ires[0] is
	 * what gets returned.
	 */
done:
	ASSERT(ire == NULL);
	if (need_refrele) {
		ill_refrele(ill);
		ill = NULL;
	}

	/* Build dependencies */
	if (i > 1 && !ire_dep_build(ires, generations, i)) {
		/* Something in chain was condemned; tear it apart */
		ire = ire_reject(ipst, B_FALSE);
		goto cleanup;
	}

	/*
	 * Release all refholds except the one for ires[0] that we
	 * will return to the caller.
	 */
	for (j = 1; j < i; j++)
		ire_refrele(ires[j]);

	if (invalidate) {
		/*
		 * Since we needed to allocate but couldn't we need to make
		 * sure that the dependency chain is rebuilt the next time.
		 */
		ire_dep_invalidate_generations(ires[0]);
		generation = IRE_GENERATION_VERIFY;
	} else {
		/*
		 * IREs can have been added or deleted while we did the
		 * recursive lookup and we can't catch those until we've built
		 * the dependencies. We verify the stored
		 * ire_dep_parent_generation to catch any such changes and
		 * return IRE_GENERATION_VERIFY (which will cause
		 * ip_select_route to be called again so we can redo the
		 * recursive lookup next time we send a packet.
		 */
		if (ires[0]->ire_dep_parent == NULL)
			generation = ires[0]->ire_generation;
		else
			generation = ire_dep_validate_generations(ires[0]);
		if (generations[0] != ires[0]->ire_generation) {
			/* Something changed at the top */
			generation = IRE_GENERATION_VERIFY;
		}
	}
	if (generationp != NULL)
		*generationp = generation;

	return (ires[0]);
}
1506 
1507 ire_t *
1508 ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
1509     zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1510     uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
1511     tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1512 {
1513 	return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
1514 	    zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
1515 	    gwattrp, generationp));
1516 }
1517 
1518 /*
1519  * Recursively look for a route to the destination.
1520  * We only handle a destination match here, yet we have the same arguments
1521  * as the full match to allow function pointers to select between the two.
1522  *
1523  * Note that this function never returns NULL. It returns an IRE_NOROUTE
1524  * instead.
1525  *
1526  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1527  * is an error.
1528  * Allow at most one RTF_INDIRECT.
1529  */
1530 ire_t *
1531 ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags,
1532     uint32_t xmit_hint, ip_stack_t *ipst)
1533 {
1534 	ire_t	*ire;
1535 	ire_t	*ire1;
1536 	uint_t	generation;
1537 
1538 	/* ire_ftable_lookup handles round-robin/ECMP */
1539 	ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst,
1540 	    &generation);
1541 	ASSERT(ire != NULL);
1542 
1543 	/*
1544 	 * If this type should have an ire_nce_cache (even if it
1545 	 * doesn't yet have one) then we are done. Includes
1546 	 * IRE_INTERFACE with a full 32 bit mask.
1547 	 */
1548 	if (ire->ire_nce_capable)
1549 		return (ire);
1550 
1551 	/*
1552 	 * If the IRE has a current cached parent we know that the whole
1553 	 * parent chain is current, hence we don't need to discover and
1554 	 * build any dependencies by doing a recursive lookup.
1555 	 */
1556 	mutex_enter(&ire->ire_lock);
1557 	if (ire->ire_dep_parent != NULL &&
1558 	    ire->ire_dep_parent->ire_generation ==
1559 	    ire->ire_dep_parent_generation) {
1560 		mutex_exit(&ire->ire_lock);
1561 		return (ire);
1562 	}
1563 	mutex_exit(&ire->ire_lock);
1564 
1565 	/*
1566 	 * Fallback to loop in the normal code starting with the ire
1567 	 * we found. Normally this would return the same ire.
1568 	 */
1569 	ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES,
1570 	    NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL,
1571 	    &generation);
1572 	ire_refrele(ire);
1573 	return (ire1);
1574 }
1575