xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_ftable.c (revision bd670b35a010421b6e1a5536c34453a827007c81)
1c793af95Ssangeeta /*
2c793af95Ssangeeta  * CDDL HEADER START
3c793af95Ssangeeta  *
4c793af95Ssangeeta  * The contents of this file are subject to the terms of the
5c793af95Ssangeeta  * Common Development and Distribution License (the "License").
6c793af95Ssangeeta  * You may not use this file except in compliance with the License.
7c793af95Ssangeeta  *
8c793af95Ssangeeta  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9c793af95Ssangeeta  * or http://www.opensolaris.org/os/licensing.
10c793af95Ssangeeta  * See the License for the specific language governing permissions
11c793af95Ssangeeta  * and limitations under the License.
12c793af95Ssangeeta  *
13c793af95Ssangeeta  * When distributing Covered Code, include this CDDL HEADER in each
14c793af95Ssangeeta  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15c793af95Ssangeeta  * If applicable, add the following below this CDDL HEADER, with the
16c793af95Ssangeeta  * fields enclosed by brackets "[]" replaced with your own identifying
17c793af95Ssangeeta  * information: Portions Copyright [yyyy] [name of copyright owner]
18c793af95Ssangeeta  *
19c793af95Ssangeeta  * CDDL HEADER END
20c793af95Ssangeeta  */
21c793af95Ssangeeta /*
22e11c3f44Smeem  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23c793af95Ssangeeta  * Use is subject to license terms.
24c793af95Ssangeeta  */
25c793af95Ssangeeta 
26c793af95Ssangeeta /*
27c793af95Ssangeeta  * This file contains consumer routines of the IPv4 forwarding engine
28c793af95Ssangeeta  */
29c793af95Ssangeeta 
30c793af95Ssangeeta #include <sys/types.h>
31c793af95Ssangeeta #include <sys/stream.h>
32c793af95Ssangeeta #include <sys/stropts.h>
33c793af95Ssangeeta #include <sys/strlog.h>
34c793af95Ssangeeta #include <sys/dlpi.h>
35c793af95Ssangeeta #include <sys/ddi.h>
36c793af95Ssangeeta #include <sys/cmn_err.h>
37c793af95Ssangeeta #include <sys/policy.h>
38c793af95Ssangeeta 
39c793af95Ssangeeta #include <sys/systm.h>
40c793af95Ssangeeta #include <sys/strsun.h>
41c793af95Ssangeeta #include <sys/kmem.h>
42c793af95Ssangeeta #include <sys/param.h>
43c793af95Ssangeeta #include <sys/socket.h>
44edd26dc5Sdr146992 #include <sys/strsubr.h>
45c793af95Ssangeeta #include <net/if.h>
46c793af95Ssangeeta #include <net/route.h>
47c793af95Ssangeeta #include <netinet/in.h>
48c793af95Ssangeeta #include <net/if_dl.h>
49c793af95Ssangeeta #include <netinet/ip6.h>
50c793af95Ssangeeta #include <netinet/icmp6.h>
51c793af95Ssangeeta 
52*bd670b35SErik Nordmark #include <inet/ipsec_impl.h>
53c793af95Ssangeeta #include <inet/common.h>
54c793af95Ssangeeta #include <inet/mi.h>
55c793af95Ssangeeta #include <inet/mib2.h>
56c793af95Ssangeeta #include <inet/ip.h>
57edd26dc5Sdr146992 #include <inet/ip_impl.h>
58c793af95Ssangeeta #include <inet/ip6.h>
59c793af95Ssangeeta #include <inet/ip_ndp.h>
60c793af95Ssangeeta #include <inet/arp.h>
61c793af95Ssangeeta #include <inet/ip_if.h>
62c793af95Ssangeeta #include <inet/ip_ire.h>
63c793af95Ssangeeta #include <inet/ip_ftable.h>
64c793af95Ssangeeta #include <inet/ip_rts.h>
65c793af95Ssangeeta #include <inet/nd.h>
66c793af95Ssangeeta 
67c793af95Ssangeeta #include <net/pfkeyv2.h>
68c793af95Ssangeeta #include <inet/sadb.h>
69c793af95Ssangeeta #include <inet/tcp.h>
70c793af95Ssangeeta #include <inet/ipclassifier.h>
71c793af95Ssangeeta #include <sys/zone.h>
72c793af95Ssangeeta #include <net/radix.h>
73c793af95Ssangeeta #include <sys/tsol/label.h>
74c793af95Ssangeeta #include <sys/tsol/tnet.h>
75c793af95Ssangeeta 
76c793af95Ssangeeta #define	IS_DEFAULT_ROUTE(ire)	\
77c793af95Ssangeeta 	(((ire)->ire_type & IRE_DEFAULT) || \
78c793af95Ssangeeta 	    (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))
79c793af95Ssangeeta 
80f4b3ec61Sdh155122 static ire_t	*route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
81c793af95Ssangeeta static void	ire_del_host_redir(ire_t *, char *);
82c793af95Ssangeeta static boolean_t ire_find_best_route(struct radix_node *, void *);
83c793af95Ssangeeta 
84c793af95Ssangeeta /*
85c793af95Ssangeeta  * Lookup a route in forwarding table. A specific lookup is indicated by
86c793af95Ssangeeta  * passing the required parameters and indicating the match required in the
87c793af95Ssangeeta  * flag field.
88c793af95Ssangeeta  *
89c793af95Ssangeeta  * Supports IP_BOUND_IF by following the ipif/ill when recursing.
90c793af95Ssangeeta  */
91c793af95Ssangeeta ire_t *
92*bd670b35SErik Nordmark ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
93*bd670b35SErik Nordmark     int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
94*bd670b35SErik Nordmark     int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
95c793af95Ssangeeta {
96*bd670b35SErik Nordmark 	ire_t *ire;
97c793af95Ssangeeta 	struct rt_sockaddr rdst, rmask;
98c793af95Ssangeeta 	struct rt_entry *rt;
99c793af95Ssangeeta 	ire_ftable_args_t margs;
100c793af95Ssangeeta 
101*bd670b35SErik Nordmark 	ASSERT(ill == NULL || !ill->ill_isv6);
102c793af95Ssangeeta 
103c793af95Ssangeeta 	/*
104*bd670b35SErik Nordmark 	 * ire_match_args() will dereference ill if MATCH_IRE_ILL
105*bd670b35SErik Nordmark 	 * is set.
106c793af95Ssangeeta 	 */
107*bd670b35SErik Nordmark 	if ((flags & MATCH_IRE_ILL) && (ill == NULL))
108c793af95Ssangeeta 		return (NULL);
109c793af95Ssangeeta 
110c793af95Ssangeeta 	(void) memset(&rdst, 0, sizeof (rdst));
111c793af95Ssangeeta 	rdst.rt_sin_len = sizeof (rdst);
112c793af95Ssangeeta 	rdst.rt_sin_family = AF_INET;
113c793af95Ssangeeta 	rdst.rt_sin_addr.s_addr = addr;
114c793af95Ssangeeta 
115c793af95Ssangeeta 	(void) memset(&rmask, 0, sizeof (rmask));
116c793af95Ssangeeta 	rmask.rt_sin_len = sizeof (rmask);
117c793af95Ssangeeta 	rmask.rt_sin_family = AF_INET;
118c793af95Ssangeeta 	rmask.rt_sin_addr.s_addr = mask;
119c793af95Ssangeeta 
120c793af95Ssangeeta 	(void) memset(&margs, 0, sizeof (margs));
121c793af95Ssangeeta 	margs.ift_addr = addr;
122c793af95Ssangeeta 	margs.ift_mask = mask;
123c793af95Ssangeeta 	margs.ift_gateway = gateway;
124c793af95Ssangeeta 	margs.ift_type = type;
125*bd670b35SErik Nordmark 	margs.ift_ill = ill;
126c793af95Ssangeeta 	margs.ift_zoneid = zoneid;
127c793af95Ssangeeta 	margs.ift_tsl = tsl;
128c793af95Ssangeeta 	margs.ift_flags = flags;
129c793af95Ssangeeta 
130c793af95Ssangeeta 	/*
131c793af95Ssangeeta 	 * The flags argument passed to ire_ftable_lookup may cause the
132c793af95Ssangeeta 	 * search to return, not the longest matching prefix, but the
133c793af95Ssangeeta 	 * "best matching prefix", i.e., the longest prefix that also
134c793af95Ssangeeta 	 * satisfies constraints imposed via the permutation of flags
135c793af95Ssangeeta 	 * passed in. To achieve this, we invoke ire_match_args() on
136c793af95Ssangeeta 	 * each matching leaf in the  radix tree. ire_match_args is
137c793af95Ssangeeta 	 * invoked by the callback function ire_find_best_route()
138c793af95Ssangeeta 	 * We hold the global tree lock in read mode when calling
139c793af95Ssangeeta 	 * rn_match_args. Before dropping the global tree lock, ensure
140c793af95Ssangeeta 	 * that the radix node can't be deleted by incrementing ire_refcnt.
141c793af95Ssangeeta 	 */
142f4b3ec61Sdh155122 	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
143f4b3ec61Sdh155122 	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
144f4b3ec61Sdh155122 	    ipst->ips_ip_ftable, ire_find_best_route, &margs);
145c793af95Ssangeeta 	ire = margs.ift_best_ire;
146c793af95Ssangeeta 	if (rt == NULL) {
147*bd670b35SErik Nordmark 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
148c793af95Ssangeeta 		return (NULL);
149c793af95Ssangeeta 	}
150*bd670b35SErik Nordmark 	ASSERT(ire != NULL);
151c793af95Ssangeeta 
152c793af95Ssangeeta 	DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);
153c793af95Ssangeeta 
154c793af95Ssangeeta 	/*
155c793af95Ssangeeta 	 * round-robin only if we have more than one route in the bucket.
156*bd670b35SErik Nordmark 	 * ips_ip_ecmp_behavior controls when we do ECMP
157*bd670b35SErik Nordmark 	 *	2:	always
158*bd670b35SErik Nordmark 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
159*bd670b35SErik Nordmark 	 *	0:	never
160c793af95Ssangeeta 	 */
161*bd670b35SErik Nordmark 	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
162*bd670b35SErik Nordmark 		if (ipst->ips_ip_ecmp_behavior == 2 ||
163*bd670b35SErik Nordmark 		    (ipst->ips_ip_ecmp_behavior == 1 &&
164*bd670b35SErik Nordmark 		    IS_DEFAULT_ROUTE(ire))) {
165c793af95Ssangeeta 			ire_t	*next_ire;
166c793af95Ssangeeta 
167*bd670b35SErik Nordmark 			margs.ift_best_ire = NULL;
168*bd670b35SErik Nordmark 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
169*bd670b35SErik Nordmark 			    xmit_hint, ire, ipst);
170*bd670b35SErik Nordmark 			if (next_ire == NULL) {
171*bd670b35SErik Nordmark 				/* keep ire if next_ire is null */
172*bd670b35SErik Nordmark 				goto done;
173*bd670b35SErik Nordmark 			}
174*bd670b35SErik Nordmark 			ire_refrele(ire);
175c793af95Ssangeeta 			ire = next_ire;
176c793af95Ssangeeta 		}
177c793af95Ssangeeta 	}
178c793af95Ssangeeta 
179*bd670b35SErik Nordmark done:
180*bd670b35SErik Nordmark 	/* Return generation before dropping lock */
181*bd670b35SErik Nordmark 	if (generationp != NULL)
182*bd670b35SErik Nordmark 		*generationp = ire->ire_generation;
183c793af95Ssangeeta 
184*bd670b35SErik Nordmark 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
185e11c3f44Smeem 
186c793af95Ssangeeta 	/*
187*bd670b35SErik Nordmark 	 * For shared-IP zones we need additional checks to what was
188*bd670b35SErik Nordmark 	 * done in ire_match_args to make sure IRE_LOCALs are handled.
189*bd670b35SErik Nordmark 	 *
190*bd670b35SErik Nordmark 	 * When ip_restrict_interzone_loopback is set, then
191*bd670b35SErik Nordmark 	 * we ensure that IRE_LOCAL are only used for loopback
192*bd670b35SErik Nordmark 	 * between zones when the logical "Ethernet" would
193*bd670b35SErik Nordmark 	 * have looped them back. That is, if in the absense of
194*bd670b35SErik Nordmark 	 * the IRE_LOCAL we would have sent to packet out the
195*bd670b35SErik Nordmark 	 * same ill.
196c793af95Ssangeeta 	 */
197*bd670b35SErik Nordmark 	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
198*bd670b35SErik Nordmark 	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
199*bd670b35SErik Nordmark 	    ipst->ips_ip_restrict_interzone_loopback) {
200*bd670b35SErik Nordmark 		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
201*bd670b35SErik Nordmark 		ASSERT(ire != NULL);
202c793af95Ssangeeta 	}
203c793af95Ssangeeta 	return (ire);
204c793af95Ssangeeta }
205c793af95Ssangeeta 
206da14cebeSEric Cheng /*
207da14cebeSEric Cheng  * This function is called by
208*bd670b35SErik Nordmark  * ip_input/ire_route_recursive when doing a route lookup on only the
209*bd670b35SErik Nordmark  * destination address.
210*bd670b35SErik Nordmark  *
211da14cebeSEric Cheng  * The optimizations of this function over ire_ftable_lookup are:
212da14cebeSEric Cheng  *	o removing unnecessary flag matching
213da14cebeSEric Cheng  *	o doing longest prefix match instead of overloading it further
214da14cebeSEric Cheng  *	  with the unnecessary "best_prefix_match"
215*bd670b35SErik Nordmark  *
216*bd670b35SErik Nordmark  * If no route is found we return IRE_NOROUTE.
217da14cebeSEric Cheng  */
218*bd670b35SErik Nordmark ire_t *
219*bd670b35SErik Nordmark ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst,
220*bd670b35SErik Nordmark     uint_t *generationp)
221da14cebeSEric Cheng {
222*bd670b35SErik Nordmark 	ire_t *ire;
223da14cebeSEric Cheng 	struct rt_sockaddr rdst;
224da14cebeSEric Cheng 	struct rt_entry *rt;
225*bd670b35SErik Nordmark 	irb_t *irb;
226da14cebeSEric Cheng 
227da14cebeSEric Cheng 	rdst.rt_sin_len = sizeof (rdst);
228da14cebeSEric Cheng 	rdst.rt_sin_family = AF_INET;
229da14cebeSEric Cheng 	rdst.rt_sin_addr.s_addr = addr;
230da14cebeSEric Cheng 
231da14cebeSEric Cheng 	/*
232da14cebeSEric Cheng 	 * This is basically inlining  a simpler version of ire_match_args
233da14cebeSEric Cheng 	 */
234da14cebeSEric Cheng 	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
235da14cebeSEric Cheng 
236da14cebeSEric Cheng 	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
237da14cebeSEric Cheng 	    ipst->ips_ip_ftable, NULL, NULL);
238da14cebeSEric Cheng 
239*bd670b35SErik Nordmark 	if (rt == NULL)
240*bd670b35SErik Nordmark 		goto bad;
241*bd670b35SErik Nordmark 
242*bd670b35SErik Nordmark 	irb = &rt->rt_irb;
243*bd670b35SErik Nordmark 	if (irb->irb_ire_cnt == 0)
244*bd670b35SErik Nordmark 		goto bad;
245*bd670b35SErik Nordmark 
246*bd670b35SErik Nordmark 	rw_enter(&irb->irb_lock, RW_READER);
247*bd670b35SErik Nordmark 	ire = irb->irb_ire;
248*bd670b35SErik Nordmark 	if (ire == NULL) {
249*bd670b35SErik Nordmark 		rw_exit(&irb->irb_lock);
250*bd670b35SErik Nordmark 		goto bad;
251da14cebeSEric Cheng 	}
252*bd670b35SErik Nordmark 	while (IRE_IS_CONDEMNED(ire)) {
253*bd670b35SErik Nordmark 		ire = ire->ire_next;
254*bd670b35SErik Nordmark 		if (ire == NULL) {
255*bd670b35SErik Nordmark 			rw_exit(&irb->irb_lock);
256*bd670b35SErik Nordmark 			goto bad;
257*bd670b35SErik Nordmark 		}
258da14cebeSEric Cheng 	}
259da14cebeSEric Cheng 
260da14cebeSEric Cheng 	/* we have a ire that matches */
261*bd670b35SErik Nordmark 	ire_refhold(ire);
262*bd670b35SErik Nordmark 	rw_exit(&irb->irb_lock);
263*bd670b35SErik Nordmark 
264*bd670b35SErik Nordmark 	/*
265*bd670b35SErik Nordmark 	 * round-robin only if we have more than one route in the bucket.
266*bd670b35SErik Nordmark 	 * ips_ip_ecmp_behavior controls when we do ECMP
267*bd670b35SErik Nordmark 	 *	2:	always
268*bd670b35SErik Nordmark 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
269*bd670b35SErik Nordmark 	 *	0:	never
270*bd670b35SErik Nordmark 	 *
271*bd670b35SErik Nordmark 	 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
272*bd670b35SErik Nordmark 	 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
273*bd670b35SErik Nordmark 	 * and the IRE_INTERFACESs are likely to be shorter matches.
274*bd670b35SErik Nordmark 	 */
275*bd670b35SErik Nordmark 	if (ire->ire_bucket->irb_ire_cnt > 1) {
276*bd670b35SErik Nordmark 		if (ipst->ips_ip_ecmp_behavior == 2 ||
277*bd670b35SErik Nordmark 		    (ipst->ips_ip_ecmp_behavior == 1 &&
278*bd670b35SErik Nordmark 		    IS_DEFAULT_ROUTE(ire))) {
279*bd670b35SErik Nordmark 			ire_t	*next_ire;
280*bd670b35SErik Nordmark 			ire_ftable_args_t margs;
281*bd670b35SErik Nordmark 
282*bd670b35SErik Nordmark 			(void) memset(&margs, 0, sizeof (margs));
283*bd670b35SErik Nordmark 			margs.ift_addr = addr;
284*bd670b35SErik Nordmark 			margs.ift_zoneid = ALL_ZONES;
285*bd670b35SErik Nordmark 
286*bd670b35SErik Nordmark 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
287*bd670b35SErik Nordmark 			    xmit_hint, ire, ipst);
288*bd670b35SErik Nordmark 			if (next_ire == NULL) {
289*bd670b35SErik Nordmark 				/* keep ire if next_ire is null */
290*bd670b35SErik Nordmark 				if (generationp != NULL)
291*bd670b35SErik Nordmark 					*generationp = ire->ire_generation;
292*bd670b35SErik Nordmark 				RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
293*bd670b35SErik Nordmark 				return (ire);
294*bd670b35SErik Nordmark 			}
295*bd670b35SErik Nordmark 			ire_refrele(ire);
296*bd670b35SErik Nordmark 			ire = next_ire;
297*bd670b35SErik Nordmark 		}
298*bd670b35SErik Nordmark 	}
299*bd670b35SErik Nordmark 	/* Return generation before dropping lock */
300*bd670b35SErik Nordmark 	if (generationp != NULL)
301*bd670b35SErik Nordmark 		*generationp = ire->ire_generation;
302*bd670b35SErik Nordmark 
303da14cebeSEric Cheng 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
304da14cebeSEric Cheng 
305*bd670b35SErik Nordmark 	/*
306*bd670b35SErik Nordmark 	 * Since we only did ALL_ZONES matches there is no special handling
307*bd670b35SErik Nordmark 	 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
308*bd670b35SErik Nordmark 	 */
309da14cebeSEric Cheng 	return (ire);
310da14cebeSEric Cheng 
311*bd670b35SErik Nordmark bad:
312*bd670b35SErik Nordmark 	if (generationp != NULL)
313*bd670b35SErik Nordmark 		*generationp = IRE_GENERATION_VERIFY;
314da14cebeSEric Cheng 
315*bd670b35SErik Nordmark 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
316*bd670b35SErik Nordmark 	return (ire_reject(ipst, B_FALSE));
317da14cebeSEric Cheng }
318c793af95Ssangeeta 
319c793af95Ssangeeta /*
320*bd670b35SErik Nordmark  * Find the ill matching a multicast group.
321c793af95Ssangeeta  * Allows different routes for multicast addresses
322c793af95Ssangeeta  * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
323c793af95Ssangeeta  * which point at different interfaces. This is used when IP_MULTICAST_IF
324c793af95Ssangeeta  * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
325c793af95Ssangeeta  * specify the interface to join on.
326c793af95Ssangeeta  *
327*bd670b35SErik Nordmark  * Supports link-local addresses by using ire_route_recursive which follows
328*bd670b35SErik Nordmark  * the ill when recursing.
329*bd670b35SErik Nordmark  *
330*bd670b35SErik Nordmark  * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
331*bd670b35SErik Nordmark  * and the MULTIRT property can be different for different groups, we
332*bd670b35SErik Nordmark  * extract RTF_MULTIRT from the special unicast route added for a group
333*bd670b35SErik Nordmark  * with CGTP and pass that back in the multirtp argument.
334*bd670b35SErik Nordmark  * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
335*bd670b35SErik Nordmark  * We have a setsrcp argument for the same reason.
336c793af95Ssangeeta  */
337*bd670b35SErik Nordmark ill_t *
338*bd670b35SErik Nordmark ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
339*bd670b35SErik Nordmark     boolean_t *multirtp, ipaddr_t *setsrcp)
340c793af95Ssangeeta {
341c793af95Ssangeeta 	ire_t	*ire;
342*bd670b35SErik Nordmark 	ill_t	*ill;
343c793af95Ssangeeta 
344*bd670b35SErik Nordmark 	ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL,
345*bd670b35SErik Nordmark 	    MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL);
346*bd670b35SErik Nordmark 	ASSERT(ire != NULL);
347*bd670b35SErik Nordmark 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
348c793af95Ssangeeta 		ire_refrele(ire);
349c793af95Ssangeeta 		return (NULL);
350c793af95Ssangeeta 	}
351*bd670b35SErik Nordmark 
352*bd670b35SErik Nordmark 	if (multirtp != NULL)
353*bd670b35SErik Nordmark 		*multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
354*bd670b35SErik Nordmark 
355*bd670b35SErik Nordmark 	ill = ire_nexthop_ill(ire);
356*bd670b35SErik Nordmark 	ire_refrele(ire);
357*bd670b35SErik Nordmark 	return (ill);
358c793af95Ssangeeta }
359c793af95Ssangeeta 
360c793af95Ssangeeta /*
361c793af95Ssangeeta  * Delete the passed in ire if the gateway addr matches
362c793af95Ssangeeta  */
363c793af95Ssangeeta void
364c793af95Ssangeeta ire_del_host_redir(ire_t *ire, char *gateway)
365c793af95Ssangeeta {
3666bdb8e66Sdd193516 	if ((ire->ire_flags & RTF_DYNAMIC) &&
367c793af95Ssangeeta 	    (ire->ire_gateway_addr == *(ipaddr_t *)gateway))
368c793af95Ssangeeta 		ire_delete(ire);
369c793af95Ssangeeta }
370c793af95Ssangeeta 
371c793af95Ssangeeta /*
372*bd670b35SErik Nordmark  * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
373c793af95Ssangeeta  * pointing at the specified gateway and
374c793af95Ssangeeta  * delete them. This routine is called only
375c793af95Ssangeeta  * when a default gateway is going away.
376c793af95Ssangeeta  */
377c793af95Ssangeeta void
378f4b3ec61Sdh155122 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
379c793af95Ssangeeta {
380c793af95Ssangeeta 	struct rtfuncarg rtfarg;
381c793af95Ssangeeta 
382c793af95Ssangeeta 	(void) memset(&rtfarg, 0, sizeof (rtfarg));
383c793af95Ssangeeta 	rtfarg.rt_func = ire_del_host_redir;
384c793af95Ssangeeta 	rtfarg.rt_arg = (void *)&gateway;
385f4b3ec61Sdh155122 	(void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
386f4b3ec61Sdh155122 	    rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
387c793af95Ssangeeta }
388c793af95Ssangeeta 
389c793af95Ssangeeta /*
390f4b3ec61Sdh155122  * Obtain the rt_entry and rt_irb for the route to be added to
391f4b3ec61Sdh155122  * the ips_ip_ftable.
392c793af95Ssangeeta  * First attempt to add a node to the radix tree via rn_addroute. If the
393c793af95Ssangeeta  * route already exists, return the bucket for the existing route.
394c793af95Ssangeeta  *
395c793af95Ssangeeta  * Locking notes: Need to hold the global radix tree lock in write mode to
396c793af95Ssangeeta  * add a radix node. To prevent the node from being deleted, ire_get_bucket()
397c793af95Ssangeeta  * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
398c793af95Ssangeeta  * while holding the irb_lock, but not the radix tree lock.
399c793af95Ssangeeta  */
400c793af95Ssangeeta irb_t *
401c793af95Ssangeeta ire_get_bucket(ire_t *ire)
402c793af95Ssangeeta {
403c793af95Ssangeeta 	struct radix_node *rn;
404c793af95Ssangeeta 	struct rt_entry *rt;
405c793af95Ssangeeta 	struct rt_sockaddr rmask, rdst;
406c793af95Ssangeeta 	irb_t *irb = NULL;
407f4b3ec61Sdh155122 	ip_stack_t *ipst = ire->ire_ipst;
408c793af95Ssangeeta 
409f4b3ec61Sdh155122 	ASSERT(ipst->ips_ip_ftable != NULL);
410c793af95Ssangeeta 
411c793af95Ssangeeta 	/* first try to see if route exists (based on rtalloc1) */
412c793af95Ssangeeta 	(void) memset(&rdst, 0, sizeof (rdst));
413c793af95Ssangeeta 	rdst.rt_sin_len = sizeof (rdst);
414c793af95Ssangeeta 	rdst.rt_sin_family = AF_INET;
415c793af95Ssangeeta 	rdst.rt_sin_addr.s_addr = ire->ire_addr;
416c793af95Ssangeeta 
417c793af95Ssangeeta 	(void) memset(&rmask, 0, sizeof (rmask));
418c793af95Ssangeeta 	rmask.rt_sin_len = sizeof (rmask);
419c793af95Ssangeeta 	rmask.rt_sin_family = AF_INET;
420c793af95Ssangeeta 	rmask.rt_sin_addr.s_addr = ire->ire_mask;
421c793af95Ssangeeta 
422c793af95Ssangeeta 	/*
423c793af95Ssangeeta 	 * add the route. based on BSD's rtrequest1(RTM_ADD)
424c793af95Ssangeeta 	 */
425c793af95Ssangeeta 	R_Malloc(rt, rt_entry_cache,  sizeof (*rt));
42629bc4795Ssangeeta 	/* kmem_alloc failed */
42729bc4795Ssangeeta 	if (rt == NULL)
42829bc4795Ssangeeta 		return (NULL);
42929bc4795Ssangeeta 
430c793af95Ssangeeta 	(void) memset(rt, 0, sizeof (*rt));
431c793af95Ssangeeta 	rt->rt_nodes->rn_key = (char *)&rt->rt_dst;
432c793af95Ssangeeta 	rt->rt_dst = rdst;
433c793af95Ssangeeta 	irb = &rt->rt_irb;
434*bd670b35SErik Nordmark 	irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */
435f4b3ec61Sdh155122 	irb->irb_ipst = ipst;
436c793af95Ssangeeta 	rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL);
437f4b3ec61Sdh155122 	RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
438f4b3ec61Sdh155122 	rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask,
439f4b3ec61Sdh155122 	    ipst->ips_ip_ftable, (struct radix_node *)rt);
440c793af95Ssangeeta 	if (rn == NULL) {
441f4b3ec61Sdh155122 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
442c793af95Ssangeeta 		Free(rt, rt_entry_cache);
443c793af95Ssangeeta 		rt = NULL;
444c793af95Ssangeeta 		irb = NULL;
445f4b3ec61Sdh155122 		RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
446f4b3ec61Sdh155122 		rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask,
447f4b3ec61Sdh155122 		    ipst->ips_ip_ftable);
448f4b3ec61Sdh155122 		if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
449c793af95Ssangeeta 			/* found a non-root match */
450c793af95Ssangeeta 			rt = (struct rt_entry *)rn;
451c793af95Ssangeeta 		}
452c793af95Ssangeeta 	}
453c793af95Ssangeeta 	if (rt != NULL) {
454c793af95Ssangeeta 		irb = &rt->rt_irb;
455*bd670b35SErik Nordmark 		irb_refhold(irb);
456c793af95Ssangeeta 	}
457f4b3ec61Sdh155122 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
458c793af95Ssangeeta 	return (irb);
459c793af95Ssangeeta }
460c793af95Ssangeeta 
461c793af95Ssangeeta /*
462c793af95Ssangeeta  * This function is used when the caller wants to know the outbound
463c793af95Ssangeeta  * interface for a packet given only the address.
464c793af95Ssangeeta  * If this is a offlink IP address and there are multiple
465c793af95Ssangeeta  * routes to this destination, this routine will utilise the
466c793af95Ssangeeta  * first route it finds to IP address
467c793af95Ssangeeta  * Return values:
468c793af95Ssangeeta  * 	0	- FAILURE
469c793af95Ssangeeta  *	nonzero	- ifindex
470c793af95Ssangeeta  */
471c793af95Ssangeeta uint_t
472c793af95Ssangeeta ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
473c793af95Ssangeeta {
474c793af95Ssangeeta 	uint_t ifindex = 0;
475c793af95Ssangeeta 	ire_t *ire;
476c793af95Ssangeeta 	ill_t *ill;
477f4b3ec61Sdh155122 	netstack_t *ns;
478f4b3ec61Sdh155122 	ip_stack_t *ipst;
479c793af95Ssangeeta 
480f4b3ec61Sdh155122 	if (zoneid == ALL_ZONES)
481f4b3ec61Sdh155122 		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
482f4b3ec61Sdh155122 	else
483f4b3ec61Sdh155122 		ns = netstack_find_by_zoneid(zoneid);
484f4b3ec61Sdh155122 	ASSERT(ns != NULL);
485f4b3ec61Sdh155122 
486f4b3ec61Sdh155122 	/*
487f4b3ec61Sdh155122 	 * For exclusive stacks we set the zoneid to zero
488f4b3ec61Sdh155122 	 * since IP uses the global zoneid in the exclusive stacks.
489f4b3ec61Sdh155122 	 */
490f4b3ec61Sdh155122 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
491f4b3ec61Sdh155122 		zoneid = GLOBAL_ZONEID;
492f4b3ec61Sdh155122 	ipst = ns->netstack_ip;
493c793af95Ssangeeta 
494c793af95Ssangeeta 	ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6);
495c793af95Ssangeeta 
496f4b3ec61Sdh155122 	if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) {
497*bd670b35SErik Nordmark 		ill = ire_nexthop_ill(ire);
498*bd670b35SErik Nordmark 		if (ill != NULL) {
499c793af95Ssangeeta 			ifindex = ill->ill_phyint->phyint_ifindex;
500*bd670b35SErik Nordmark 			ill_refrele(ill);
501*bd670b35SErik Nordmark 		}
502c793af95Ssangeeta 		ire_refrele(ire);
503c793af95Ssangeeta 	}
504f4b3ec61Sdh155122 	netstack_rele(ns);
505c793af95Ssangeeta 	return (ifindex);
506c793af95Ssangeeta }
507c793af95Ssangeeta 
508c793af95Ssangeeta /*
509c793af95Ssangeeta  * Routine to find the route to a destination. If a ifindex is supplied
510*bd670b35SErik Nordmark  * it tries to match the route to the corresponding ipif for the ifindex
511c793af95Ssangeeta  */
512c793af95Ssangeeta static	ire_t *
513f4b3ec61Sdh155122 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
514c793af95Ssangeeta {
515c793af95Ssangeeta 	ire_t *ire = NULL;
516c793af95Ssangeeta 	int match_flags;
517c793af95Ssangeeta 
518*bd670b35SErik Nordmark 	match_flags = MATCH_IRE_DSTONLY;
519c793af95Ssangeeta 
520c793af95Ssangeeta 	/* XXX pass NULL tsl for now */
521c793af95Ssangeeta 
522c793af95Ssangeeta 	if (dst_addr->sa_family == AF_INET) {
523*bd670b35SErik Nordmark 		ire = ire_route_recursive_v4(
524*bd670b35SErik Nordmark 		    ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL,
525*bd670b35SErik Nordmark 		    zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL,
526*bd670b35SErik Nordmark 		    NULL);
527c793af95Ssangeeta 	} else {
528*bd670b35SErik Nordmark 		ire = ire_route_recursive_v6(
529*bd670b35SErik Nordmark 		    &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL,
530*bd670b35SErik Nordmark 		    zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL,
531*bd670b35SErik Nordmark 		    NULL);
532*bd670b35SErik Nordmark 	}
533*bd670b35SErik Nordmark 	ASSERT(ire != NULL);
534*bd670b35SErik Nordmark 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
535*bd670b35SErik Nordmark 		ire_refrele(ire);
536*bd670b35SErik Nordmark 		return (NULL);
537c793af95Ssangeeta 	}
538c793af95Ssangeeta 	return (ire);
539c793af95Ssangeeta }
540c793af95Ssangeeta 
541c793af95Ssangeeta /*
542c793af95Ssangeeta  * This routine is called by IP Filter to send a packet out on the wire
543*bd670b35SErik Nordmark  * to a specified dstination (which may be onlink or offlink). The ifindex may
544*bd670b35SErik Nordmark  * or may not be 0. A non-null ifindex indicates IP Filter has stipulated
545c793af95Ssangeeta  * an outgoing interface and requires the nexthop to be on that interface.
546c793af95Ssangeeta  * IP WILL NOT DO the following to the data packet before sending it out:
547c793af95Ssangeeta  *	a. manipulate ttl
548edd26dc5Sdr146992  *	b. ipsec work
549edd26dc5Sdr146992  *	c. fragmentation
550edd26dc5Sdr146992  *
551edd26dc5Sdr146992  * If the packet has been prepared for hardware checksum then it will be
552edd26dc5Sdr146992  * passed off to ip_send_align_cksum() to check that the flags set on the
553edd26dc5Sdr146992  * packet are in alignment with the capabilities of the new outgoing NIC.
554c793af95Ssangeeta  *
555c793af95Ssangeeta  * Return values:
556c793af95Ssangeeta  *	0:		IP was able to send of the data pkt
557c793af95Ssangeeta  *	ECOMM:		Could not send packet
558c793af95Ssangeeta  *	ENONET		No route to dst. It is up to the caller
559c793af95Ssangeeta  *			to send icmp unreachable error message,
560c793af95Ssangeeta  *	EINPROGRESS	The macaddr of the onlink dst or that
561c793af95Ssangeeta  *			of the offlink dst's nexthop needs to get
562c793af95Ssangeeta  *			resolved before packet can be sent to dst.
563c793af95Ssangeeta  *			Thus transmission is not guaranteed.
564*bd670b35SErik Nordmark  *			Note: No longer have visibility to the ARP queue
565*bd670b35SErik Nordmark  *			hence no EINPROGRESS.
566c793af95Ssangeeta  */
567c793af95Ssangeeta int
568c793af95Ssangeeta ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
569c793af95Ssangeeta     zoneid_t zoneid)
570c793af95Ssangeeta {
571*bd670b35SErik Nordmark 	ipaddr_t nexthop;
572f4b3ec61Sdh155122 	netstack_t *ns;
573f4b3ec61Sdh155122 	ip_stack_t *ipst;
574*bd670b35SErik Nordmark 	ip_xmit_attr_t ixas;
575*bd670b35SErik Nordmark 	int error;
576c793af95Ssangeeta 
577c793af95Ssangeeta 	ASSERT(mp != NULL);
578c793af95Ssangeeta 
579f4b3ec61Sdh155122 	if (zoneid == ALL_ZONES)
580f4b3ec61Sdh155122 		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
581f4b3ec61Sdh155122 	else
582f4b3ec61Sdh155122 		ns = netstack_find_by_zoneid(zoneid);
583f4b3ec61Sdh155122 	ASSERT(ns != NULL);
584f4b3ec61Sdh155122 
585f4b3ec61Sdh155122 	/*
586f4b3ec61Sdh155122 	 * For exclusive stacks we set the zoneid to zero
587f4b3ec61Sdh155122 	 * since IP uses the global zoneid in the exclusive stacks.
588f4b3ec61Sdh155122 	 */
589f4b3ec61Sdh155122 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
590f4b3ec61Sdh155122 		zoneid = GLOBAL_ZONEID;
591f4b3ec61Sdh155122 	ipst = ns->netstack_ip;
592f4b3ec61Sdh155122 
593c793af95Ssangeeta 	ASSERT(dst_addr->sa_family == AF_INET ||
594c793af95Ssangeeta 	    dst_addr->sa_family == AF_INET6);
595c793af95Ssangeeta 
596*bd670b35SErik Nordmark 	bzero(&ixas, sizeof (ixas));
597*bd670b35SErik Nordmark 	/*
598*bd670b35SErik Nordmark 	 * No IPsec, no fragmentation, and don't let any hooks see
599*bd670b35SErik Nordmark 	 * the packet.
600*bd670b35SErik Nordmark 	 */
601*bd670b35SErik Nordmark 	ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK;
602*bd670b35SErik Nordmark 	ixas.ixa_cred = kcred;
603*bd670b35SErik Nordmark 	ixas.ixa_cpid = NOPID;
604*bd670b35SErik Nordmark 	ixas.ixa_tsl = NULL;
605*bd670b35SErik Nordmark 	ixas.ixa_ipst = ipst;
606*bd670b35SErik Nordmark 	ixas.ixa_ifindex = ifindex;
607*bd670b35SErik Nordmark 
608c793af95Ssangeeta 	if (dst_addr->sa_family == AF_INET) {
609*bd670b35SErik Nordmark 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
610*bd670b35SErik Nordmark 
611*bd670b35SErik Nordmark 		ixas.ixa_flags |= IXAF_IS_IPV4;
612*bd670b35SErik Nordmark 		nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
613*bd670b35SErik Nordmark 		if (nexthop != ipha->ipha_dst) {
614*bd670b35SErik Nordmark 			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
615*bd670b35SErik Nordmark 			ixas.ixa_nexthop_v4 = nexthop;
616*bd670b35SErik Nordmark 		}
617*bd670b35SErik Nordmark 		ixas.ixa_multicast_ttl = ipha->ipha_ttl;
618c793af95Ssangeeta 	} else {
619*bd670b35SErik Nordmark 		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
620*bd670b35SErik Nordmark 		in6_addr_t *nexthop6;
621*bd670b35SErik Nordmark 
622*bd670b35SErik Nordmark 		nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr;
623*bd670b35SErik Nordmark 		if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) {
624*bd670b35SErik Nordmark 			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
625*bd670b35SErik Nordmark 			ixas.ixa_nexthop_v6 = *nexthop6;
626c793af95Ssangeeta 		}
627*bd670b35SErik Nordmark 		ixas.ixa_multicast_ttl = ip6h->ip6_hops;
628c793af95Ssangeeta 	}
629*bd670b35SErik Nordmark 	error = ip_output_simple(mp, &ixas);
630*bd670b35SErik Nordmark 	ixa_cleanup(&ixas);
631c793af95Ssangeeta 
632f4b3ec61Sdh155122 	netstack_rele(ns);
633*bd670b35SErik Nordmark 	switch (error) {
634*bd670b35SErik Nordmark 	case 0:
635*bd670b35SErik Nordmark 		break;
636*bd670b35SErik Nordmark 
637*bd670b35SErik Nordmark 	case EHOSTUNREACH:
638*bd670b35SErik Nordmark 	case ENETUNREACH:
639*bd670b35SErik Nordmark 		error = ENONET;
640*bd670b35SErik Nordmark 		break;
641*bd670b35SErik Nordmark 
642*bd670b35SErik Nordmark 	default:
643*bd670b35SErik Nordmark 		error = ECOMM;
644*bd670b35SErik Nordmark 		break;
645c793af95Ssangeeta 	}
646*bd670b35SErik Nordmark 	return (error);
647edd26dc5Sdr146992 }
648edd26dc5Sdr146992 
649c793af95Ssangeeta /*
650c793af95Ssangeeta  * callback function provided by ire_ftable_lookup when calling
651c793af95Ssangeeta  * rn_match_args(). Invoke ire_match_args on each matching leaf node in
652c793af95Ssangeeta  * the radix tree.
653c793af95Ssangeeta  */
654c793af95Ssangeeta boolean_t
655c793af95Ssangeeta ire_find_best_route(struct radix_node *rn, void *arg)
656c793af95Ssangeeta {
657c793af95Ssangeeta 	struct rt_entry *rt = (struct rt_entry *)rn;
658c793af95Ssangeeta 	irb_t *irb_ptr;
659c793af95Ssangeeta 	ire_t *ire;
660c793af95Ssangeeta 	ire_ftable_args_t *margs = arg;
661c793af95Ssangeeta 	ipaddr_t match_mask;
662c793af95Ssangeeta 
663c793af95Ssangeeta 	ASSERT(rt != NULL);
664c793af95Ssangeeta 
665c793af95Ssangeeta 	irb_ptr = &rt->rt_irb;
666c793af95Ssangeeta 
667c793af95Ssangeeta 	if (irb_ptr->irb_ire_cnt == 0)
668c793af95Ssangeeta 		return (B_FALSE);
669c793af95Ssangeeta 
670c793af95Ssangeeta 	rw_enter(&irb_ptr->irb_lock, RW_READER);
671c793af95Ssangeeta 	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
672*bd670b35SErik Nordmark 		if (IRE_IS_CONDEMNED(ire))
673c793af95Ssangeeta 			continue;
674*bd670b35SErik Nordmark 		if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK))
675c793af95Ssangeeta 			match_mask = margs->ift_mask;
676c793af95Ssangeeta 		else
677c793af95Ssangeeta 			match_mask = ire->ire_mask;
678c793af95Ssangeeta 
679c793af95Ssangeeta 		if (ire_match_args(ire, margs->ift_addr, match_mask,
680*bd670b35SErik Nordmark 		    margs->ift_gateway, margs->ift_type, margs->ift_ill,
681*bd670b35SErik Nordmark 		    margs->ift_zoneid, margs->ift_tsl,
682*bd670b35SErik Nordmark 		    margs->ift_flags)) {
683*bd670b35SErik Nordmark 			ire_refhold(ire);
684c793af95Ssangeeta 			rw_exit(&irb_ptr->irb_lock);
685c793af95Ssangeeta 			margs->ift_best_ire = ire;
686c793af95Ssangeeta 			return (B_TRUE);
687c793af95Ssangeeta 		}
688c793af95Ssangeeta 	}
689c793af95Ssangeeta 	rw_exit(&irb_ptr->irb_lock);
690c793af95Ssangeeta 	return (B_FALSE);
691c793af95Ssangeeta }
692c793af95Ssangeeta 
693c793af95Ssangeeta /*
694c793af95Ssangeeta  * ftable irb_t structures are dynamically allocated, and we need to
695c793af95Ssangeeta  * check if the irb_t (and associated ftable tree attachment) needs to
696c793af95Ssangeeta  * be cleaned up when the irb_refcnt goes to 0. The conditions that need
697c793af95Ssangeeta  * be verified are:
698c793af95Ssangeeta  * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
699c793af95Ssangeeta  * - no other threads holding references to ire's in the bucket,
700c793af95Ssangeeta  *   i.e., irb_nire == 0
701c793af95Ssangeeta  * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
702c793af95Ssangeeta  * - need to hold the global tree lock and irb_lock in write mode.
703c793af95Ssangeeta  */
704c793af95Ssangeeta void
705c793af95Ssangeeta irb_refrele_ftable(irb_t *irb)
706c793af95Ssangeeta {
707c793af95Ssangeeta 	for (;;) {
708c793af95Ssangeeta 		rw_enter(&irb->irb_lock, RW_WRITER);
709c793af95Ssangeeta 		ASSERT(irb->irb_refcnt != 0);
710c793af95Ssangeeta 		if (irb->irb_refcnt != 1) {
711c793af95Ssangeeta 			/*
712c793af95Ssangeeta 			 * Someone has a reference to this radix node
713c793af95Ssangeeta 			 * or there is some bucket walker.
714c793af95Ssangeeta 			 */
715c793af95Ssangeeta 			irb->irb_refcnt--;
716c793af95Ssangeeta 			rw_exit(&irb->irb_lock);
717c793af95Ssangeeta 			return;
718c793af95Ssangeeta 		} else {
719c793af95Ssangeeta 			/*
720c793af95Ssangeeta 			 * There is no other walker, nor is there any
721c793af95Ssangeeta 			 * other thread that holds a direct ref to this
722c793af95Ssangeeta 			 * radix node. Do the clean up if needed. Call
723c793af95Ssangeeta 			 * to ire_unlink will clear the IRB_MARK_CONDEMNED flag
724c793af95Ssangeeta 			 */
725c793af95Ssangeeta 			if (irb->irb_marks & IRB_MARK_CONDEMNED)  {
726c793af95Ssangeeta 				ire_t *ire_list;
727c793af95Ssangeeta 
728c793af95Ssangeeta 				ire_list = ire_unlink(irb);
729c793af95Ssangeeta 				rw_exit(&irb->irb_lock);
730c793af95Ssangeeta 
731c793af95Ssangeeta 				if (ire_list != NULL)
732c793af95Ssangeeta 					ire_cleanup(ire_list);
733c793af95Ssangeeta 				/*
734c793af95Ssangeeta 				 * more CONDEMNED entries could have
735c793af95Ssangeeta 				 * been added while we dropped the lock,
736c793af95Ssangeeta 				 * so we have to re-check.
737c793af95Ssangeeta 				 */
738c793af95Ssangeeta 				continue;
739c793af95Ssangeeta 			}
740c793af95Ssangeeta 
741c793af95Ssangeeta 			/*
742c793af95Ssangeeta 			 * Now check if there are still any ires
743c793af95Ssangeeta 			 * associated with this radix node.
744c793af95Ssangeeta 			 */
745c793af95Ssangeeta 			if (irb->irb_nire != 0) {
746c793af95Ssangeeta 				/*
747c793af95Ssangeeta 				 * someone is still holding on
748c793af95Ssangeeta 				 * to ires in this bucket
749c793af95Ssangeeta 				 */
750c793af95Ssangeeta 				irb->irb_refcnt--;
751c793af95Ssangeeta 				rw_exit(&irb->irb_lock);
752c793af95Ssangeeta 				return;
753c793af95Ssangeeta 			} else {
754c793af95Ssangeeta 				/*
755c793af95Ssangeeta 				 * Everything is clear. Zero walkers,
756c793af95Ssangeeta 				 * Zero threads with a ref to this
757c793af95Ssangeeta 				 * radix node, Zero ires associated with
758c793af95Ssangeeta 				 * this radix node. Due to lock order,
759c793af95Ssangeeta 				 * check the above conditions again
760c793af95Ssangeeta 				 * after grabbing all locks in the right order
761c793af95Ssangeeta 				 */
762c793af95Ssangeeta 				rw_exit(&irb->irb_lock);
763c793af95Ssangeeta 				if (irb_inactive(irb))
764c793af95Ssangeeta 					return;
765c793af95Ssangeeta 				/*
766c793af95Ssangeeta 				 * irb_inactive could not free the irb.
767c793af95Ssangeeta 				 * See if there are any walkers, if not
768c793af95Ssangeeta 				 * try to clean up again.
769c793af95Ssangeeta 				 */
770c793af95Ssangeeta 			}
771c793af95Ssangeeta 		}
772c793af95Ssangeeta 	}
773c793af95Ssangeeta }
774c793af95Ssangeeta 
775c793af95Ssangeeta /*
776*bd670b35SErik Nordmark  * IRE iterator used by ire_ftable_lookup to process multiple equal
777*bd670b35SErik Nordmark  * routes. Given a starting point in the hash list (hash), walk the IREs
778*bd670b35SErik Nordmark  * in the bucket skipping deleted entries. We treat the bucket as a circular
779*bd670b35SErik Nordmark  * list for the purposes of walking it.
780*bd670b35SErik Nordmark  * Returns the IRE (held) that corresponds to the hash value. If that IRE is
781*bd670b35SErik Nordmark  * not applicable (ire_match_args failed) then it returns a subsequent one.
782*bd670b35SErik Nordmark  * If we fail to find an IRE we return NULL.
783c793af95Ssangeeta  *
784*bd670b35SErik Nordmark  * Assumes that the caller holds a reference on the IRE bucket and a read lock
785*bd670b35SErik Nordmark  * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
786*bd670b35SErik Nordmark  *
787*bd670b35SErik Nordmark  * Applies to IPv4 and IPv6.
788*bd670b35SErik Nordmark  *
789*bd670b35SErik Nordmark  * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same
790*bd670b35SErik Nordmark  * address and bucket, we compare against ire_type for the orig_ire. We also
791*bd670b35SErik Nordmark  * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being
792*bd670b35SErik Nordmark  * first in the bucket. Thus we compare that ire_flags match the orig_ire.
793*bd670b35SErik Nordmark  *
794*bd670b35SErik Nordmark  * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is
795*bd670b35SErik Nordmark  * reachable from the zone i.e., that the ire_gateway_addr is in a subnet
796*bd670b35SErik Nordmark  * in which the zone has an IP address. We check this for the global zone
797*bd670b35SErik Nordmark  * even if no shared-IP zones are configured.
798c793af95Ssangeeta  */
799c793af95Ssangeeta ire_t *
800*bd670b35SErik Nordmark ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash,
801*bd670b35SErik Nordmark     ire_t *orig_ire, ip_stack_t *ipst)
802c793af95Ssangeeta {
803c793af95Ssangeeta 	ire_t		*ire, *maybe_ire = NULL;
804*bd670b35SErik Nordmark 	uint_t		maybe_badcnt;
805*bd670b35SErik Nordmark 	uint_t		maxwalk;
806*bd670b35SErik Nordmark 
807*bd670b35SErik Nordmark 	/* Fold in more bits from the hint/hash */
808*bd670b35SErik Nordmark 	hash = hash ^ (hash >> 8) ^ (hash >> 16);
809c793af95Ssangeeta 
810c793af95Ssangeeta 	rw_enter(&irb_ptr->irb_lock, RW_WRITER);
811*bd670b35SErik Nordmark 	maxwalk = irb_ptr->irb_ire_cnt;	/* Excludes condemned */
812*bd670b35SErik Nordmark 	hash %= maxwalk;
813*bd670b35SErik Nordmark 	irb_refhold_locked(irb_ptr);
814c793af95Ssangeeta 	rw_exit(&irb_ptr->irb_lock);
815c793af95Ssangeeta 
816c793af95Ssangeeta 	/*
817c793af95Ssangeeta 	 * Round-robin the routers list looking for a route that
818c793af95Ssangeeta 	 * matches the passed in parameters.
819*bd670b35SErik Nordmark 	 * First we skip "hash" number of non-condemned IREs.
820*bd670b35SErik Nordmark 	 * Then we match the IRE.
821*bd670b35SErik Nordmark 	 * If we find an ire which has a non-zero ire_badcnt then we remember
822*bd670b35SErik Nordmark 	 * it and keep on looking for a lower ire_badcnt.
823*bd670b35SErik Nordmark 	 * If we come to the end of the list we continue (treat the
824*bd670b35SErik Nordmark 	 * bucket list as a circular list) but we match less than "max"
825*bd670b35SErik Nordmark 	 * entries.
826c793af95Ssangeeta 	 */
827*bd670b35SErik Nordmark 	ire = irb_ptr->irb_ire;
828*bd670b35SErik Nordmark 	while (maxwalk > 0) {
829*bd670b35SErik Nordmark 		if (IRE_IS_CONDEMNED(ire))
830*bd670b35SErik Nordmark 			goto next_ire_skip;
831c793af95Ssangeeta 
832*bd670b35SErik Nordmark 		/* Skip the first "hash" entries to do ECMP */
833*bd670b35SErik Nordmark 		if (hash != 0) {
834*bd670b35SErik Nordmark 			hash--;
835*bd670b35SErik Nordmark 			goto next_ire_skip;
836*bd670b35SErik Nordmark 		}
837*bd670b35SErik Nordmark 
838*bd670b35SErik Nordmark 		/* See CGTP comment above */
839*bd670b35SErik Nordmark 		if (ire->ire_type != orig_ire->ire_type ||
840*bd670b35SErik Nordmark 		    ire->ire_flags != orig_ire->ire_flags)
841c793af95Ssangeeta 			goto next_ire;
842c793af95Ssangeeta 
843c793af95Ssangeeta 		/*
844*bd670b35SErik Nordmark 		 * Note: Since IPv6 has hash buckets instead of radix
845*bd670b35SErik Nordmark 		 * buckers we need to explicitly compare the addresses.
846*bd670b35SErik Nordmark 		 * That makes this less efficient since we will be called
847*bd670b35SErik Nordmark 		 * even if there is no alternatives just because the
848*bd670b35SErik Nordmark 		 * bucket has multiple IREs for different addresses.
849c793af95Ssangeeta 		 */
850*bd670b35SErik Nordmark 		if (ire->ire_ipversion == IPV6_VERSION) {
851*bd670b35SErik Nordmark 			if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6,
852*bd670b35SErik Nordmark 			    &ire->ire_addr_v6))
853c793af95Ssangeeta 				goto next_ire;
854c793af95Ssangeeta 		}
855c793af95Ssangeeta 
856c793af95Ssangeeta 		/*
857*bd670b35SErik Nordmark 		 * For some reason find_best_route uses ire_mask. We do
858*bd670b35SErik Nordmark 		 * the same.
859*bd670b35SErik Nordmark 		 */
860*bd670b35SErik Nordmark 		if (ire->ire_ipversion == IPV4_VERSION ?
861*bd670b35SErik Nordmark 		    !ire_match_args(ire, margs->ift_addr,
862*bd670b35SErik Nordmark 		    ire->ire_mask, margs->ift_gateway,
863*bd670b35SErik Nordmark 		    margs->ift_type, margs->ift_ill, margs->ift_zoneid,
864*bd670b35SErik Nordmark 		    margs->ift_tsl, margs->ift_flags) :
865*bd670b35SErik Nordmark 		    !ire_match_args_v6(ire, &margs->ift_addr_v6,
866*bd670b35SErik Nordmark 		    &ire->ire_mask_v6, &margs->ift_gateway_v6,
867*bd670b35SErik Nordmark 		    margs->ift_type, margs->ift_ill, margs->ift_zoneid,
868*bd670b35SErik Nordmark 		    margs->ift_tsl, margs->ift_flags))
869*bd670b35SErik Nordmark 			goto next_ire;
870*bd670b35SErik Nordmark 
871*bd670b35SErik Nordmark 		if (margs->ift_zoneid != ALL_ZONES &&
872*bd670b35SErik Nordmark 		    (ire->ire_type & IRE_OFFLINK)) {
873*bd670b35SErik Nordmark 			/*
874*bd670b35SErik Nordmark 			 * When we're in a zone, we're only
875c793af95Ssangeeta 			 * interested in routers that are
876c793af95Ssangeeta 			 * reachable through ipifs within our zone.
877c793af95Ssangeeta 			 */
878*bd670b35SErik Nordmark 			if (ire->ire_ipversion == IPV4_VERSION) {
879*bd670b35SErik Nordmark 				if (!ire_gateway_ok_zone_v4(
880*bd670b35SErik Nordmark 				    ire->ire_gateway_addr, margs->ift_zoneid,
881*bd670b35SErik Nordmark 				    ire->ire_ill, margs->ift_tsl, ipst,
882*bd670b35SErik Nordmark 				    B_TRUE))
883*bd670b35SErik Nordmark 					goto next_ire;
884*bd670b35SErik Nordmark 			} else {
885*bd670b35SErik Nordmark 				if (!ire_gateway_ok_zone_v6(
886*bd670b35SErik Nordmark 				    &ire->ire_gateway_addr_v6,
887*bd670b35SErik Nordmark 				    margs->ift_zoneid, ire->ire_ill,
888*bd670b35SErik Nordmark 				    margs->ift_tsl, ipst, B_TRUE))
889*bd670b35SErik Nordmark 					goto next_ire;
890*bd670b35SErik Nordmark 			}
891*bd670b35SErik Nordmark 		}
892*bd670b35SErik Nordmark 		mutex_enter(&ire->ire_lock);
893*bd670b35SErik Nordmark 		/* Look for stale ire_badcnt and clear */
894*bd670b35SErik Nordmark 		if (ire->ire_badcnt != 0 &&
895*bd670b35SErik Nordmark 		    (TICK_TO_SEC(lbolt64) - ire->ire_last_badcnt >
896*bd670b35SErik Nordmark 		    ipst->ips_ip_ire_badcnt_lifetime))
897*bd670b35SErik Nordmark 			ire->ire_badcnt = 0;
898*bd670b35SErik Nordmark 		mutex_exit(&ire->ire_lock);
899e11c3f44Smeem 
900*bd670b35SErik Nordmark 		if (ire->ire_badcnt == 0) {
901*bd670b35SErik Nordmark 			/* We found one with a zero badcnt; done */
902*bd670b35SErik Nordmark 			ire_refhold(ire);
903*bd670b35SErik Nordmark 			/*
904*bd670b35SErik Nordmark 			 * Care needed since irb_refrele grabs WLOCK to free
905*bd670b35SErik Nordmark 			 * the irb_t.
906*bd670b35SErik Nordmark 			 */
907*bd670b35SErik Nordmark 			if (ire->ire_ipversion == IPV4_VERSION) {
908*bd670b35SErik Nordmark 				RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
909*bd670b35SErik Nordmark 				irb_refrele(irb_ptr);
910*bd670b35SErik Nordmark 				RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
911*bd670b35SErik Nordmark 			} else {
912*bd670b35SErik Nordmark 				rw_exit(&ipst->ips_ip6_ire_head_lock);
913*bd670b35SErik Nordmark 				irb_refrele(irb_ptr);
914*bd670b35SErik Nordmark 				rw_enter(&ipst->ips_ip6_ire_head_lock,
915*bd670b35SErik Nordmark 				    RW_READER);
916*bd670b35SErik Nordmark 			}
917c793af95Ssangeeta 			return (ire);
918c793af95Ssangeeta 		}
919*bd670b35SErik Nordmark 		/*
920*bd670b35SErik Nordmark 		 * keep looking to see if there is a better (lower
921*bd670b35SErik Nordmark 		 * badcnt) matching IRE, but save this one as a last resort.
922*bd670b35SErik Nordmark 		 * If we find a lower badcnt pick that one as the last* resort.
923*bd670b35SErik Nordmark 		 */
924*bd670b35SErik Nordmark 		if (maybe_ire == NULL) {
925*bd670b35SErik Nordmark 			maybe_ire = ire;
926*bd670b35SErik Nordmark 			maybe_badcnt = ire->ire_badcnt;
927*bd670b35SErik Nordmark 		} else if (ire->ire_badcnt < maybe_badcnt) {
928*bd670b35SErik Nordmark 			maybe_ire = ire;
929*bd670b35SErik Nordmark 			maybe_badcnt = ire->ire_badcnt;
930*bd670b35SErik Nordmark 		}
931*bd670b35SErik Nordmark 
932c793af95Ssangeeta next_ire:
933*bd670b35SErik Nordmark 		maxwalk--;
934*bd670b35SErik Nordmark next_ire_skip:
935*bd670b35SErik Nordmark 		ire = ire->ire_next;
936*bd670b35SErik Nordmark 		if (ire == NULL)
937*bd670b35SErik Nordmark 			ire = irb_ptr->irb_ire;
938c793af95Ssangeeta 	}
939c793af95Ssangeeta 	if (maybe_ire != NULL)
940*bd670b35SErik Nordmark 		ire_refhold(maybe_ire);
941*bd670b35SErik Nordmark 
942*bd670b35SErik Nordmark 	/* Care needed since irb_refrele grabs WLOCK to free the irb_t. */
943*bd670b35SErik Nordmark 	if (ire->ire_ipversion == IPV4_VERSION) {
944*bd670b35SErik Nordmark 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
945*bd670b35SErik Nordmark 		irb_refrele(irb_ptr);
946*bd670b35SErik Nordmark 		RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
947*bd670b35SErik Nordmark 	} else {
948*bd670b35SErik Nordmark 		rw_exit(&ipst->ips_ip6_ire_head_lock);
949*bd670b35SErik Nordmark 		irb_refrele(irb_ptr);
950*bd670b35SErik Nordmark 		rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
951*bd670b35SErik Nordmark 	}
952c793af95Ssangeeta 	return (maybe_ire);
953c793af95Ssangeeta }
9542679e103Ssowmini 
9552679e103Ssowmini void
9562679e103Ssowmini irb_refhold_rn(struct radix_node *rn)
9572679e103Ssowmini {
9582679e103Ssowmini 	if ((rn->rn_flags & RNF_ROOT) == 0)
959*bd670b35SErik Nordmark 		irb_refhold(&((rt_t *)(rn))->rt_irb);
9602679e103Ssowmini }
9612679e103Ssowmini 
9622679e103Ssowmini void
9632679e103Ssowmini irb_refrele_rn(struct radix_node *rn)
9642679e103Ssowmini {
9652679e103Ssowmini 	if ((rn->rn_flags & RNF_ROOT) == 0)
9662679e103Ssowmini 		irb_refrele_ftable(&((rt_t *)(rn))->rt_irb);
9672679e103Ssowmini }
968*bd670b35SErik Nordmark 
969*bd670b35SErik Nordmark /*
970*bd670b35SErik Nordmark  * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
971*bd670b35SErik Nordmark  * routes this routine sets up a ire_nce_cache as well. The caller needs to
972*bd670b35SErik Nordmark  * lookup an nce for the multicast case.
973*bd670b35SErik Nordmark  */
974*bd670b35SErik Nordmark ire_t *
975*bd670b35SErik Nordmark ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa,
976*bd670b35SErik Nordmark     uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp)
977*bd670b35SErik Nordmark {
978*bd670b35SErik Nordmark 	uint_t		match_args;
979*bd670b35SErik Nordmark 	uint_t		ire_type;
980*bd670b35SErik Nordmark 	ill_t		*ill;
981*bd670b35SErik Nordmark 	ire_t		*ire;
982*bd670b35SErik Nordmark 	ip_stack_t	*ipst = ixa->ixa_ipst;
983*bd670b35SErik Nordmark 	ipaddr_t	v4dst;
984*bd670b35SErik Nordmark 	in6_addr_t	v6nexthop;
985*bd670b35SErik Nordmark 	iaflags_t	ixaflags = ixa->ixa_flags;
986*bd670b35SErik Nordmark 	nce_t		*nce;
987*bd670b35SErik Nordmark 
988*bd670b35SErik Nordmark 	match_args = MATCH_IRE_SECATTR;
989*bd670b35SErik Nordmark 	IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
990*bd670b35SErik Nordmark 	if (setsrcp != NULL)
991*bd670b35SErik Nordmark 		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
992*bd670b35SErik Nordmark 	if (errorp != NULL)
993*bd670b35SErik Nordmark 		ASSERT(*errorp == 0);
994*bd670b35SErik Nordmark 
995*bd670b35SErik Nordmark 	/*
996*bd670b35SErik Nordmark 	 * The content of the ixa will be different if IP_NEXTHOP,
997*bd670b35SErik Nordmark 	 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
998*bd670b35SErik Nordmark 	 */
999*bd670b35SErik Nordmark 
1000*bd670b35SErik Nordmark 	if ((ixaflags & IXAF_IS_IPV4) ? CLASSD(v4dst) :
1001*bd670b35SErik Nordmark 	    IN6_IS_ADDR_MULTICAST(v6dst)) {
1002*bd670b35SErik Nordmark 		/* Pick up the IRE_MULTICAST for the ill */
1003*bd670b35SErik Nordmark 		if (ixa->ixa_multicast_ifindex != 0) {
1004*bd670b35SErik Nordmark 			ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
1005*bd670b35SErik Nordmark 			    !(ixaflags & IXAF_IS_IPV4), ipst);
1006*bd670b35SErik Nordmark 		} else if (ixaflags & IXAF_SCOPEID_SET) {
1007*bd670b35SErik Nordmark 			/* sin6_scope_id takes precedence over ixa_ifindex */
1008*bd670b35SErik Nordmark 			ASSERT(ixa->ixa_scopeid != 0);
1009*bd670b35SErik Nordmark 			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
1010*bd670b35SErik Nordmark 			    !(ixaflags & IXAF_IS_IPV4), ipst);
1011*bd670b35SErik Nordmark 		} else if (ixa->ixa_ifindex != 0) {
1012*bd670b35SErik Nordmark 			/*
1013*bd670b35SErik Nordmark 			 * In the ipmp case, the ixa_ifindex is set to
1014*bd670b35SErik Nordmark 			 * point at an under_ill and we would return the
1015*bd670b35SErik Nordmark 			 * ire_multicast() corresponding to that under_ill.
1016*bd670b35SErik Nordmark 			 */
1017*bd670b35SErik Nordmark 			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
1018*bd670b35SErik Nordmark 			    !(ixaflags & IXAF_IS_IPV4), ipst);
1019*bd670b35SErik Nordmark 		} else if (ixaflags & IXAF_IS_IPV4) {
1020*bd670b35SErik Nordmark 			ipaddr_t	v4setsrc = INADDR_ANY;
1021*bd670b35SErik Nordmark 
1022*bd670b35SErik Nordmark 			ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst,
1023*bd670b35SErik Nordmark 			    multirtp, &v4setsrc);
1024*bd670b35SErik Nordmark 			if (setsrcp != NULL)
1025*bd670b35SErik Nordmark 				IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
1026*bd670b35SErik Nordmark 		} else {
1027*bd670b35SErik Nordmark 			ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst,
1028*bd670b35SErik Nordmark 			    multirtp, setsrcp);
1029*bd670b35SErik Nordmark 		}
1030*bd670b35SErik Nordmark 		if (ill != NULL && IS_VNI(ill)) {
1031*bd670b35SErik Nordmark 			ill_refrele(ill);
1032*bd670b35SErik Nordmark 			ill = NULL;
1033*bd670b35SErik Nordmark 		}
1034*bd670b35SErik Nordmark 		if (ill == NULL) {
1035*bd670b35SErik Nordmark 			if (errorp != NULL)
1036*bd670b35SErik Nordmark 				*errorp = ENXIO;
1037*bd670b35SErik Nordmark 			/* Get a hold on the IRE_NOROUTE */
1038*bd670b35SErik Nordmark 			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
1039*bd670b35SErik Nordmark 			return (ire);
1040*bd670b35SErik Nordmark 		}
1041*bd670b35SErik Nordmark 		if (!(ill->ill_flags & ILLF_MULTICAST)) {
1042*bd670b35SErik Nordmark 			ill_refrele(ill);
1043*bd670b35SErik Nordmark 			if (errorp != NULL)
1044*bd670b35SErik Nordmark 				*errorp = EHOSTUNREACH;
1045*bd670b35SErik Nordmark 			/* Get a hold on the IRE_NOROUTE */
1046*bd670b35SErik Nordmark 			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
1047*bd670b35SErik Nordmark 			return (ire);
1048*bd670b35SErik Nordmark 		}
1049*bd670b35SErik Nordmark 		/* Get a refcnt on the single IRE_MULTICAST per ill */
1050*bd670b35SErik Nordmark 		ire = ire_multicast(ill);
1051*bd670b35SErik Nordmark 		ill_refrele(ill);
1052*bd670b35SErik Nordmark 		if (generationp != NULL)
1053*bd670b35SErik Nordmark 			*generationp = ire->ire_generation;
1054*bd670b35SErik Nordmark 		if (errorp != NULL &&
1055*bd670b35SErik Nordmark 		    (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
1056*bd670b35SErik Nordmark 			*errorp = EHOSTUNREACH;
1057*bd670b35SErik Nordmark 		}
1058*bd670b35SErik Nordmark 		return (ire);
1059*bd670b35SErik Nordmark 	}
1060*bd670b35SErik Nordmark 
1061*bd670b35SErik Nordmark 	if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
1062*bd670b35SErik Nordmark 		if (ixaflags & IXAF_SCOPEID_SET) {
1063*bd670b35SErik Nordmark 			/* sin6_scope_id takes precedence over ixa_ifindex */
1064*bd670b35SErik Nordmark 			ASSERT(ixa->ixa_scopeid != 0);
1065*bd670b35SErik Nordmark 			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
1066*bd670b35SErik Nordmark 			    !(ixaflags & IXAF_IS_IPV4), ipst);
1067*bd670b35SErik Nordmark 		} else {
1068*bd670b35SErik Nordmark 			ASSERT(ixa->ixa_ifindex != 0);
1069*bd670b35SErik Nordmark 			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
1070*bd670b35SErik Nordmark 			    !(ixaflags & IXAF_IS_IPV4), ipst);
1071*bd670b35SErik Nordmark 		}
1072*bd670b35SErik Nordmark 		if (ill != NULL && IS_VNI(ill)) {
1073*bd670b35SErik Nordmark 			ill_refrele(ill);
1074*bd670b35SErik Nordmark 			ill = NULL;
1075*bd670b35SErik Nordmark 		}
1076*bd670b35SErik Nordmark 		if (ill == NULL) {
1077*bd670b35SErik Nordmark 			if (errorp != NULL)
1078*bd670b35SErik Nordmark 				*errorp = ENXIO;
1079*bd670b35SErik Nordmark 			/* Get a hold on the IRE_NOROUTE */
1080*bd670b35SErik Nordmark 			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
1081*bd670b35SErik Nordmark 			return (ire);
1082*bd670b35SErik Nordmark 		}
1083*bd670b35SErik Nordmark 		/*
1084*bd670b35SErik Nordmark 		 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
1085*bd670b35SErik Nordmark 		 * so for both of them we need to be able look for an under
1086*bd670b35SErik Nordmark 		 * interface.
1087*bd670b35SErik Nordmark 		 */
1088*bd670b35SErik Nordmark 		if (IS_UNDER_IPMP(ill))
1089*bd670b35SErik Nordmark 			match_args |= MATCH_IRE_TESTHIDDEN;
1090*bd670b35SErik Nordmark 	} else {
1091*bd670b35SErik Nordmark 		ill = NULL;
1092*bd670b35SErik Nordmark 	}
1093*bd670b35SErik Nordmark 
1094*bd670b35SErik Nordmark 	if (ixaflags & IXAF_NEXTHOP_SET) {
1095*bd670b35SErik Nordmark 		/* IP_NEXTHOP was set */
1096*bd670b35SErik Nordmark 		v6nexthop = ixa->ixa_nexthop_v6;
1097*bd670b35SErik Nordmark 	} else {
1098*bd670b35SErik Nordmark 		v6nexthop = *v6dst;
1099*bd670b35SErik Nordmark 	}
1100*bd670b35SErik Nordmark 
1101*bd670b35SErik Nordmark 	ire_type = 0;
1102*bd670b35SErik Nordmark 	/* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */
1103*bd670b35SErik Nordmark 
1104*bd670b35SErik Nordmark 	/*
1105*bd670b35SErik Nordmark 	 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
1106*bd670b35SErik Nordmark 	 * we only look for an onlink IRE.
1107*bd670b35SErik Nordmark 	 */
1108*bd670b35SErik Nordmark 	if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) {
1109*bd670b35SErik Nordmark 		match_args |= MATCH_IRE_TYPE;
1110*bd670b35SErik Nordmark 		ire_type = IRE_ONLINK;
1111*bd670b35SErik Nordmark 	}
1112*bd670b35SErik Nordmark 
1113*bd670b35SErik Nordmark 	if (ixaflags & IXAF_IS_IPV4) {
1114*bd670b35SErik Nordmark 		ipaddr_t	v4nexthop;
1115*bd670b35SErik Nordmark 		ipaddr_t	v4setsrc = INADDR_ANY;
1116*bd670b35SErik Nordmark 
1117*bd670b35SErik Nordmark 		IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop);
1118*bd670b35SErik Nordmark 		ire = ire_route_recursive_v4(v4nexthop, ire_type, ill,
1119*bd670b35SErik Nordmark 		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE,
1120*bd670b35SErik Nordmark 		    ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp);
1121*bd670b35SErik Nordmark 		if (setsrcp != NULL)
1122*bd670b35SErik Nordmark 			IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
1123*bd670b35SErik Nordmark 	} else {
1124*bd670b35SErik Nordmark 		ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill,
1125*bd670b35SErik Nordmark 		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE,
1126*bd670b35SErik Nordmark 		    ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp);
1127*bd670b35SErik Nordmark 	}
1128*bd670b35SErik Nordmark 
1129*bd670b35SErik Nordmark #ifdef DEBUG
1130*bd670b35SErik Nordmark 	if (match_args & MATCH_IRE_TESTHIDDEN) {
1131*bd670b35SErik Nordmark 		ip3dbg(("looking for hidden; dst %x ire %p\n",
1132*bd670b35SErik Nordmark 		    v4dst, (void *)ire));
1133*bd670b35SErik Nordmark 	}
1134*bd670b35SErik Nordmark #endif
1135*bd670b35SErik Nordmark 
1136*bd670b35SErik Nordmark 	if (ill != NULL)
1137*bd670b35SErik Nordmark 		ill_refrele(ill);
1138*bd670b35SErik Nordmark 
1139*bd670b35SErik Nordmark 	if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1140*bd670b35SErik Nordmark 	    (ire->ire_type & IRE_MULTICAST)) {
1141*bd670b35SErik Nordmark 		/* No ire_nce_cache */
1142*bd670b35SErik Nordmark 		return (ire);
1143*bd670b35SErik Nordmark 	}
1144*bd670b35SErik Nordmark 
1145*bd670b35SErik Nordmark 	/* Setup ire_nce_cache if it doesn't exist or is condemned. */
1146*bd670b35SErik Nordmark 	mutex_enter(&ire->ire_lock);
1147*bd670b35SErik Nordmark 	nce = ire->ire_nce_cache;
1148*bd670b35SErik Nordmark 	if (nce == NULL || nce->nce_is_condemned) {
1149*bd670b35SErik Nordmark 		mutex_exit(&ire->ire_lock);
1150*bd670b35SErik Nordmark 		(void) ire_revalidate_nce(ire);
1151*bd670b35SErik Nordmark 	} else {
1152*bd670b35SErik Nordmark 		mutex_exit(&ire->ire_lock);
1153*bd670b35SErik Nordmark 	}
1154*bd670b35SErik Nordmark 	return (ire);
1155*bd670b35SErik Nordmark }
1156*bd670b35SErik Nordmark 
1157*bd670b35SErik Nordmark /*
1158*bd670b35SErik Nordmark  * Find a route given some xmit attributes and a packet.
1159*bd670b35SErik Nordmark  * Generic for IPv4 and IPv6
1160*bd670b35SErik Nordmark  *
1161*bd670b35SErik Nordmark  * This never returns NULL. But when it returns the IRE_NOROUTE
1162*bd670b35SErik Nordmark  * it might set errorp.
1163*bd670b35SErik Nordmark  */
1164*bd670b35SErik Nordmark ire_t *
1165*bd670b35SErik Nordmark ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp,
1166*bd670b35SErik Nordmark     int *errorp, boolean_t *multirtp)
1167*bd670b35SErik Nordmark {
1168*bd670b35SErik Nordmark 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
1169*bd670b35SErik Nordmark 		ipha_t		*ipha = (ipha_t *)mp->b_rptr;
1170*bd670b35SErik Nordmark 		in6_addr_t	v6dst;
1171*bd670b35SErik Nordmark 
1172*bd670b35SErik Nordmark 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
1173*bd670b35SErik Nordmark 
1174*bd670b35SErik Nordmark 		return (ip_select_route(&v6dst, ixa, generationp,
1175*bd670b35SErik Nordmark 		    NULL, errorp, multirtp));
1176*bd670b35SErik Nordmark 	} else {
1177*bd670b35SErik Nordmark 		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
1178*bd670b35SErik Nordmark 
1179*bd670b35SErik Nordmark 		return (ip_select_route(&ip6h->ip6_dst, ixa, generationp,
1180*bd670b35SErik Nordmark 		    NULL, errorp, multirtp));
1181*bd670b35SErik Nordmark 	}
1182*bd670b35SErik Nordmark }
1183*bd670b35SErik Nordmark 
1184*bd670b35SErik Nordmark ire_t *
1185*bd670b35SErik Nordmark ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp,
1186*bd670b35SErik Nordmark     ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
1187*bd670b35SErik Nordmark {
1188*bd670b35SErik Nordmark 	in6_addr_t	v6dst;
1189*bd670b35SErik Nordmark 	ire_t		*ire;
1190*bd670b35SErik Nordmark 	in6_addr_t	setsrc;
1191*bd670b35SErik Nordmark 
1192*bd670b35SErik Nordmark 	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
1193*bd670b35SErik Nordmark 
1194*bd670b35SErik Nordmark 	IN6_IPADDR_TO_V4MAPPED(dst, &v6dst);
1195*bd670b35SErik Nordmark 
1196*bd670b35SErik Nordmark 	setsrc = ipv6_all_zeros;
1197*bd670b35SErik Nordmark 	ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp,
1198*bd670b35SErik Nordmark 	    multirtp);
1199*bd670b35SErik Nordmark 	if (v4setsrcp != NULL)
1200*bd670b35SErik Nordmark 		IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp);
1201*bd670b35SErik Nordmark 	return (ire);
1202*bd670b35SErik Nordmark }
1203*bd670b35SErik Nordmark 
1204*bd670b35SErik Nordmark /*
1205*bd670b35SErik Nordmark  * Recursively look for a route to the destination. Can also match on
1206*bd670b35SErik Nordmark  * the zoneid, ill, and label. Used for the data paths. See also
1207*bd670b35SErik Nordmark  * ire_route_recursive.
1208*bd670b35SErik Nordmark  *
1209*bd670b35SErik Nordmark  * If ill is set this means we will match it by adding MATCH_IRE_ILL.
1210*bd670b35SErik Nordmark  *
1211*bd670b35SErik Nordmark  * Note that this function never returns NULL. It returns an IRE_NOROUTE
1212*bd670b35SErik Nordmark  * instead.
1213*bd670b35SErik Nordmark  *
1214*bd670b35SErik Nordmark  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1215*bd670b35SErik Nordmark  * is an error.
1216*bd670b35SErik Nordmark  * Allow at most one RTF_INDIRECT.
1217*bd670b35SErik Nordmark  */
1218*bd670b35SErik Nordmark ire_t *
1219*bd670b35SErik Nordmark ire_route_recursive_impl_v4(ire_t *ire,
1220*bd670b35SErik Nordmark     ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
1221*bd670b35SErik Nordmark     zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1222*bd670b35SErik Nordmark     boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
1223*bd670b35SErik Nordmark     tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1224*bd670b35SErik Nordmark {
1225*bd670b35SErik Nordmark 	int		i, j;
1226*bd670b35SErik Nordmark 	ire_t		*ires[MAX_IRE_RECURSION];
1227*bd670b35SErik Nordmark 	uint_t		generation;
1228*bd670b35SErik Nordmark 	uint_t		generations[MAX_IRE_RECURSION];
1229*bd670b35SErik Nordmark 	boolean_t	need_refrele = B_FALSE;
1230*bd670b35SErik Nordmark 	boolean_t	invalidate = B_FALSE;
1231*bd670b35SErik Nordmark 	int		prefs[MAX_IRE_RECURSION];
1232*bd670b35SErik Nordmark 	ill_t		*ill = NULL;
1233*bd670b35SErik Nordmark 
1234*bd670b35SErik Nordmark 	if (setsrcp != NULL)
1235*bd670b35SErik Nordmark 		ASSERT(*setsrcp == INADDR_ANY);
1236*bd670b35SErik Nordmark 	if (gwattrp != NULL)
1237*bd670b35SErik Nordmark 		ASSERT(*gwattrp == NULL);
1238*bd670b35SErik Nordmark 
1239*bd670b35SErik Nordmark 	if (ill_arg != NULL)
1240*bd670b35SErik Nordmark 		match_args |= MATCH_IRE_ILL;
1241*bd670b35SErik Nordmark 
1242*bd670b35SErik Nordmark 	/*
1243*bd670b35SErik Nordmark 	 * We iterate up to three times to resolve a route, even though
1244*bd670b35SErik Nordmark 	 * we have four slots in the array. The extra slot is for an
1245*bd670b35SErik Nordmark 	 * IRE_IF_CLONE we might need to create.
1246*bd670b35SErik Nordmark 	 */
1247*bd670b35SErik Nordmark 	i = 0;
1248*bd670b35SErik Nordmark 	while (i < MAX_IRE_RECURSION - 1) {
1249*bd670b35SErik Nordmark 		/* ire_ftable_lookup handles round-robin/ECMP */
1250*bd670b35SErik Nordmark 		if (ire == NULL) {
1251*bd670b35SErik Nordmark 			ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
1252*bd670b35SErik Nordmark 			    (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
1253*bd670b35SErik Nordmark 			    match_args, xmit_hint, ipst, &generation);
1254*bd670b35SErik Nordmark 		} else {
1255*bd670b35SErik Nordmark 			/* Caller passed it; extra hold since we will rele */
1256*bd670b35SErik Nordmark 			ire_refhold(ire);
1257*bd670b35SErik Nordmark 			if (generationp != NULL)
1258*bd670b35SErik Nordmark 				generation = *generationp;
1259*bd670b35SErik Nordmark 			else
1260*bd670b35SErik Nordmark 				generation = IRE_GENERATION_VERIFY;
1261*bd670b35SErik Nordmark 		}
1262*bd670b35SErik Nordmark 		if (ire == NULL)
1263*bd670b35SErik Nordmark 			ire = ire_reject(ipst, B_FALSE);
1264*bd670b35SErik Nordmark 
1265*bd670b35SErik Nordmark 		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
1266*bd670b35SErik Nordmark 		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
1267*bd670b35SErik Nordmark 			goto error;
1268*bd670b35SErik Nordmark 
1269*bd670b35SErik Nordmark 		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
1270*bd670b35SErik Nordmark 
1271*bd670b35SErik Nordmark 		prefs[i] = ire_pref(ire);
1272*bd670b35SErik Nordmark 		if (i != 0) {
1273*bd670b35SErik Nordmark 			/*
1274*bd670b35SErik Nordmark 			 * Don't allow anything unusual past the first
1275*bd670b35SErik Nordmark 			 * iteration.
1276*bd670b35SErik Nordmark 			 */
1277*bd670b35SErik Nordmark 			if ((ire->ire_type &
1278*bd670b35SErik Nordmark 			    (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
1279*bd670b35SErik Nordmark 			    prefs[i] <= prefs[i-1]) {
1280*bd670b35SErik Nordmark 				ire_refrele(ire);
1281*bd670b35SErik Nordmark 				ire = ire_reject(ipst, B_FALSE);
1282*bd670b35SErik Nordmark 				goto error;
1283*bd670b35SErik Nordmark 			}
1284*bd670b35SErik Nordmark 		}
1285*bd670b35SErik Nordmark 		/* We have a usable IRE */
1286*bd670b35SErik Nordmark 		ires[i] = ire;
1287*bd670b35SErik Nordmark 		generations[i] = generation;
1288*bd670b35SErik Nordmark 		i++;
1289*bd670b35SErik Nordmark 
1290*bd670b35SErik Nordmark 		/* The first RTF_SETSRC address is passed back if setsrcp */
1291*bd670b35SErik Nordmark 		if ((ire->ire_flags & RTF_SETSRC) &&
1292*bd670b35SErik Nordmark 		    setsrcp != NULL && *setsrcp == INADDR_ANY) {
1293*bd670b35SErik Nordmark 			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
1294*bd670b35SErik Nordmark 			*setsrcp = ire->ire_setsrc_addr;
1295*bd670b35SErik Nordmark 		}
1296*bd670b35SErik Nordmark 
1297*bd670b35SErik Nordmark 		/* The first ire_gw_secattr is passed back if gwattrp */
1298*bd670b35SErik Nordmark 		if (ire->ire_gw_secattr != NULL &&
1299*bd670b35SErik Nordmark 		    gwattrp != NULL && *gwattrp == NULL)
1300*bd670b35SErik Nordmark 			*gwattrp = ire->ire_gw_secattr;
1301*bd670b35SErik Nordmark 
1302*bd670b35SErik Nordmark 		/*
1303*bd670b35SErik Nordmark 		 * Check if we have a short-cut pointer to an IRE for this
1304*bd670b35SErik Nordmark 		 * destination, and that the cached dependency isn't stale.
1305*bd670b35SErik Nordmark 		 * In that case we've rejoined an existing tree towards a
1306*bd670b35SErik Nordmark 		 * parent, thus we don't need to continue the loop to
1307*bd670b35SErik Nordmark 		 * discover the rest of the tree.
1308*bd670b35SErik Nordmark 		 */
1309*bd670b35SErik Nordmark 		mutex_enter(&ire->ire_lock);
1310*bd670b35SErik Nordmark 		if (ire->ire_dep_parent != NULL &&
1311*bd670b35SErik Nordmark 		    ire->ire_dep_parent->ire_generation ==
1312*bd670b35SErik Nordmark 		    ire->ire_dep_parent_generation) {
1313*bd670b35SErik Nordmark 			mutex_exit(&ire->ire_lock);
1314*bd670b35SErik Nordmark 			ire = NULL;
1315*bd670b35SErik Nordmark 			goto done;
1316*bd670b35SErik Nordmark 		}
1317*bd670b35SErik Nordmark 		mutex_exit(&ire->ire_lock);
1318*bd670b35SErik Nordmark 
1319*bd670b35SErik Nordmark 		/*
1320*bd670b35SErik Nordmark 		 * If this type should have an ire_nce_cache (even if it
1321*bd670b35SErik Nordmark 		 * doesn't yet have one) then we are done. Includes
1322*bd670b35SErik Nordmark 		 * IRE_INTERFACE with a full 32 bit mask.
1323*bd670b35SErik Nordmark 		 */
1324*bd670b35SErik Nordmark 		if (ire->ire_nce_capable) {
1325*bd670b35SErik Nordmark 			ire = NULL;
1326*bd670b35SErik Nordmark 			goto done;
1327*bd670b35SErik Nordmark 		}
1328*bd670b35SErik Nordmark 		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
1329*bd670b35SErik Nordmark 		/*
1330*bd670b35SErik Nordmark 		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
1331*bd670b35SErik Nordmark 		 * particular destination
1332*bd670b35SErik Nordmark 		 */
1333*bd670b35SErik Nordmark 		if (ire->ire_type & IRE_INTERFACE) {
1334*bd670b35SErik Nordmark 			in6_addr_t	v6nexthop;
1335*bd670b35SErik Nordmark 			ire_t		*clone;
1336*bd670b35SErik Nordmark 
1337*bd670b35SErik Nordmark 			ASSERT(ire->ire_masklen != IPV4_ABITS);
1338*bd670b35SErik Nordmark 
1339*bd670b35SErik Nordmark 			/*
1340*bd670b35SErik Nordmark 			 * In the case of ip_input and ILLF_FORWARDING not
1341*bd670b35SErik Nordmark 			 * being set, and in the case of RTM_GET,
1342*bd670b35SErik Nordmark 			 * there is no point in allocating
1343*bd670b35SErik Nordmark 			 * an IRE_IF_CLONE. We return the IRE_INTERFACE.
1344*bd670b35SErik Nordmark 			 * Note that !allocate can result in a ire_dep_parent
1345*bd670b35SErik Nordmark 			 * which is IRE_IF_* without an IRE_IF_CLONE.
1346*bd670b35SErik Nordmark 			 * We recover from that when we need to send packets
1347*bd670b35SErik Nordmark 			 * by ensuring that the generations become
1348*bd670b35SErik Nordmark 			 * IRE_GENERATION_VERIFY in this case.
1349*bd670b35SErik Nordmark 			 */
1350*bd670b35SErik Nordmark 			if (!allocate) {
1351*bd670b35SErik Nordmark 				invalidate = B_TRUE;
1352*bd670b35SErik Nordmark 				ire = NULL;
1353*bd670b35SErik Nordmark 				goto done;
1354*bd670b35SErik Nordmark 			}
1355*bd670b35SErik Nordmark 
1356*bd670b35SErik Nordmark 			IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);
1357*bd670b35SErik Nordmark 
1358*bd670b35SErik Nordmark 			clone = ire_create_if_clone(ire, &v6nexthop,
1359*bd670b35SErik Nordmark 			    &generation);
1360*bd670b35SErik Nordmark 			if (clone == NULL) {
1361*bd670b35SErik Nordmark 				/*
1362*bd670b35SErik Nordmark 				 * Temporary failure - no memory.
1363*bd670b35SErik Nordmark 				 * Don't want caller to cache IRE_NOROUTE.
1364*bd670b35SErik Nordmark 				 */
1365*bd670b35SErik Nordmark 				invalidate = B_TRUE;
1366*bd670b35SErik Nordmark 				ire = ire_blackhole(ipst, B_FALSE);
1367*bd670b35SErik Nordmark 				goto error;
1368*bd670b35SErik Nordmark 			}
1369*bd670b35SErik Nordmark 			/*
1370*bd670b35SErik Nordmark 			 * Make clone next to last entry and the
1371*bd670b35SErik Nordmark 			 * IRE_INTERFACE the last in the dependency
1372*bd670b35SErik Nordmark 			 * chain since the clone depends on the
1373*bd670b35SErik Nordmark 			 * IRE_INTERFACE.
1374*bd670b35SErik Nordmark 			 */
1375*bd670b35SErik Nordmark 			ASSERT(i >= 1);
1376*bd670b35SErik Nordmark 			ASSERT(i < MAX_IRE_RECURSION);
1377*bd670b35SErik Nordmark 
1378*bd670b35SErik Nordmark 			ires[i] = ires[i-1];
1379*bd670b35SErik Nordmark 			generations[i] = generations[i-1];
1380*bd670b35SErik Nordmark 			ires[i-1] = clone;
1381*bd670b35SErik Nordmark 			generations[i-1] = generation;
1382*bd670b35SErik Nordmark 			i++;
1383*bd670b35SErik Nordmark 
1384*bd670b35SErik Nordmark 			ire = NULL;
1385*bd670b35SErik Nordmark 			goto done;
1386*bd670b35SErik Nordmark 		}
1387*bd670b35SErik Nordmark 
1388*bd670b35SErik Nordmark 		/*
1389*bd670b35SErik Nordmark 		 * We only match on the type and optionally ILL when
1390*bd670b35SErik Nordmark 		 * recursing. The type match is used by some callers
1391*bd670b35SErik Nordmark 		 * to exclude certain types (such as IRE_IF_CLONE or
1392*bd670b35SErik Nordmark 		 * IRE_LOCAL|IRE_LOOPBACK).
1393*bd670b35SErik Nordmark 		 */
1394*bd670b35SErik Nordmark 		match_args &= MATCH_IRE_TYPE;
1395*bd670b35SErik Nordmark 		nexthop = ire->ire_gateway_addr;
1396*bd670b35SErik Nordmark 		if (ill == NULL && ire->ire_ill != NULL) {
1397*bd670b35SErik Nordmark 			ill = ire->ire_ill;
1398*bd670b35SErik Nordmark 			need_refrele = B_TRUE;
1399*bd670b35SErik Nordmark 			ill_refhold(ill);
1400*bd670b35SErik Nordmark 			match_args |= MATCH_IRE_ILL;
1401*bd670b35SErik Nordmark 		}
1402*bd670b35SErik Nordmark 		ire = NULL;
1403*bd670b35SErik Nordmark 	}
1404*bd670b35SErik Nordmark 	ASSERT(ire == NULL);
1405*bd670b35SErik Nordmark 	ire = ire_reject(ipst, B_FALSE);
1406*bd670b35SErik Nordmark 
1407*bd670b35SErik Nordmark error:
1408*bd670b35SErik Nordmark 	ASSERT(ire != NULL);
1409*bd670b35SErik Nordmark 	if (need_refrele)
1410*bd670b35SErik Nordmark 		ill_refrele(ill);
1411*bd670b35SErik Nordmark 
1412*bd670b35SErik Nordmark 	/*
1413*bd670b35SErik Nordmark 	 * In the case of MULTIRT we want to try a different IRE the next
1414*bd670b35SErik Nordmark 	 * time. We let the next packet retry in that case.
1415*bd670b35SErik Nordmark 	 */
1416*bd670b35SErik Nordmark 	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
1417*bd670b35SErik Nordmark 		(void) ire_no_good(ires[0]);
1418*bd670b35SErik Nordmark 
1419*bd670b35SErik Nordmark cleanup:
1420*bd670b35SErik Nordmark 	/* cleanup ires[i] */
1421*bd670b35SErik Nordmark 	ire_dep_unbuild(ires, i);
1422*bd670b35SErik Nordmark 	for (j = 0; j < i; j++)
1423*bd670b35SErik Nordmark 		ire_refrele(ires[j]);
1424*bd670b35SErik Nordmark 
1425*bd670b35SErik Nordmark 	ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE));
1426*bd670b35SErik Nordmark 	/*
1427*bd670b35SErik Nordmark 	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
1428*bd670b35SErik Nordmark 	 * ip_select_route since the reject or lack of memory might be gone.
1429*bd670b35SErik Nordmark 	 */
1430*bd670b35SErik Nordmark 	if (generationp != NULL)
1431*bd670b35SErik Nordmark 		*generationp = IRE_GENERATION_VERIFY;
1432*bd670b35SErik Nordmark 	return (ire);
1433*bd670b35SErik Nordmark 
1434*bd670b35SErik Nordmark done:
1435*bd670b35SErik Nordmark 	ASSERT(ire == NULL);
1436*bd670b35SErik Nordmark 	if (need_refrele) {
1437*bd670b35SErik Nordmark 		ill_refrele(ill);
1438*bd670b35SErik Nordmark 		ill = NULL;
1439*bd670b35SErik Nordmark 	}
1440*bd670b35SErik Nordmark 
1441*bd670b35SErik Nordmark 	/* Build dependencies */
1442*bd670b35SErik Nordmark 	if (!ire_dep_build(ires, generations, i)) {
1443*bd670b35SErik Nordmark 		/* Something in chain was condemned; tear it apart */
1444*bd670b35SErik Nordmark 		ire = ire_reject(ipst, B_FALSE);
1445*bd670b35SErik Nordmark 		goto cleanup;
1446*bd670b35SErik Nordmark 	}
1447*bd670b35SErik Nordmark 
1448*bd670b35SErik Nordmark 	/*
1449*bd670b35SErik Nordmark 	 * Release all refholds except the one for ires[0] that we
1450*bd670b35SErik Nordmark 	 * will return to the caller.
1451*bd670b35SErik Nordmark 	 */
1452*bd670b35SErik Nordmark 	for (j = 1; j < i; j++)
1453*bd670b35SErik Nordmark 		ire_refrele(ires[j]);
1454*bd670b35SErik Nordmark 
1455*bd670b35SErik Nordmark 	if (invalidate) {
1456*bd670b35SErik Nordmark 		/*
1457*bd670b35SErik Nordmark 		 * Since we needed to allocate but couldn't we need to make
1458*bd670b35SErik Nordmark 		 * sure that the dependency chain is rebuilt the next time.
1459*bd670b35SErik Nordmark 		 */
1460*bd670b35SErik Nordmark 		ire_dep_invalidate_generations(ires[0]);
1461*bd670b35SErik Nordmark 		generation = IRE_GENERATION_VERIFY;
1462*bd670b35SErik Nordmark 	} else {
1463*bd670b35SErik Nordmark 		/*
1464*bd670b35SErik Nordmark 		 * IREs can have been added or deleted while we did the
1465*bd670b35SErik Nordmark 		 * recursive lookup and we can't catch those until we've built
1466*bd670b35SErik Nordmark 		 * the dependencies. We verify the stored
1467*bd670b35SErik Nordmark 		 * ire_dep_parent_generation to catch any such changes and
1468*bd670b35SErik Nordmark 		 * return IRE_GENERATION_VERIFY (which will cause
1469*bd670b35SErik Nordmark 		 * ip_select_route to be called again so we can redo the
1470*bd670b35SErik Nordmark 		 * recursive lookup next time we send a packet.
1471*bd670b35SErik Nordmark 		 */
1472*bd670b35SErik Nordmark 		generation = ire_dep_validate_generations(ires[0]);
1473*bd670b35SErik Nordmark 		if (generations[0] != ires[0]->ire_generation) {
1474*bd670b35SErik Nordmark 			/* Something changed at the top */
1475*bd670b35SErik Nordmark 			generation = IRE_GENERATION_VERIFY;
1476*bd670b35SErik Nordmark 		}
1477*bd670b35SErik Nordmark 	}
1478*bd670b35SErik Nordmark 	if (generationp != NULL)
1479*bd670b35SErik Nordmark 		*generationp = generation;
1480*bd670b35SErik Nordmark 
1481*bd670b35SErik Nordmark 	return (ires[0]);
1482*bd670b35SErik Nordmark }
1483*bd670b35SErik Nordmark 
1484*bd670b35SErik Nordmark ire_t *
1485*bd670b35SErik Nordmark ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
1486*bd670b35SErik Nordmark     zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1487*bd670b35SErik Nordmark     boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
1488*bd670b35SErik Nordmark     tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1489*bd670b35SErik Nordmark {
1490*bd670b35SErik Nordmark 	return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
1491*bd670b35SErik Nordmark 	    zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp,
1492*bd670b35SErik Nordmark 	    gwattrp, generationp));
1493*bd670b35SErik Nordmark }
1494*bd670b35SErik Nordmark 
1495*bd670b35SErik Nordmark /*
1496*bd670b35SErik Nordmark  * Recursively look for a route to the destination.
1497*bd670b35SErik Nordmark  * We only handle a destination match here, yet we have the same arguments
1498*bd670b35SErik Nordmark  * as the full match to allow function pointers to select between the two.
1499*bd670b35SErik Nordmark  *
1500*bd670b35SErik Nordmark  * Note that this function never returns NULL. It returns an IRE_NOROUTE
1501*bd670b35SErik Nordmark  * instead.
1502*bd670b35SErik Nordmark  *
1503*bd670b35SErik Nordmark  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1504*bd670b35SErik Nordmark  * is an error.
1505*bd670b35SErik Nordmark  * Allow at most one RTF_INDIRECT.
1506*bd670b35SErik Nordmark  */
1507*bd670b35SErik Nordmark ire_t *
1508*bd670b35SErik Nordmark ire_route_recursive_dstonly_v4(ipaddr_t nexthop, boolean_t allocate,
1509*bd670b35SErik Nordmark     uint32_t xmit_hint, ip_stack_t *ipst)
1510*bd670b35SErik Nordmark {
1511*bd670b35SErik Nordmark 	ire_t	*ire;
1512*bd670b35SErik Nordmark 	ire_t	*ire1;
1513*bd670b35SErik Nordmark 	uint_t	generation;
1514*bd670b35SErik Nordmark 
1515*bd670b35SErik Nordmark 	/* ire_ftable_lookup handles round-robin/ECMP */
1516*bd670b35SErik Nordmark 	ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst,
1517*bd670b35SErik Nordmark 	    &generation);
1518*bd670b35SErik Nordmark 	ASSERT(ire != NULL);
1519*bd670b35SErik Nordmark 
1520*bd670b35SErik Nordmark 	/*
1521*bd670b35SErik Nordmark 	 * If this type should have an ire_nce_cache (even if it
1522*bd670b35SErik Nordmark 	 * doesn't yet have one) then we are done. Includes
1523*bd670b35SErik Nordmark 	 * IRE_INTERFACE with a full 32 bit mask.
1524*bd670b35SErik Nordmark 	 */
1525*bd670b35SErik Nordmark 	if (ire->ire_nce_capable)
1526*bd670b35SErik Nordmark 		return (ire);
1527*bd670b35SErik Nordmark 
1528*bd670b35SErik Nordmark 	/*
1529*bd670b35SErik Nordmark 	 * If the IRE has a current cached parent we know that the whole
1530*bd670b35SErik Nordmark 	 * parent chain is current, hence we don't need to discover and
1531*bd670b35SErik Nordmark 	 * build any dependencies by doing a recursive lookup.
1532*bd670b35SErik Nordmark 	 */
1533*bd670b35SErik Nordmark 	mutex_enter(&ire->ire_lock);
1534*bd670b35SErik Nordmark 	if (ire->ire_dep_parent != NULL &&
1535*bd670b35SErik Nordmark 	    ire->ire_dep_parent->ire_generation ==
1536*bd670b35SErik Nordmark 	    ire->ire_dep_parent_generation) {
1537*bd670b35SErik Nordmark 		mutex_exit(&ire->ire_lock);
1538*bd670b35SErik Nordmark 		return (ire);
1539*bd670b35SErik Nordmark 	}
1540*bd670b35SErik Nordmark 	mutex_exit(&ire->ire_lock);
1541*bd670b35SErik Nordmark 
1542*bd670b35SErik Nordmark 	/*
1543*bd670b35SErik Nordmark 	 * Fallback to loop in the normal code starting with the ire
1544*bd670b35SErik Nordmark 	 * we found. Normally this would return the same ire.
1545*bd670b35SErik Nordmark 	 */
1546*bd670b35SErik Nordmark 	ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES,
1547*bd670b35SErik Nordmark 	    NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL,
1548*bd670b35SErik Nordmark 	    &generation);
1549*bd670b35SErik Nordmark 	ire_refrele(ire);
1550*bd670b35SErik Nordmark 	return (ire1);
1551*bd670b35SErik Nordmark }
1552